Okay, I'm making a program that puts six word sequences into a hash table by taking the ascii value of each letter of the word and adding it to the sum, and multiplying it by a value, such as;

num = num + int(firstName[nq])*(nq+1);

in a loop.

Now, I have a hash table the exact size of the number of total words.
In theory, every six word sequence should have a unique number and be put into the hash table likewise, while those six word sequences that are the same should be put into the same spot of the hash table.

However, when I run my program, I get way too many collisions. It will say I've got 643 of the same six-word sequences between two different files that don't even have a single six word sequence that is the same.

Can anyone POSSIBLY help? Here is my code:

int getdir (string dir, vector<string> &files)
{
   DIR *dp;
   struct dirent *dirp;
   if((dp = opendir(dir.c_str())) == NULL) {
     cout << "Error(" << errno << ") opening " << dir << endl;
     return errno;
   }

   while ((dirp = readdir(dp)) != NULL) {
      files.push_back(string(dirp->d_name));
   }
   closedir(dp);
   return 0;
}

int main(int argc, char *argv[])
{
    List_3358<int> *newList=new List_3358<int>[33971];
    long long int num;
    num = 0;
    int sizey;
    int county = 0;
    int abc=0;
    int mastercount=0;

    
    string rand; // used to convert argv into a string
    for(int i = 1; i < argc-1; i++)
    {
      rand = rand+argv[i];
    }
    char tempo = *argv[argc-1]; // this and the line below stores the last parameter(the number) into an int
    int n = atoi(&tempo);
    string test; // used for running the program
    string dir = string(rand);
    vector<string> files = vector<string>(); // a vector created 
    
    getdir(dir,files); //runs the getdir function to get the file names
    
    for (unsigned int i = 0;i < files.size();i++) {
        cout << files[i] << endl;
    } //prints the file names (good for testing)
    ifstream fin[files.size()];//makes an array of fins
    ofstream fout;
    fout.open("test.txt");//opens the text output file


    
    const int SIZE = n;
    string str1[SIZE];
    string sum;
    
     

    //master loop, runs the main loop until there are no more files to read
    for(int z =2; z<files.size();z++){
    test=rand;
    test = test+files[z];
    cout << endl << test.data() << endl;
    fin[z].open(test.data());
    char firstName[100];
    num = 0;
    //sum = "";
    
    
    
       for(int i=0; i<SIZE; i++) //gets the original SIZE words into the string array.
       {
           fin[z] >> str1[i];
           mastercount++;
           //sum = sum + str1[i];
           
           strcpy(firstName, str1[i].c_str());
           sizey=strlen(firstName);
           //cout << endl << " " << firstName << " ";
           for(int q = 0; q<sizey; q++)
           {
              num = num + (int(firstName[q])*(q+1));
              //cout << num << endl;
           }
           
       }
       newList[num%33971].insert(z);
       fout << " " << num << " ";
       fout << endl << "|  ";
       for(int p=0; p<SIZE; p++)
         fout << str1[p] << " ";
       
       fout << "  |";
    
    while(!fin[z].eof())  // this loop gets six words, one word at a time, until the file is empty.
    {
       num = 0;
       for(int q = 0; q<SIZE-1; q++)
       {
               str1[q]=str1[q+1];
       }
       fin[z] >> str1[SIZE-1];
       mastercount ++;
       for(int np=0; np<SIZE; np++) //gets the original SIZE words into the string array.
       {
           strcpy(firstName, str1[np].c_str());
           sizey=strlen(firstName);
           //cout << sizey << " " << firstName;
           //cout << endl << firstName << endl;
           for(int nq = 0; nq<sizey; nq++)
           {
              num = num + int(firstName[nq])*(nq+1);
              //cout << num << endl;
           }
           
       }
       //cout << endl;
       newList[num%33971].insert(z);
       fout << " " << num << " ";
       
       fout << endl << "|  ";
       for(int r=0; r<SIZE; r++)
         fout << str1[r] << " ";
       
       fout << "  |";
       
    }
    
    
        
        
    fout << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl << endl;
    fin[z].close();

    
    }
    fout.close();
    cout << 33946%num << endl;
    cout << sum << endl;
    newList[13699%33971].getCurrent();
    for (int rh=2;rh<files.size(); rh++){
    for (int rq=rh;rq<files.size(); rq++){ county =0;
    for(int x = 0; x < 33971; x++)
    {
            if(!newList[x].isEmpty())
            {
            if (newList[x].testy(rh,rq))
            {
               county++;
            }
            }
    }
    
    if (county >= 300)
        cout << "Number of collisions between file " << rh << " and " << rq << ": " << county << endl;
        
    }
    }
    cout << " " << mastercount;
    system("PAUSE");
    return 0;
}

Recommended Answers

All 4 Replies

You haven't told us how many words each of the two files have.

A few points: Sorry to preach...

You allocate 33971 lists!! Much better is to use a map.
std::map<int,std::string>

If you want to record clashes then us a multimap.

Next: Why open all the files at once, have one ifstream and open a file, process and open the next file.

C: The loop from line 68 shows that you missed that strings can expose their char data.

for(int i=0;i<SIZE;i++)
{
    fin>>str1[i];
    mastercount++;
    char* ptr=str1.c_str();
    for(unsigned int q=0;q<ptr;q++,ptr++)
      num+=static_cast<int>(*ptr)*(q+1);
}

Then I lost the will to live.... I can't figure out what the get six words bit is below is intended to do.

However, hope this helps a bit, anyway.

A few points: Sorry to preach...

You allocate 33971 lists!! Much better is to use a map.
std::map<int,std::string>

If you want to record clashes then us a multimap.

Are you joking? He's implementing a hash table.

Start from the beginning:

List_3358<int> *newList=new List_3358<int>[33971];

A magic number (explicit int constant > 2 - bad style). Possible solution:

const size_t TABLESIZE = 33971; // what is it and why...
typedef list<int> VChain;
...
VChain* pHash = new VChain[TABLESIZE];

An example of a wrong code and an irresponsible design:

char tempo = *argv[argc-1]; 
// this and the line below stores the last parameter(the number) into an int
int n = atoi(&tempo);

It was just your imagination that the last parameter is a single digit. Suppose the program started without parameters or with /? parameter (common convention: an user ask help on parameters, all good programs must support this feature). Alas, the next statement is wrong in any case. The atoi function wants a pointer to C-string (char array terminated by null chr). Of course, you get unpredictable result here.

Strictly speaking, it's the most valued reason of your troubles. This unpredictable wrong number becames your SIZE value...

Apropos, a very strange all cmd line parameters concatenation is a very dangerous action too.

Avoid unusual redeclarations of common and well-known library names (rand, for example). It's possible but looks like a bad style.

To spread butter on butter:

vector<string> files = vector<string>();
// absolutely the same effect as in clear and simple
vector<string> files;
// but the 1st strange construct is (potentially) more expensive.

The readdir function returns ALL directory entries: files, directories, links, sockets etc. You need file entries only. Use d_type field of struct dirent to recognize file entries. Consult your system docs for right values of the d_type field.

Try to correct errors then try again...

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.