使用哈希数组对罗密欧与朱丽叶文本进行统计分析。我差了 3684 个字中的 2 个字

Question

所以罗密欧与朱丽叶中使用的唯一单词的数量应该是3648。我得到3682。我已经尽可能地调试并重写了更新功能。 class 的老师和其他学生也得到了 3648。所以基本上我差了 2。

问题出在更新函数中，该函数将密钥输入哈希数组，检查冲突等。只要有新条目，它就会更新大小。

这是我粘贴在 hastebin 中的代码：http://hastebin.com/irawafeyen.avrasm

这是粘贴在 hastebin 中的罗密欧与朱丽叶的文本：http://hastebin.com/semuponata.hs

为了使我的代码运行，您必须将文件另存为 RomeoAndJuliet.txt 与代码

在同一文件夹中

为了方便起见，这里粘贴了我的代码。

    #define HASHSIZE 4001
    #define ARRAY_SIZE 23868
    #include <iostream>
    #include <fstream>
    #include <string>

    using namespace std;

    // Use folding on a string, summed 4 bytes at a time
    int sfold(const char* key) 
    {
        unsigned int *lkey = (unsigned int *)key;
        int intlength = strlen(key)/4;
        unsigned int sum = 0;
        for(int i=0; i<intlength; i++)
            sum += lkey[i];
    // Now deal with the extra chars at the end
        int extra = strlen(key) - intlength*4;
        char temp[4];
        lkey = (unsigned int *)temp;
        lkey[0] = 0;
        for(int i=0; i<extra; i++)
            temp[i] = key[intlength*4+i];
        sum += lkey[0];
        return sum % HASHSIZE;
    }

    class HashArray
    {

        struct Node
        {
            string word;// = "null";
            int frequency;
        };

        private:
            int size;


        public:
            Node hasharr[HASHSIZE];

            HashArray()
            {
                for(int i = 0; i < HASHSIZE; i++)
                    hasharr[i].frequency = 0;   
                size = 0;
            }
            ~HashArray() {} 

            void update(string charkey)
            {
                const char* temp = charkey.c_str();
                int intkey = sfold(temp);       
                if (hasharr[intkey].frequency == 0)
                    {
                        hasharr[intkey].word = charkey;
                        hasharr[intkey].frequency = 1;
                        size++;
                    }
                else if(hasharr[intkey].word == charkey)
                        hasharr[intkey].frequency++;
                else 
                    {
                        while(hasharr[intkey].frequency != 0 && hasharr[intkey].word != charkey)
                        {
                            if(intkey > 4000) 
                                intkey = 1;
                            intkey++;
                        }

                        if(hasharr[intkey].word == charkey)
                            hasharr[intkey].frequency++;
                        else 
                        {
                        hasharr[intkey].word = charkey;
                        hasharr[intkey].frequency++;    
                        size++;             
                        }
                    }
            }
            void hashsize()
            {
                cout <<"The amount of entries in the hash table is "<< size;
            }

            void find1timewords() // prints all words that appear only one and then prints the amount of these types of words.
            {
                int i = 0;
                int count = 0;
                while(i < 4000) 
                {
                    if(hasharr[i].frequency == 1)
                    {
                        cout << endl << i << " ";
                        cout << hasharr[i].frequency <<" "<< hasharr[i].word<<" ";
                        count++;
                    }
                    i++;
                }
                cout << " number of words appearing one time: " << count;
            }

            void printFirstN(int n) // Prints the first N hash entries at indexes 0 thru N.
            {
                for (int i = 0; i < n; i++)
                    cout << endl << "tag: " << i << " word: " << hasharr[i].word << " frequency: " << hasharr[i].frequency <<endl;
            }

            void printTopN(int n) //print the first N most used words.
            {
                int max;
            }
    };



    int main()
    {
        HashArray hasharray;

        ifstream inputFile;
        inputFile.open("RomeoAndJuliet.txt");

        string temp;

        while(inputFile >> temp)
            hasharray.update(temp);

        hasharray.hashsize();
        //hasharray.printFirstN(25);
        //hasharray.find1timewords();


        inputFile.close();
        cin.get();
    }

Answer 1

我认为你有一个 Hash Collision，这意味着两个键产生相同的值。您可能需要更改计算哈希值的方式，或者通过添加哈希冲突函数来捕获两个键生成相同哈希值的实例。

看看this。

Answer 2

您的哈希冲突检查代码超出了哈希数组的末尾。此代码

while(hasharr[intkey].frequency != 0 && hasharr[intkey].word != charkey)
{
    if(intkey > 4000) 
        intkey = 1;
    intkey++;
}

当 intkey 刚好是 4000 时出现问题。由于 intkey++ 行，intkey 循环时的值将是 4001。

一个简单的解决方法是在检查越界之前递增。另外，我不明白为什么要从要检查的哈希列表中排除 0，所以当它溢出时将值设置回 0。

while(hasharr[intkey].frequency != 0 && hasharr[intkey].word != charkey)
{
    intkey++;
    if(intkey > 4000) 
        intkey = 0;
}

为什么这会使您的代码出错 - 除了 "Undefined Behavior" 在数组边界之外写入，这可能会导致您的代码执行任何操作（它在我的系统上崩溃），如果它确实成功地写入过去数组的末尾，它会将单词添加到单词计数中，但随后将无法在您的哈希中正确找到该单词。

使用哈希数组对罗密欧与朱丽叶文本进行统计分析。我差了 3684 个字中的 2 个字

Using a Hash Array to statistically analyze Romeo and Juliet text. I am off by 2 words out of 3684

c++

hash

hashtable