编程珍珠：文件中所有单词的频率

Question

我正在尝试编写一个代码来使用散列计算文件中所有单词的频率 table（引用自 Programming Pearls）。我使用了那本书中的逻辑。但是我遇到了一些这样的错误

[Error] request for member 'next' in something not a structure or union
[Error] request for member 'word' in something not a structure or union
[Warning] conflicting types for 'incword'

这是我写的代码

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define NHASH 29989
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

int
main(void)
{
    FILE *fptr;
    int i, buf_size = 512, *p;
    char buffer[1024], name[100];

    printf("\nEnter the file name:");
    scanf("%s", name);

    fptr = fopen(name, "r");
    if (fptr == NULL) {
        printf("\nProblem with opening the file");
        exit(1);
    }

    for (i = 0; i < NHASH; i++) {
        bin[i] = NULL;
    }

    // while scanf("%s", buf) != EOF{
    memset(buffer, ' ', buf_size * 2);
    while (feof(fptr) == 0) {
        memmove(&buffer[0], &buffer[buf_size - 1], buf_size);
        fread(&buffer[buf_size - 1], 1, buf_size, fptr);
        incword(buffer);
    }

    for (i = 0; i < NHASH; i++) {
        for (p = bin[i]; p != NULL; p = p->next) {
            printf("%s%d", p->word, p->count);
        }
        return 0;
    }
}

void
incword(char *s)
{
    int *p;

    h = hash(s);

    for (p = bin[h]; p != NULL; p = p->next) {
        if (strcmp(s, p->word) == 0 {
            (p->count)++;
            return;
        }
    }

    p = malloc(sizeof(hashnode));
    p->count = 1;
    p->word = malloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

这让我很困惑。我不明白我犯过的错误。你能帮我解决一下吗？这是我正在使用的文件的内容（目前，仅用于测试）。

Once upon a time, there was a good man. 

He was a really good man!

Answer 1

我已经清理了你的编译错误并注释了解释错误的源代码。

我已将 your/old 代码括在 #if 0 下，将 new/my 代码括在 #if 1 下。

您收到关于 incword 的投诉的原因是它是在 main 之后定义的，因此它要么需要“前向声明”，要么需要物理移动到 main 以上[这是我在下面所做的]。

“请求成员...”消息是因为 p 被定义为 int *p 而不是 nodeptr p

这只是语法清理，而不是程序逻辑调试：

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define NHASH 29989
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

void
incword(char *s)
{
// NOTE/BUG: this needs to be a pointer to a node, not merely an int
#if 0
    int *p;
#else
    nodeptr p;
#endif

// NOTE/BUG: 'h' was not defined
#if 0
    h = hash(s);
#else
    unsigned int h = hash(s);
#endif

    for (p = bin[h]; p != NULL; p = p->next) {
// NOTE/BUG: this had a missing closing paren
#if 0
        if (strcmp(s, p->word) == 0 {
#else
        if (strcmp(s, p->word) == 0) {
#endif
            (p->count)++;
            return;
        }
    }

// NOTE/BUG: the type name for a node is 'node' [and _not_ hashnode]
#if 0
    p = malloc(sizeof(hashnode));
#else
    p = malloc(sizeof(node));
#endif

    p->count = 1;
    p->word = malloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

int
main(void)
{
    FILE *fptr;
// NOTE/BUG: p needs to be a pointer to a node
#if 0
    int i, buf_size = 512, *p;
#else
    int i, buf_size = 512;
    nodeptr p;
#endif
    char buffer[1024], name[100];

    printf("\nEnter the file name:");
    scanf("%s", name);

    fptr = fopen(name, "r");
    if (fptr == NULL) {
        printf("\nProblem with opening the file");
        exit(1);
    }

    for (i = 0; i < NHASH; i++) {
        bin[i] = NULL;
    }

    // while scanf("%s", buf) != EOF{
    memset(buffer, ' ', buf_size * 2);
    while (feof(fptr) == 0) {
        memmove(&buffer[0], &buffer[buf_size - 1], buf_size);
        fread(&buffer[buf_size - 1], 1, buf_size, fptr);
        incword(buffer);
    }

    for (i = 0; i < NHASH; i++) {
        for (p = bin[i]; p != NULL; p = p->next) {
            printf("%s%d", p->word, p->count);
        }
#if 0
        return 0;
#endif
    }

#if 1
    return 0;
#endif
}

更新：

好的，正如我在下面的评论中提到的，使用 fread 不会将输入行标记为单词。

因此，我重写了输入循环以使用 fgets 和 strtok。我生成了一个小的代表性输入文件并对其进行了测试。

好消息是您的散列逻辑似乎没有变化。考虑到掩盖这一事实的语法错误的数量，它使它更加令人印象深刻——干得好！

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define NHASH 29989
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

void
incword(char *s)
{
// NOTE/BUG: this needs to be a pointer to a node, not merely an int
#if 0
    int *p;
#else
    nodeptr p;
#endif

// NOTE/BUG: 'h' was not defined
#if 0
    h = hash(s);
#else
    unsigned int h = hash(s);
#endif

    for (p = bin[h]; p != NULL; p = p->next) {
// NOTE/BUG: this had a missing closing paren
#if 0
        if (strcmp(s, p->word) == 0 {
#else
        if (strcmp(s, p->word) == 0) {
#endif
            (p->count)++;
            return;
        }
    }

// NOTE/BUG: the type name for a node is 'node' [and _not_ hashnode]
#if 0
    p = malloc(sizeof(hashnode));
#else
    p = malloc(sizeof(node));
#endif

    p->count = 1;
    p->word = malloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

int
main(void)
{
    FILE *fptr;
// NOTE/BUG: p needs to be a pointer to a node
#if 0
    int i, buf_size = 512, *p;
#else
    int i, buf_size = 512;
    nodeptr p;
#endif
    char buffer[1024], name[100];

    printf("\nEnter the file name:");
    scanf("%s", name);

    fptr = fopen(name, "r");
    if (fptr == NULL) {
        printf("\nProblem with opening the file");
        exit(1);
    }

    for (i = 0; i < NHASH; i++) {
        bin[i] = NULL;
    }

    // while scanf("%s", buf) != EOF{
// NOTE/BUG: this won't read text words too well
#if 0
    memset(buffer, ' ', buf_size * 2);
    while (feof(fptr) == 0) {
        memmove(&buffer[0], &buffer[buf_size - 1], buf_size);
        fread(&buffer[buf_size - 1], 1, buf_size, fptr);
        incword(buffer);
    }
#else
    while (1) {
        // get next line of file
        if (fgets(buffer,sizeof(buffer),fptr) == NULL)
            break;

        char *bp = buffer;

        // split up line into words [separated by whitespace]
        while (1) {
            char *cp = strtok(bp," \t\n");
            bp = NULL;

            if (cp == NULL)
                break;

            // add an individual word
            incword(cp);
        }
    }
#endif

    for (i = 0; i < NHASH; i++) {
        int valid = 0;
        for (p = bin[i]; p != NULL; p = p->next) {
            valid += 1;
// NOTE/BUG: this needs spacing
#if 0
            printf("%s%d", p->word, p->count);
#else
            printf("%s %d\n", p->word, p->count);
#endif
        }
        if (valid > 1)
            printf("\n");
#if 0
        return 0;
#endif
    }

#if 1
    return 0;
#endif
}

更新#2：

为了简洁起见，这里有一个没有 #if 0 和错误注释的版本：

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define NHASH 29989
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

void
incword(char *s)
{
    nodeptr p;

    unsigned int h = hash(s);

    for (p = bin[h]; p != NULL; p = p->next) {
        if (strcmp(s, p->word) == 0) {
            (p->count)++;
            return;
        }
    }

    p = malloc(sizeof(node));

    p->count = 1;
    p->word = malloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

int
main(void)
{
    FILE *fptr;
    int i, buf_size = 512;
    nodeptr p;
    char buffer[1024], name[100];

    printf("\nEnter the file name:");
    scanf("%s", name);

    fptr = fopen(name, "r");
    if (fptr == NULL) {
        printf("\nProblem with opening the file");
        exit(1);
    }

    for (i = 0; i < NHASH; i++) {
        bin[i] = NULL;
    }

    while (1) {
        // get next line of file
        if (fgets(buffer,sizeof(buffer),fptr) == NULL)
            break;

        char *bp = buffer;

        // split up line into words [separated by whitespace]
        while (1) {
            char *cp = strtok(bp," \t\n");
            bp = NULL;

            if (cp == NULL)
                break;

            // add an individual word
            incword(cp);
        }
    }

    for (i = 0; i < NHASH; i++) {
        int valid = 0;
        for (p = bin[i]; p != NULL; p = p->next) {
            valid += 1;
            printf("%s %d\n", p->word, p->count);
        }
        if (valid > 1)
            printf("\n");
    }

    return 0;
}

更新#3：

And also when fgets() is used to store a string into the buffer, words at the end of buffer might get split across different buffers, that's why I preferred the previous logic over this

由于 fgets 读取单个行的文本直至并包括换行符，截断单词的唯一方法是如果缓冲区比最大值短预期的行长度。

usual/simple 解决方案是使用保证足够大以包含该行的缓冲区大小。 1024 的缓冲区长度通常足够大，但可以使用 10000 [或 100000] 的长度。

之前的逻辑有很多问题：

它没有将数据分解成单词。
它[仍然]遇到缓冲区不够大的问题。
因为它使用了512的固定读取长度，所以它更有可能在中间切掉一个单词。
它只是将 512 个字节读入缓冲区的后半部分。在第一次迭代中，前半部分会有空格。
它有 UB [未定义的行为]，因为它不保证缓冲区中有一个 EOS 字符（即确保 [=19 的参数=] 是一个 valid/single 字符串），所以 strcmp [and/or strlen] 可以运行结束。

处理文本文件最快的方法是使用 mmap 的高级技术。查看我的回答：

How does mmap improve file reading speed?

Thank you so much. I have a doubt. When the string is made in into tokens, the special characters are also included with the word. So in my input file, "man!" counted as being different from "man" and "man."

简单的解决方案是在 strtok 调用中添加更多分隔符（例如）：

cp = strtok(bp," \t\n.!,;?:");

但是，一次扫描文件一个字符可能更容易 [使用 fgetc]，只保留“单词”缓冲区中的字母字符，然后调用 incword遇到非字母字符 [使用 isalpha] 来区分它们。

另外，对于像这样的问题，[more] 通常忽略大小写并将 She 和 she 视为同一个词。因此，添加 tolower 可能会有帮助。

这不会处理像 don't 这样的词，因为它会将它们视为两个单独的词：don 和 t

所以，我们应该将 ' 视为一个字母字符，所以我们想要这样的东西：isalpha(c) || (c == '\'')

没关系除了我们会搞砸引用字符串:

I don't use phrases such as 'quick brown fox jumps over the lazy dog' often.

而且，我们如何对待所有物？ [复数] workers 是否应该与所有格形式 workers' [如 workers' club] 区别对待？

这是更新后的示例输入：

Once upon a time, there was a good man.

He was a really good man!

Poetically, a really good man was he.

Man, I'm tired of using the word 'man' everywhere.

I don't use phrases such as 'quick brown fox jumps over the lazy dog' often.

I prefer a brown dog and a lazy fox even though it cuts me to the quick to say
that.

A women's club is not quite the same as a woman's club if the woman possesses
a club.

A worker may belong to a union, which is a form of workers' club. Many workers
may belong to the same union.

这是一些更新的代码，可以处理 大多数 [但不区分复数所有格]：

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#ifndef NHASH
#define NHASH 29989
#endif
#define MULT 31

typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node;

nodeptr bin[NHASH];

FILE *fptr;
int eof;                            // got eof (i.e previous word was last)
int peek;                           // next character in stream
char buffer[1000];                  // word buffer

unsigned int
hash(char *p)
{
    unsigned int h = 0;

    for (; *p; p++)
        h = MULT * h + *p;

    return h % NHASH;
}

void
incword(char *s)
{
    nodeptr p;

    unsigned int h = hash(s);

    for (p = bin[h]; p != NULL; p = p->next) {
        if (strcmp(s, p->word) == 0) {
            (p->count)++;
            return;
        }
    }

    p = malloc(sizeof(node));

    p->count = 1;
    p->word = strdup(s);
    p->next = bin[h];

    bin[h] = p;
}

char *
getword(void)
{
    int gotone = 0;
    int chr;
    int alfa;
    char *bp = buffer;

    while (! eof) {
        if (peek) {
            chr = peek;
            peek = 0;
        }
        else
            chr = fgetc(fptr);

        // handle eof
        if (chr == EOF) {
            eof = 1;
            break;
        }

        // is this an alphabetic char?
        alfa = isalpha(chr);

        // is this a single quote -- it can be:
        // (1) the start of a quoted string (it's a delimiter)
        // (2) or part of a contraction (it's quasi-alplabetic)
        // (3) the end of a quoted string (it's a delimiter)
        if (chr == '\'') {
            if (! gotone)
                continue;
            peek = fgetc(fptr);
            alfa = isalpha(peek);
        }

        // non-alpha char (i.e. a word delimiter)
        // if no chars have been stored in the word buffer, this is leading
        // whitespace [and we ignore it]
        // otherwise, it's trailing whitespace and the buffer is now a fully
        // formed word
        if (! alfa) {
            if (gotone)
                break;
            continue;
        }

        // unify upper/lower case
        chr = tolower(chr);

        // store the character in the word buffer
        *bp++ = chr;

        // remember that we have at least one valid character in the buffer
        gotone = 1;
    }

    // close off the string
    *bp = 0;

    // set up caller's return
    if (gotone)
        bp = buffer;
    else
        bp = NULL;

    return bp;
}

int
main(int argc,char **argv)
{
    int i;
    nodeptr p;
    char name[100];

    --argc;
    ++argv;

    for (i = 0; i < NHASH; i++)
        bin[i] = NULL;

    if (argc < 1) {
        ++argc;
        --argv;

        printf("\nEnter the file name:");
        fflush(stdout);

        scanf("%s", name);

        argv[0] = name;
    }

    for (;  argc > 0;  --argc, ++argv) {
        fptr = fopen(*argv, "r");
        if (fptr == NULL) {
            printf("\nProblem with opening the file");
            exit(1);
        }

        eof = 0;
        peek = 0;

        while (1) {
            char *cp = getword();
            if (cp == NULL)
                break;
            incword(cp);
        }

        fclose(fptr);
    }

    for (i = 0; i < NHASH; i++) {
        int valid = 0;
        for (p = bin[i]; p != NULL; p = p->next) {
            valid += 1;
            printf("%s %d\n", p->word, p->count);
        }
        if (valid > 1)
            printf("\n");
    }

    return 0;
}

编程珍珠：文件中所有单词的频率

Programming Pearls: Frequency of all words in a file

c

hashtable

file

hashmap