区分 trie 中的单词

Differentiating words in trie

我的 trie 中有一个词 "all" 和一个词 "alter" 但 "alt" 不是 trie 中的一个词。但是,当我检查 "alt" 时,它仍然 returns 为真,因为 is_word 为真,因为 "all" 是一个词。我应该如何处理这个错误。

//Here's the code
typedef struct node{
  bool is_word;    
  struct node *children[27];
} node;

unsigned int wsize=0;
node * root;

bool check(const char* word)
{
    // TODO
    node *chrawler=root;   
    for(int i=0;i<strlen(word)-1;i++)
    {
        int t;
        if(word[i]>=65&&word[i]<=90)
        {        
            t=word[i]-'A';
        }
        else if(isalpha(word[i]))
            t=word[i]-'a';
        else
            t=26;

        if(chrawler->children[t]==NULL)
            return false;
        else
            chrawler=chrawler->children[t];
    }

    if(chrawler->is_word)
        return true;
    return false;    

}

// Load function
bool load(const char* dictionary)
{
    // TODO

    FILE *inptr=fopen(dictionary,"r");
    if(inptr==NULL)
    {
        return false;
    }

    node *new_node=malloc(sizeof(node));
    root=new_node;

    char * word=malloc((LENGTH+1)*sizeof(char));
    int index=0;
    for(int c=fgetc(inptr);c!=EOF;c=fgetc(inptr))
    {
        char ch=c;
        if(ch=='\n')
        { 
            word[index]='[=10=]';
            index=0;
            node *chrawler=root;
            for(int i=1;i<strlen(word);i++)
            {
                    int t;
                    if(isalpha(word[i-1]))
                        t=word[i-1]-'a';
                    else
                        t=26;
                    if(chrawler->children[t]==NULL)
                    {
                        node *new_node=malloc(sizeof(node));
                        chrawler->children[t]=new_node;

                        chrawler=chrawler->children[t];
                    }
                    else
                        chrawler=chrawler->children[t];
            }
            chrawler->is_word=1;
            wsize++;

        }
        else
        {
            word[index]=ch;
            index++;
        }

    }

    return true;
}

您需要确保新节点中的所有指针都为空,并将is_word 值设置为false。这也许最容易通过使用 calloc() 分配 space 来完成。创建一个函数来分配和错误检查节点的分配使其更容易。同样,您有两个代码块将字符映射到 trie 索引。你应该更慷慨地使用函数——即使是小函数。

一行数据一个字一个字的输入也不是必须的;最好使用 fgets() 来读取行。

添加这些和其他各种更改(例如,本地数组 word 而不是动态分配的数组——它没有被释放;完成后关闭文件;等等)给出了 MCVE (Minimal, Complete, Verifiable Example) 像这样:

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { LENGTH = 256 };

// Here's the code
typedef struct node
{
    bool is_word;
    struct node *children[27];
} node;

unsigned int wsize = 0;
node *root;

static inline int map_char(unsigned char c)
{
    int t;
    if (isalpha(c))
        t = tolower(c) - 'a';
    else
        t = 26;
    return t;
}

static inline node *alloc_node(void)
{
    node *new_node = calloc(1, sizeof(node));
    if (new_node == 0)
    {
        fprintf(stderr, "Memory allocation failed in %s\n", __func__);
        exit(1);
    }
    return new_node;
}

static bool check(const char *word)
{
    node *chrawler = root;
    int len = strlen(word);
    for (int i = 0; i < len; i++)
    {
        int t = map_char(word[i]);
        if (chrawler->children[t] == NULL)
            return false;
        else
            chrawler = chrawler->children[t];
    }

    return chrawler->is_word;
}

// Load function
static bool load(const char *dictionary)
{
    FILE *inptr = fopen(dictionary, "r");
    if (inptr == NULL)
    {
        fprintf(stderr, "Failed to open file '%s' for reading\n", dictionary);
        return false;
    }

    root = alloc_node();

    char word[LENGTH];
    while (fgets(word, sizeof(word), inptr) != 0)
    {
        word[strcspn(word, "\n")] = '[=10=]';
        printf("[%s]\n", word);
        node *chrawler = root;
        int len = strlen(word);
        for (int i = 0; i < len; i++)
        {
            int t = map_char(word[i]);
            //printf("t = %d (%c)\n", t, word[i]);
            if (chrawler->children[t] == NULL)
                chrawler->children[t] = alloc_node();
            chrawler = chrawler->children[t];
        }
        chrawler->is_word = 1;
        wsize++;
    }
    printf("%d words read from %s\n", wsize, dictionary);
    fclose(inptr);

    return true;
}

int main(void)
{
    const char *wordfile = "words.txt";
    if (load(wordfile))
    {
        char line[4096];
        while (fgets(line, sizeof(line), stdin) != 0)
        {
            line[strcspn(line, "\n")] = '[=10=]';
            if (check(line))
                printf("[%s] is a word\n", line);
            else
                printf("[%s] is unknown\n", line);
        }
    }
    return 0;
}

还有其他需要更改的地方。例如, wsize 变量应该是非全局的;它并没有真正在 load() 函数之外使用。很容易争论根节点也不应该是全局的; load() 函数应该 return 根节点,check() 函数应该传递根节点。一般情况下,尽可能避免使用全局变量,通常是可以的。

给定一个文件 words.txt 包含:

abelone
abyssinia
archimedes
brachiosaurus
triceratops
all
alter
asparagus
watchamacallit
a
abracadabra
abyss
ant

程序 运行 的输出是:

[abelone]
[abyssinia]
[archimedes]
[brachiosaurus]
[triceratops]
[all]
[alter]
[asparagus]
[watchamacallit]
[a]
[abracadabra]
[abyss]
[ant]
13 words read from words.txt
a
[a] is a word
ab
[ab] is unknown
al
[al] is unknown
all
[all] is a word
alt
[alt] is unknown
alte
[alte] is unknown
alter
[alter] is a word
triceratops
[triceratops] is a word
brachiosaurus
[brachiosaurus] is a word
abys
[abys] is unknown
abbey
[abbey] is unknown
abyss
[abyss] is a word
ant
[ant] is a word
a
[a] is a word
archimedes
[archimedes] is a word