区分 trie 中的单词
Differentiating words in trie
我的 trie 中有一个词 "all" 和一个词 "alter" 但 "alt" 不是 trie 中的一个词。但是,当我检查 "alt" 时,它仍然 returns 为真,因为 is_word 为真,因为 "all" 是一个词。我应该如何处理这个错误。
//Here's the code
typedef struct node{
bool is_word;
struct node *children[27];
} node;
unsigned int wsize=0;
node * root;
bool check(const char* word)
{
// TODO
node *chrawler=root;
for(int i=0;i<strlen(word)-1;i++)
{
int t;
if(word[i]>=65&&word[i]<=90)
{
t=word[i]-'A';
}
else if(isalpha(word[i]))
t=word[i]-'a';
else
t=26;
if(chrawler->children[t]==NULL)
return false;
else
chrawler=chrawler->children[t];
}
if(chrawler->is_word)
return true;
return false;
}
// Load function
bool load(const char* dictionary)
{
// TODO
FILE *inptr=fopen(dictionary,"r");
if(inptr==NULL)
{
return false;
}
node *new_node=malloc(sizeof(node));
root=new_node;
char * word=malloc((LENGTH+1)*sizeof(char));
int index=0;
for(int c=fgetc(inptr);c!=EOF;c=fgetc(inptr))
{
char ch=c;
if(ch=='\n')
{
word[index]='[=10=]';
index=0;
node *chrawler=root;
for(int i=1;i<strlen(word);i++)
{
int t;
if(isalpha(word[i-1]))
t=word[i-1]-'a';
else
t=26;
if(chrawler->children[t]==NULL)
{
node *new_node=malloc(sizeof(node));
chrawler->children[t]=new_node;
chrawler=chrawler->children[t];
}
else
chrawler=chrawler->children[t];
}
chrawler->is_word=1;
wsize++;
}
else
{
word[index]=ch;
index++;
}
}
return true;
}
您需要确保新节点中的所有指针都为空,并将is_word
值设置为false
。这也许最容易通过使用 calloc()
分配 space 来完成。创建一个函数来分配和错误检查节点的分配使其更容易。同样,您有两个代码块将字符映射到 trie 索引。你应该更慷慨地使用函数——即使是小函数。
一行数据一个字一个字的输入也不是必须的;最好使用 fgets()
来读取行。
添加这些和其他各种更改(例如,本地数组 word
而不是动态分配的数组——它没有被释放;完成后关闭文件;等等)给出了 MCVE (Minimal, Complete, Verifiable Example) 像这样:
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
enum { LENGTH = 256 };
// Here's the code
typedef struct node
{
bool is_word;
struct node *children[27];
} node;
unsigned int wsize = 0;
node *root;
static inline int map_char(unsigned char c)
{
int t;
if (isalpha(c))
t = tolower(c) - 'a';
else
t = 26;
return t;
}
static inline node *alloc_node(void)
{
node *new_node = calloc(1, sizeof(node));
if (new_node == 0)
{
fprintf(stderr, "Memory allocation failed in %s\n", __func__);
exit(1);
}
return new_node;
}
static bool check(const char *word)
{
node *chrawler = root;
int len = strlen(word);
for (int i = 0; i < len; i++)
{
int t = map_char(word[i]);
if (chrawler->children[t] == NULL)
return false;
else
chrawler = chrawler->children[t];
}
return chrawler->is_word;
}
// Load function
static bool load(const char *dictionary)
{
FILE *inptr = fopen(dictionary, "r");
if (inptr == NULL)
{
fprintf(stderr, "Failed to open file '%s' for reading\n", dictionary);
return false;
}
root = alloc_node();
char word[LENGTH];
while (fgets(word, sizeof(word), inptr) != 0)
{
word[strcspn(word, "\n")] = '[=10=]';
printf("[%s]\n", word);
node *chrawler = root;
int len = strlen(word);
for (int i = 0; i < len; i++)
{
int t = map_char(word[i]);
//printf("t = %d (%c)\n", t, word[i]);
if (chrawler->children[t] == NULL)
chrawler->children[t] = alloc_node();
chrawler = chrawler->children[t];
}
chrawler->is_word = 1;
wsize++;
}
printf("%d words read from %s\n", wsize, dictionary);
fclose(inptr);
return true;
}
int main(void)
{
const char *wordfile = "words.txt";
if (load(wordfile))
{
char line[4096];
while (fgets(line, sizeof(line), stdin) != 0)
{
line[strcspn(line, "\n")] = '[=10=]';
if (check(line))
printf("[%s] is a word\n", line);
else
printf("[%s] is unknown\n", line);
}
}
return 0;
}
还有其他需要更改的地方。例如,
wsize
变量应该是非全局的;它并没有真正在 load()
函数之外使用。很容易争论根节点也不应该是全局的; load()
函数应该 return 根节点,check()
函数应该传递根节点。一般情况下,尽可能避免使用全局变量,通常是可以的。
给定一个文件 words.txt
包含:
abelone
abyssinia
archimedes
brachiosaurus
triceratops
all
alter
asparagus
watchamacallit
a
abracadabra
abyss
ant
程序 运行 的输出是:
[abelone]
[abyssinia]
[archimedes]
[brachiosaurus]
[triceratops]
[all]
[alter]
[asparagus]
[watchamacallit]
[a]
[abracadabra]
[abyss]
[ant]
13 words read from words.txt
a
[a] is a word
ab
[ab] is unknown
al
[al] is unknown
all
[all] is a word
alt
[alt] is unknown
alte
[alte] is unknown
alter
[alter] is a word
triceratops
[triceratops] is a word
brachiosaurus
[brachiosaurus] is a word
abys
[abys] is unknown
abbey
[abbey] is unknown
abyss
[abyss] is a word
ant
[ant] is a word
a
[a] is a word
archimedes
[archimedes] is a word
我的 trie 中有一个词 "all" 和一个词 "alter" 但 "alt" 不是 trie 中的一个词。但是,当我检查 "alt" 时,它仍然 returns 为真,因为 is_word 为真,因为 "all" 是一个词。我应该如何处理这个错误。
//Here's the code
typedef struct node{
bool is_word;
struct node *children[27];
} node;
unsigned int wsize=0;
node * root;
bool check(const char* word)
{
// TODO
node *chrawler=root;
for(int i=0;i<strlen(word)-1;i++)
{
int t;
if(word[i]>=65&&word[i]<=90)
{
t=word[i]-'A';
}
else if(isalpha(word[i]))
t=word[i]-'a';
else
t=26;
if(chrawler->children[t]==NULL)
return false;
else
chrawler=chrawler->children[t];
}
if(chrawler->is_word)
return true;
return false;
}
// Load function
bool load(const char* dictionary)
{
// TODO
FILE *inptr=fopen(dictionary,"r");
if(inptr==NULL)
{
return false;
}
node *new_node=malloc(sizeof(node));
root=new_node;
char * word=malloc((LENGTH+1)*sizeof(char));
int index=0;
for(int c=fgetc(inptr);c!=EOF;c=fgetc(inptr))
{
char ch=c;
if(ch=='\n')
{
word[index]='[=10=]';
index=0;
node *chrawler=root;
for(int i=1;i<strlen(word);i++)
{
int t;
if(isalpha(word[i-1]))
t=word[i-1]-'a';
else
t=26;
if(chrawler->children[t]==NULL)
{
node *new_node=malloc(sizeof(node));
chrawler->children[t]=new_node;
chrawler=chrawler->children[t];
}
else
chrawler=chrawler->children[t];
}
chrawler->is_word=1;
wsize++;
}
else
{
word[index]=ch;
index++;
}
}
return true;
}
您需要确保新节点中的所有指针都为空,并将is_word
值设置为false
。这也许最容易通过使用 calloc()
分配 space 来完成。创建一个函数来分配和错误检查节点的分配使其更容易。同样,您有两个代码块将字符映射到 trie 索引。你应该更慷慨地使用函数——即使是小函数。
一行数据一个字一个字的输入也不是必须的;最好使用 fgets()
来读取行。
添加这些和其他各种更改(例如,本地数组 word
而不是动态分配的数组——它没有被释放;完成后关闭文件;等等)给出了 MCVE (Minimal, Complete, Verifiable Example) 像这样:
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
enum { LENGTH = 256 };
// Here's the code
typedef struct node
{
bool is_word;
struct node *children[27];
} node;
unsigned int wsize = 0;
node *root;
static inline int map_char(unsigned char c)
{
int t;
if (isalpha(c))
t = tolower(c) - 'a';
else
t = 26;
return t;
}
static inline node *alloc_node(void)
{
node *new_node = calloc(1, sizeof(node));
if (new_node == 0)
{
fprintf(stderr, "Memory allocation failed in %s\n", __func__);
exit(1);
}
return new_node;
}
static bool check(const char *word)
{
node *chrawler = root;
int len = strlen(word);
for (int i = 0; i < len; i++)
{
int t = map_char(word[i]);
if (chrawler->children[t] == NULL)
return false;
else
chrawler = chrawler->children[t];
}
return chrawler->is_word;
}
// Load function
static bool load(const char *dictionary)
{
FILE *inptr = fopen(dictionary, "r");
if (inptr == NULL)
{
fprintf(stderr, "Failed to open file '%s' for reading\n", dictionary);
return false;
}
root = alloc_node();
char word[LENGTH];
while (fgets(word, sizeof(word), inptr) != 0)
{
word[strcspn(word, "\n")] = '[=10=]';
printf("[%s]\n", word);
node *chrawler = root;
int len = strlen(word);
for (int i = 0; i < len; i++)
{
int t = map_char(word[i]);
//printf("t = %d (%c)\n", t, word[i]);
if (chrawler->children[t] == NULL)
chrawler->children[t] = alloc_node();
chrawler = chrawler->children[t];
}
chrawler->is_word = 1;
wsize++;
}
printf("%d words read from %s\n", wsize, dictionary);
fclose(inptr);
return true;
}
int main(void)
{
const char *wordfile = "words.txt";
if (load(wordfile))
{
char line[4096];
while (fgets(line, sizeof(line), stdin) != 0)
{
line[strcspn(line, "\n")] = '[=10=]';
if (check(line))
printf("[%s] is a word\n", line);
else
printf("[%s] is unknown\n", line);
}
}
return 0;
}
还有其他需要更改的地方。例如,
wsize
变量应该是非全局的;它并没有真正在 load()
函数之外使用。很容易争论根节点也不应该是全局的; load()
函数应该 return 根节点,check()
函数应该传递根节点。一般情况下,尽可能避免使用全局变量,通常是可以的。
给定一个文件 words.txt
包含:
abelone
abyssinia
archimedes
brachiosaurus
triceratops
all
alter
asparagus
watchamacallit
a
abracadabra
abyss
ant
程序 运行 的输出是:
[abelone]
[abyssinia]
[archimedes]
[brachiosaurus]
[triceratops]
[all]
[alter]
[asparagus]
[watchamacallit]
[a]
[abracadabra]
[abyss]
[ant]
13 words read from words.txt
a
[a] is a word
ab
[ab] is unknown
al
[al] is unknown
all
[all] is a word
alt
[alt] is unknown
alte
[alte] is unknown
alter
[alter] is a word
triceratops
[triceratops] is a word
brachiosaurus
[brachiosaurus] is a word
abys
[abys] is unknown
abbey
[abbey] is unknown
abyss
[abyss] is a word
ant
[ant] is a word
a
[a] is a word
archimedes
[archimedes] is a word