cs50 pset5 拼写优化
cs50 pset5 Speller optimisation
我已经完成拼写并通过了所有检查。但我仍然对性能感到困惑。我已尽最大努力进行研究和 运行 测试,但与员工的实施相比,我的实施速度慢了 10-20%。我怎样才能改进我的代码?
// Implements a dictionary's functionality
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "dictionary.h"
// Represents a node in a hash table
typedef struct node
{
char word[LENGTH + 1];
struct node *next;
}
node;
//Prototypes
unsigned int hash(const char *word);
// TODO: Choose number of buckets in hash table
const unsigned int N = 187751; //odd prime number bigger than word count, because math
// Hash table
node *table[N]; //may need to use calloc
//word count
int word_count = 0;
// Returns true if word is in dictionary, else false
bool check(const char *word)
{
// TODO
//debug
//int hashcode = hash(word);
//printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);
for (node *tmp = table[hash(word)]; tmp != NULL; tmp = tmp->next) //iterate through bucket
{
if (strcasecmp(tmp->word, word) == 0) //compare strings
{
return true;
}
}
return false;
}
// Hashes word to a number
unsigned int hash(const char *word)
{
// TODO: Improve this hash function
//Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
int hash = 0;
/*int w = 0;
//convert to upper case
while (word[w] != '[=10=]')
{
word[w] = toupper(word[w]);
w++;
}*/
for (int i = 0; i <= strlen(word); i++)
{
hash = (31 * hash + toupper(word[i])) % N; //31 because math
//check hash size
/*if (hash >= N)
{
printf("incorrect hash!");
}*/
}
return hash;
}
// Loads dictionary into memory, returning true if successful, else false
bool load(const char *dictionary)
{
// TODO
//open dictionary file
FILE *dict = fopen(dictionary, "r");
if (dict == NULL)
{
//printf("Could not open file.\n");
return false;
}
//create temp string for word
char *word = malloc(LENGTH + 1);
if (word == NULL)
{
return false;
}
//read words from dictionary and write to hash table
while (fscanf(dict, "%s", word) != EOF)
{
//node *tmp_word = NULL;
node *n = malloc(sizeof(node));
if (n == NULL)
{
return false;
}
int pos = hash(word); //run hash to find the bucket in table
strcpy(n->word, word); //write word to temp node
if (table[pos] == NULL) //check if node is first in bucket (may need to use calloc to create a table)
{
n->next = NULL;
}
else //if not the first, write node to the head
{
n->next = table[pos]; //reference first node
}
table[pos] = n; //write node to table
word_count++; //count new word
}
fclose(dict);
free(word);
//debug
/*int j = 0;
while (j <= 11)
{
for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей
{
printf("%s\n", tmp->word); //забираем значение ноды
}
printf("Bucket number: %i\n", j);
j++;
}*/
//printf("word count:%i\n", word_count);
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int size(void)
{
return word_count;
}
// Unloads dictionary from memory, returning true if successful, else false
bool unload(void)
{
// TODO
for (int i = 0; i <= N; i++)
{
while (table[i] != NULL)
{
node *tmp = table[i]->next; //делаем ноду и переходим на следующую ячейку
free(table[i]); //освобождаем текущую
table[i] = tmp; //начинаем list со второй ячейки
}
}
return true;
}
我的结果:
WORDS MISSPELLED: 955
WORDS IN DICTIONARY: 143091
WORDS IN TEXT: 17756
TIME IN load: 0.04
TIME IN check: 0.02
TIME IN size: 0.00
TIME IN unload: 0.01
TIME IN TOTAL: 0.07
Cs50人员成绩:
WORDS MISSPELLED: 955
WORDS IN DICTIONARY: 143091
WORDS IN TEXT: 17756
TIME IN load: 0.02
TIME IN check: 0.02
TIME IN size: 0.00
TIME IN unload: 0.01
TIME IN TOTAL: 0.06
我已经玩过bucket size,没用(((
Pset5 指令:https://cs50.harvard.edu/x/2022/psets/5/speller/
如果您需要的话,我也包括 dictionary.h。检查正在从另一个文本文件获取字符串,而不是加载中使用的文件。
#ifndef DICTIONARY_H
#define DICTIONARY_H
#include <stdbool.h>
// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 45
// Prototypes
bool check(const char *word);
unsigned int hash(const char *word);
bool load(const char *dictionary);
unsigned int size(void);
bool unload(void);
#endif // DICTIONARY_H
Craig Estey 几乎完美地回答了我的问题。与我的代码相比,他的代码运行速度超快,而且比 cs50 员工的代码运行速度更快:
更大的文字:
我学到了很多东西,真的很感激。这是我的第一个问题,关于 SO,我从来没有预料到它会是一个如此有帮助和友好的地方。
几个问题:
while (fscanf(dict, "%s", word) != EOF)
是错误的。你想要:while (fscanf(dict, "%s", word) == 1)
- 更快地将给定单词存储到table中作为大写。 (即)我们对每个单词只
toupper
一次。
check
函数会更快,因为它可以 [then] 使用 strcmp
而不是 strcasecmp
[ 更慢 ]
- 如果我们可以将字段添加到
node
结构,我们可以让它保存一个哈希值。这可以加速 check
函数(参见下面代码中的 NODEHASH
)。
- 为每个节点做一个
malloc
是slow/wasteful。 pooled/arena/slab 分配器可以更快(参见下面代码中的 FASTALLOC
)。同样,if 我们可以向 node
结构添加一个字段。
下面的代码中记录了一些其他错误(请参阅“注意”注释)。我使用 cpp
条件来表示旧代码与新代码:
#if 0
// old code
#else
// new code
#endif
#if 1
// new code
#endif
这是重构后的代码。我编译了它,但 没有 测试过它:
// Implements a dictionary's functionality
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
//#include "dictionary.h"
// NOTE/BUG: dictionary.h was _not_ posted, so we have to guess
#define LENGTH 20
#ifndef NODEHASH
#define NODEHASH 1
#endif
#ifndef FASTALLOC
#define FASTALLOC 1
#endif
// Represents a node in a hash table
typedef struct node {
char word[LENGTH + 1];
struct node *next;
#if NODEHASH
unsigned int hash;
#endif
#if FASTALLOC
int alloc;
#endif
} node;
//Prototypes
unsigned int hash(const char *word);
// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
// NOTE/BUG: this doesn't work too well for C (may be okay in C++) when used
// as dimension for table below
// NOTE/BUG: enum may be faster in calculations
#if 0
const unsigned int N = 187751;
#else
enum {
N = 187751
};
#endif
// Hash table
node *table[N]; // may need to use calloc
//word count
int word_count = 0;
// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
// TODO
// debug
// int hashcode = hash(word);
// printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);
#if 1
unsigned int hval = hash(word);
#endif
// iterate through bucket
for (node *tmp = table[hval]; tmp != NULL; tmp = tmp->next) {
#if NODEHASH
// check hash value first -- limits number of str*cmp calls
if (tmp->hash != hval)
continue;
#endif
// compare strings
// NOTE/BUG: strcasecmp is _slow_ and if we _store_ uppercase we don't need it
#if 0
if (strcasecmp(tmp->word, word) == 0) {
return true;
}
#else
if (strcmp(tmp->word, word) == 0)
return true;
#endif
}
return false;
}
// Hashes word to a number
unsigned int
hash(const char *word)
{
// TODO: Improve this hash function
// Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
int hash = 0;
/* int w = 0; //convert to upper case while (word[w] != '[=11=]') { word[w] = toupper(word[w]); w++; } */
// NOTE/BUG: _without_ optimization this is O(n^2) instead of O(n) because it
// calls strlen on _each_ loop iteration
#if 0
for (int i = 0; i <= strlen(word); i++) {
hash = (31 * hash + toupper(word[i])) % N; // 31 because math
// check hash size
/* if (hash >= N) { printf("incorrect hash!"); } */
}
#else
for (int chr = *word++; chr != 0; chr = *word++)
hash = (31 * hash + chr) % N; // 31 because math
#endif
return hash;
}
#if FASTALLOC
node *freelist = NULL;
int freecount = 0;
// adjust this value to suit
#define ARENASIZE 1000
// newnode -- allocate from pool
node *
newnode(void)
{
node *cur;
while (1) {
// get node from free list cache
cur = freelist;
// use the cached node if possible
if (cur != NULL) {
freelist = cur->next;
break;
}
// allocate a new arena
freelist = malloc(sizeof(node) * ARENASIZE);
// out of memory
if (freelist == NULL)
break;
// link all nodes in the arena
node *prev = NULL;
for (cur = freelist; cur < &freelist[ARENASIZE]; ++cur) {
cur->alloc = 0;
cur->next = prev;
prev = cur;
}
// mark the arena head
freelist->alloc = 1;
}
return cur;
}
#endif
// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{
// TODO
// open dictionary file
FILE *dict = fopen(dictionary, "r");
if (dict == NULL) {
// printf("Could not open file.\n");
return false;
}
// create temp string for word
// NOTE/BUG: no need to malloc this (and it is slightly slower)
#if 0
char *word = malloc(LENGTH + 1);
if (word == NULL) {
return false;
}
#else
char word[LENGTH + 1];
#endif
// read words from dictionary and write to hash table
// NOTE/BUG: this scanf is wrong (and may loop too much)
#if 0
while (fscanf(dict, "%s", word) != EOF) {
#else
while (fscanf(dict, "%s", word) == 1) {
#endif
// node *tmp_word = NULL;
#if FASTALLOC
node *n = newnode();
#else
node *n = malloc(sizeof(node));
#endif
if (n == NULL)
return false;
// NOTE/FIX: convert to uppercase _once_
int idx;
for (idx = 0; word[idx] != 0; ++idx)
n->word[idx] = toupper(word[idx] & 0xFF);
n->word[idx] = 0;
// run hash to find the bucket in table
// NOTE/BUG: pos should be unsigned
#if 0
int pos = hash(word);
#else
unsigned int pos = hash(n->word);
#endif
#if 0
strcpy(n->word, word); // write word to temp node
#endif
#if NODEHASH
n->hash = pos;
#endif
// NOTE/BUG: no need to check table[pos]
#if 0
// check if node is first in bucket
// (may need to use calloc to create a table)
if (table[pos] == NULL)
{
n->next = NULL;
}
// if not the first, write node to the head
else
{
n->next = table[pos]; // reference first node
}
#else
n->next = table[pos];
#endif
table[pos] = n; // write node to table
word_count++; // count new word
}
fclose(dict);
#if 0
free(word);
#endif
// debug
/* int j = 0; while (j <= 11) { for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей { printf("%s\n", tmp->word); //забираем значение ноды
} printf("Bucket number: %i\n", j); j++; } */
// printf("word count:%i\n", word_count);
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
return word_count;
}
// Unloads dictionary from memory, returning true if successful, else false
#if ! FASTALLOC
bool
unload(void)
{
// TODO
// NOTE/BUG: this goes one beyond the table end (i.e. want: i < N)
#if 0
for (int i = 0; i <= N; i++) {
while (table[i] != NULL) {
node *tmp = table[i]->next; // делаем ноду и переходим на следующую ячейку
free(table[i]); // освобождаем текущую
table[i] = tmp; // начинаем list со второй ячейки
}
}
#else
for (int i = 0; i < N; ++i) {
node *cur = table[i];
if (cur == NULL)
continue;
node *next;
for (; cur != NULL; cur = next) {
next = cur->next;
free(cur);
}
#endif
return true;
}
#endif
// Unloads dictionary from memory, returning true if successful, else false
#if FASTALLOC
bool
unload(void)
{
node *cur;
node *next;
// remove all nodes from the table
for (int i = 0; i < N; ++i) {
cur = table[i];
if (cur == NULL)
continue;
// put all nodes in hash table back on [cached] free list
for (; cur != NULL; cur = next) {
next = cur->next;
cur->next = freelist;
freelist = cur;
}
table[i] = NULL;
}
cur = freelist;
freelist = NULL;
// trim free list to allocation/arena heads
for (; cur != NULL; cur = next) {
next = cur->next;
if (cur->alloc) {
cur->next = freelist;
freelist = cur;
}
}
// free the allocated nodes
cur = freelist;
freelist = NULL;
for (; cur != NULL; cur = next) {
next = cur->next;
free(cur);
}
return true;
}
#endif
这是上面的清理版本(我 运行 通过 unifdef
):
// Implements a dictionary's functionality
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
//#include "dictionary.h"
// NOTE/BUG: dictionary.h was _not_ posted, so we have to guess
#define LENGTH 20
// Represents a node in a hash table
typedef struct node {
char word[LENGTH + 1];
struct node *next;
unsigned int hash;
int alloc;
} node;
//Prototypes
unsigned int hash(const char *word);
// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
enum {
N = 187751
};
// Hash table
node *table[N]; // may need to use calloc
//word count
int word_count = 0;
// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
// TODO
// debug
// int hashcode = hash(word);
// printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);
unsigned int hval = hash(word);
// iterate through bucket
for (node *tmp = table[hval]; tmp != NULL; tmp = tmp->next) {
// check hash value first -- limits number of str*cmp calls
if (tmp->hash != hval)
continue;
// compare strings
if (strcmp(tmp->word, word) == 0)
return true;
}
return false;
}
// Hashes word to a number
unsigned int
hash(const char *word)
{
// TODO: Improve this hash function
// Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
int hash = 0;
/* int w = 0; //convert to upper case while (word[w] != '[=12=]') { word[w] = toupper(word[w]); w++; } */
for (int chr = *word++; chr != 0; chr = *word++)
hash = (31 * hash + chr) % N; // 31 because math
return hash;
}
node *freelist = NULL;
int freecount = 0;
// adjust this value to suit
#define ARENASIZE 1000
// newnode -- allocate from pool
node *
newnode(void)
{
node *cur;
while (1) {
// get node from free list cache
cur = freelist;
// use the cached node if possible
if (cur != NULL) {
freelist = cur->next;
break;
}
// allocate a new arena
freelist = malloc(sizeof(node) * ARENASIZE);
// out of memory
if (freelist == NULL)
break;
// link all nodes in the arena
node *prev = NULL;
for (cur = freelist; cur < &freelist[ARENASIZE]; ++cur) {
cur->alloc = 0;
cur->next = prev;
prev = cur;
}
// mark the arena head
freelist->alloc = 1;
}
return cur;
}
// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{
// TODO
// open dictionary file
FILE *dict = fopen(dictionary, "r");
if (dict == NULL) {
// printf("Could not open file.\n");
return false;
}
// create temp string for word
// NOTE/BUG: no need to malloc this (and it is slightly slower)
char word[LENGTH + 1];
// read words from dictionary and write to hash table
// NOTE/BUG: this scanf is wrong (and may loop too much)
while (fscanf(dict, "%s", word) == 1) {
// node *tmp_word = NULL;
node *n = newnode();
if (n == NULL)
return false;
// NOTE/FIX: convert to uppercase _once_
int idx;
for (idx = 0; word[idx] != 0; ++idx)
n->word[idx] = toupper(word[idx] & 0xFF);
n->word[idx] = 0;
// run hash to find the bucket in table
// NOTE/BUG: pos should be unsigned
unsigned int pos = hash(n->word);
n->hash = pos;
// NOTE/BUG: no need to check table[pos]
n->next = table[pos];
table[pos] = n; // write node to table
word_count++; // count new word
}
fclose(dict);
// debug
/* int j = 0; while (j <= 11) { for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей { printf("%s\n", tmp->word); //забираем значение ноды
} printf("Bucket number: %i\n", j); j++; } */
// printf("word count:%i\n", word_count);
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
return word_count;
}
// Unloads dictionary from memory, returning true if successful, else false
bool
unload(void)
{
node *cur;
node *next;
// remove all nodes from the table
for (int i = 0; i < N; ++i) {
cur = table[i];
if (cur == NULL)
continue;
// put all nodes in hash table back on [cached] free list
for (; cur != NULL; cur = next) {
next = cur->next;
cur->next = freelist;
freelist = cur;
}
table[i] = NULL;
}
cur = freelist;
freelist = NULL;
// trim free list to allocation/arena heads
for (; cur != NULL; cur = next) {
next = cur->next;
if (cur->alloc) {
cur->next = freelist;
freelist = cur;
}
}
// free the allocated nodes
cur = freelist;
freelist = NULL;
for (; cur != NULL; cur = next) {
next = cur->next;
free(cur);
}
return true;
}
更新#2:
I've already submitted my version. Just trying to understand the subject better. –
meotet
好的,因此,我制作了一个最终版本,这是我对速度的最大努力。如果你还在努力,我做了更多的重构。
注意事项...
为每个节点做一个 single malloc
是浪费。 malloc
比较慢。请注意,在您的基准计时中,主要是 load
阶段比员工实施慢。
此外,还有一个 hidden 结构,malloc
将它放在内存中的指针 returns 之前,用于跟踪拨款。此结构占用 space,对于像节点结构这样的小分配,额外的内存开销可能很大。
最好调用 malloc
并在连续块或数组中分配大量 node
结构。进一步的分配可以来自该块内。
这类似于“平板分配”、“对象池”或“内存池”:
-
-
-
在下面的代码中,节点以 1000 个为一组进行分配。这将 malloc
调用的次数从 100,000 次减少到大约 100 次。这个版本比我的上一个分配器提高了速度,因为它使用了一个额外的“段”(又名“竞技场”)结构来控制分配。它不再需要将每个节点结构预链接到“空闲列表”或跟踪节点是否是从 malloc
.
接收到的给定内存片段中的第一个节点。
看字典不用toupper/tolower
。因为字典保证运行全部小写,每行只有一个字,所以用fgets
看字典速度更快。我们可以将字典单词 直接 读入给定 node
结构的 word
字段 [if 我们做节点在执行 fgets
].
之前进行分配
在计算散列时,我们不需要在每次迭代中都执行 % N
。由于模运算的性质,我们可以在最后做一次取余运算一次。
我们还可以将 hash
值存储到 node
结构中。然后,在 check
中,我们可以比较哈希值并跳过一些 [expensive] strcmp
调用。在您的代码中,由于 N
太大,大多数哈希桶只有一两个条目,因此这种加速可能影响不大 [并且可能会稍微慢一些]。
在 check
中,最好将参数 word
复制到一个临时缓冲区中,在我们进行 一次 时小写。然后,我们可以使用较快的 strcmp
而不是较慢的 strcasecmp
。无论我们是否将哈希值存储在节点结构中,这都有效。同样,由于 N
值较大,此加速效果可能有限。
无论如何,这里是重构代码:
// fix2.c -- Implements a dictionary's functionality
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "dictionary.h"
#if _USE_ZPRT_
#define dbgprt(_fmt...) printf(_fmt)
#else
#define dbgprt(_fmt...) do { } while (0)
#endif
#ifndef NODEHASH
#define NODEHASH 1
#endif
#ifndef FASTNODE
#define FASTNODE 1
#endif
// Represents a node in a hash table
typedef struct node {
char word[LENGTH + 1 + 1];
struct node *next;
#if NODEHASH
unsigned int hash;
#endif
} node;
//Prototypes
unsigned int hash(const char *word);
// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
#ifndef N
#define N 187751
#endif
// Hash table
node *table[N]; // may need to use calloc
// word count
int word_count = 0;
// 31 because math
// modulus only really needed to clip hash to table index
// so do it _once_ by caller of hash/hash2
#if 0
#define HASH \
hash = (31 * hash + chr) % N
#else
#define HASH \
hash = (31 * hash) + chr
#endif
// Hashes word to a number and copies lowercase chars to output buffer
unsigned int
hash2(char *dest,const char *word)
{
unsigned int hash = 0;
for (int chr = *word++; chr != 0; chr = *word++) {
chr = tolower((unsigned char) chr);
HASH;
*dest++ = chr;
}
*dest = 0;
return hash;
}
// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
char tmp[LENGTH + 10];
unsigned int hval = hash2(tmp,word);
unsigned int hidx = hval % N;
// iterate through bucket
for (node *cur = table[hidx]; cur != NULL; cur = cur->next) {
// check hash value first -- limits number of str*cmp calls
#if NODEHASH
if (cur->hash != hval)
continue;
#endif
if (strcmp(cur->word, tmp) == 0)
return true;
}
return false;
}
// Hashes word to a number
unsigned int
hash(const char *word)
{
unsigned int hash = 0;
for (int chr = *word++; chr != 0; chr = *word++)
HASH;
return hash;
}
// adjust this value to suit
#ifndef ARENASIZE
#define ARENASIZE 1000
#endif
typedef struct seg seg_t;
struct seg {
seg_t *seg_next; // next segment
int seg_count; // number of used nodes in this segment
node seg_node[ARENASIZE]; // nodes in this segment
};
node *nodelist; // list of freed nodes
seg_t *seglist; // head of linked list of segments
// newnode -- allocate from pool
node *
newnode(void)
{
seg_t *seg;
node *cur;
while (1) {
// get node from free node cache
// use the cached node if possible
// this only happens if freenode were called [which it isn't]
cur = nodelist;
if (cur != NULL) {
nodelist = cur->next;
break;
}
// try to allocate from current segment
seg = seglist;
if (seg != NULL) {
if (seg->seg_count < ARENASIZE) {
cur = &seg->seg_node[seg->seg_count++];
break;
}
}
// allocate a new segment
seg = malloc(sizeof(*seg));
// out of memory
if (seg == NULL)
break;
// mark segment as completely unused
seg->seg_count = 0;
// attach to segment list
seg->seg_next = seglist;
seglist = seg;
}
return cur;
}
// freenode -- free a node
void
freenode(node *cur)
{
#if FASTNODE
cur->next = nodelist;
nodelist = cur;
#else
free(cur);
#endif
}
// segunload -- release all segments
void
segunload(void)
{
seg_t *seg;
seg_t *next;
seg = seglist;
seglist = NULL;
for (; seg != NULL; seg = next) {
next = seg->seg_next;
free(seg);
}
}
// Unloads dictionary from memory, returning true if successful, else false
bool
unload(void)
{
node *cur;
// remove all nodes from the table
for (int i = 0; i < N; ++i) {
cur = table[i];
// put all nodes in hash table back on [cached] free list
// NOTE: not really necessary
#if FASTNODE
word_count = 0;
cur = NULL;
#else
node *next;
for (; cur != NULL; cur = next, --word_count) {
next = cur->next;
freenode(cur);
}
#endif
table[i] = cur;
}
segunload();
return true;
}
// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{
// open dictionary file
FILE *dict = fopen(dictionary, "r");
if (dict == NULL)
return false;
// read words from dictionary and write to hash table
while (1) {
#if FASTNODE
node *n = newnode();
#else
node *n = malloc(sizeof(*n));
#endif
if (n == NULL)
return false;
char *word = n->word;
if (fgets(word,sizeof(n->word),dict) == NULL) {
freenode(n);
break;
}
// strip newline
#if 0
word += strlen(word);
if (--word >= n->word) {
if (*word == '\n');
*word = 0;
}
#else
word = strchr(word,'\n');
if (word != NULL)
*word = 0;
#endif
// run hash to find the bucket in table
unsigned int pos = hash(n->word);
#if NODEHASH
n->hash = pos;
#endif
// put node in hash table
pos %= N;
n->next = table[pos];
table[pos] = n;
word_count++;
}
fclose(dict);
// DEBUG: show the bucket counts
#if _USE_ZPRT_
node *cur;
int mincount = 0;
int maxcount = 0;
int buckcount = 0;
int totcount = 0;
for (int i = 0; i < N; ++i) {
cur = table[i];
if (cur == NULL)
continue;
int count = 0;
for (; cur != NULL; cur = cur->next)
++count;
printf("load: BUCKET/%d count=%d\n",i,count);
if (i == 0) {
mincount = count;
maxcount = count;
}
if (count < mincount)
mincount = count;
if (count > maxcount)
maxcount = count;
totcount += count;
++buckcount;
}
printf("load: TOTAL buckcnt=%d/%d mincount=%d maxcount=%d avgcount=%d/%d\n",
buckcount,N,mincount,maxcount,totcount / buckcount,totcount / N);
#endif
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
return word_count;
}
我已经完成拼写并通过了所有检查。但我仍然对性能感到困惑。我已尽最大努力进行研究和 运行 测试,但与员工的实施相比,我的实施速度慢了 10-20%。我怎样才能改进我的代码?
// Implements a dictionary's functionality
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "dictionary.h"
// Represents a node in a hash table
typedef struct node
{
char word[LENGTH + 1];
struct node *next;
}
node;
//Prototypes
unsigned int hash(const char *word);
// TODO: Choose number of buckets in hash table
const unsigned int N = 187751; //odd prime number bigger than word count, because math
// Hash table
node *table[N]; //may need to use calloc
//word count
int word_count = 0;
// Returns true if word is in dictionary, else false
bool check(const char *word)
{
// TODO
//debug
//int hashcode = hash(word);
//printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);
for (node *tmp = table[hash(word)]; tmp != NULL; tmp = tmp->next) //iterate through bucket
{
if (strcasecmp(tmp->word, word) == 0) //compare strings
{
return true;
}
}
return false;
}
// Hashes word to a number
unsigned int hash(const char *word)
{
// TODO: Improve this hash function
//Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
int hash = 0;
/*int w = 0;
//convert to upper case
while (word[w] != '[=10=]')
{
word[w] = toupper(word[w]);
w++;
}*/
for (int i = 0; i <= strlen(word); i++)
{
hash = (31 * hash + toupper(word[i])) % N; //31 because math
//check hash size
/*if (hash >= N)
{
printf("incorrect hash!");
}*/
}
return hash;
}
// Loads dictionary into memory, returning true if successful, else false
bool load(const char *dictionary)
{
// TODO
//open dictionary file
FILE *dict = fopen(dictionary, "r");
if (dict == NULL)
{
//printf("Could not open file.\n");
return false;
}
//create temp string for word
char *word = malloc(LENGTH + 1);
if (word == NULL)
{
return false;
}
//read words from dictionary and write to hash table
while (fscanf(dict, "%s", word) != EOF)
{
//node *tmp_word = NULL;
node *n = malloc(sizeof(node));
if (n == NULL)
{
return false;
}
int pos = hash(word); //run hash to find the bucket in table
strcpy(n->word, word); //write word to temp node
if (table[pos] == NULL) //check if node is first in bucket (may need to use calloc to create a table)
{
n->next = NULL;
}
else //if not the first, write node to the head
{
n->next = table[pos]; //reference first node
}
table[pos] = n; //write node to table
word_count++; //count new word
}
fclose(dict);
free(word);
//debug
/*int j = 0;
while (j <= 11)
{
for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей
{
printf("%s\n", tmp->word); //забираем значение ноды
}
printf("Bucket number: %i\n", j);
j++;
}*/
//printf("word count:%i\n", word_count);
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int size(void)
{
return word_count;
}
// Unloads dictionary from memory, returning true if successful, else false
bool unload(void)
{
// TODO
for (int i = 0; i <= N; i++)
{
while (table[i] != NULL)
{
node *tmp = table[i]->next; //делаем ноду и переходим на следующую ячейку
free(table[i]); //освобождаем текущую
table[i] = tmp; //начинаем list со второй ячейки
}
}
return true;
}
我的结果:
WORDS MISSPELLED: 955
WORDS IN DICTIONARY: 143091
WORDS IN TEXT: 17756
TIME IN load: 0.04
TIME IN check: 0.02
TIME IN size: 0.00
TIME IN unload: 0.01
TIME IN TOTAL: 0.07
Cs50人员成绩:
WORDS MISSPELLED: 955
WORDS IN DICTIONARY: 143091
WORDS IN TEXT: 17756
TIME IN load: 0.02
TIME IN check: 0.02
TIME IN size: 0.00
TIME IN unload: 0.01
TIME IN TOTAL: 0.06
我已经玩过bucket size,没用(((
Pset5 指令:https://cs50.harvard.edu/x/2022/psets/5/speller/
如果您需要的话,我也包括 dictionary.h。检查正在从另一个文本文件获取字符串,而不是加载中使用的文件。
#ifndef DICTIONARY_H
#define DICTIONARY_H
#include <stdbool.h>
// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 45
// Prototypes
bool check(const char *word);
unsigned int hash(const char *word);
bool load(const char *dictionary);
unsigned int size(void);
bool unload(void);
#endif // DICTIONARY_H
Craig Estey 几乎完美地回答了我的问题。与我的代码相比,他的代码运行速度超快,而且比 cs50 员工的代码运行速度更快:
更大的文字:
我学到了很多东西,真的很感激。这是我的第一个问题,关于 SO,我从来没有预料到它会是一个如此有帮助和友好的地方。
几个问题:
while (fscanf(dict, "%s", word) != EOF)
是错误的。你想要:while (fscanf(dict, "%s", word) == 1)
- 更快地将给定单词存储到table中作为大写。 (即)我们对每个单词只
toupper
一次。 check
函数会更快,因为它可以 [then] 使用strcmp
而不是strcasecmp
[ 更慢 ]- 如果我们可以将字段添加到
node
结构,我们可以让它保存一个哈希值。这可以加速check
函数(参见下面代码中的NODEHASH
)。 - 为每个节点做一个
malloc
是slow/wasteful。 pooled/arena/slab 分配器可以更快(参见下面代码中的FASTALLOC
)。同样,if 我们可以向node
结构添加一个字段。
下面的代码中记录了一些其他错误(请参阅“注意”注释)。我使用 cpp
条件来表示旧代码与新代码:
#if 0
// old code
#else
// new code
#endif
#if 1
// new code
#endif
这是重构后的代码。我编译了它,但 没有 测试过它:
// Implements a dictionary's functionality
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
//#include "dictionary.h"
// NOTE/BUG: dictionary.h was _not_ posted, so we have to guess
#define LENGTH 20
#ifndef NODEHASH
#define NODEHASH 1
#endif
#ifndef FASTALLOC
#define FASTALLOC 1
#endif
// Represents a node in a hash table
typedef struct node {
char word[LENGTH + 1];
struct node *next;
#if NODEHASH
unsigned int hash;
#endif
#if FASTALLOC
int alloc;
#endif
} node;
//Prototypes
unsigned int hash(const char *word);
// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
// NOTE/BUG: this doesn't work too well for C (may be okay in C++) when used
// as dimension for table below
// NOTE/BUG: enum may be faster in calculations
#if 0
const unsigned int N = 187751;
#else
enum {
N = 187751
};
#endif
// Hash table
node *table[N]; // may need to use calloc
//word count
int word_count = 0;
// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
// TODO
// debug
// int hashcode = hash(word);
// printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);
#if 1
unsigned int hval = hash(word);
#endif
// iterate through bucket
for (node *tmp = table[hval]; tmp != NULL; tmp = tmp->next) {
#if NODEHASH
// check hash value first -- limits number of str*cmp calls
if (tmp->hash != hval)
continue;
#endif
// compare strings
// NOTE/BUG: strcasecmp is _slow_ and if we _store_ uppercase we don't need it
#if 0
if (strcasecmp(tmp->word, word) == 0) {
return true;
}
#else
if (strcmp(tmp->word, word) == 0)
return true;
#endif
}
return false;
}
// Hashes word to a number
unsigned int
hash(const char *word)
{
// TODO: Improve this hash function
// Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
int hash = 0;
/* int w = 0; //convert to upper case while (word[w] != '[=11=]') { word[w] = toupper(word[w]); w++; } */
// NOTE/BUG: _without_ optimization this is O(n^2) instead of O(n) because it
// calls strlen on _each_ loop iteration
#if 0
for (int i = 0; i <= strlen(word); i++) {
hash = (31 * hash + toupper(word[i])) % N; // 31 because math
// check hash size
/* if (hash >= N) { printf("incorrect hash!"); } */
}
#else
for (int chr = *word++; chr != 0; chr = *word++)
hash = (31 * hash + chr) % N; // 31 because math
#endif
return hash;
}
#if FASTALLOC
node *freelist = NULL;
int freecount = 0;
// adjust this value to suit
#define ARENASIZE 1000
// newnode -- allocate from pool
node *
newnode(void)
{
node *cur;
while (1) {
// get node from free list cache
cur = freelist;
// use the cached node if possible
if (cur != NULL) {
freelist = cur->next;
break;
}
// allocate a new arena
freelist = malloc(sizeof(node) * ARENASIZE);
// out of memory
if (freelist == NULL)
break;
// link all nodes in the arena
node *prev = NULL;
for (cur = freelist; cur < &freelist[ARENASIZE]; ++cur) {
cur->alloc = 0;
cur->next = prev;
prev = cur;
}
// mark the arena head
freelist->alloc = 1;
}
return cur;
}
#endif
// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{
// TODO
// open dictionary file
FILE *dict = fopen(dictionary, "r");
if (dict == NULL) {
// printf("Could not open file.\n");
return false;
}
// create temp string for word
// NOTE/BUG: no need to malloc this (and it is slightly slower)
#if 0
char *word = malloc(LENGTH + 1);
if (word == NULL) {
return false;
}
#else
char word[LENGTH + 1];
#endif
// read words from dictionary and write to hash table
// NOTE/BUG: this scanf is wrong (and may loop too much)
#if 0
while (fscanf(dict, "%s", word) != EOF) {
#else
while (fscanf(dict, "%s", word) == 1) {
#endif
// node *tmp_word = NULL;
#if FASTALLOC
node *n = newnode();
#else
node *n = malloc(sizeof(node));
#endif
if (n == NULL)
return false;
// NOTE/FIX: convert to uppercase _once_
int idx;
for (idx = 0; word[idx] != 0; ++idx)
n->word[idx] = toupper(word[idx] & 0xFF);
n->word[idx] = 0;
// run hash to find the bucket in table
// NOTE/BUG: pos should be unsigned
#if 0
int pos = hash(word);
#else
unsigned int pos = hash(n->word);
#endif
#if 0
strcpy(n->word, word); // write word to temp node
#endif
#if NODEHASH
n->hash = pos;
#endif
// NOTE/BUG: no need to check table[pos]
#if 0
// check if node is first in bucket
// (may need to use calloc to create a table)
if (table[pos] == NULL)
{
n->next = NULL;
}
// if not the first, write node to the head
else
{
n->next = table[pos]; // reference first node
}
#else
n->next = table[pos];
#endif
table[pos] = n; // write node to table
word_count++; // count new word
}
fclose(dict);
#if 0
free(word);
#endif
// debug
/* int j = 0; while (j <= 11) { for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей { printf("%s\n", tmp->word); //забираем значение ноды
} printf("Bucket number: %i\n", j); j++; } */
// printf("word count:%i\n", word_count);
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
return word_count;
}
// Unloads dictionary from memory, returning true if successful, else false
#if ! FASTALLOC
bool
unload(void)
{
// TODO
// NOTE/BUG: this goes one beyond the table end (i.e. want: i < N)
#if 0
for (int i = 0; i <= N; i++) {
while (table[i] != NULL) {
node *tmp = table[i]->next; // делаем ноду и переходим на следующую ячейку
free(table[i]); // освобождаем текущую
table[i] = tmp; // начинаем list со второй ячейки
}
}
#else
for (int i = 0; i < N; ++i) {
node *cur = table[i];
if (cur == NULL)
continue;
node *next;
for (; cur != NULL; cur = next) {
next = cur->next;
free(cur);
}
#endif
return true;
}
#endif
// Unloads dictionary from memory, returning true if successful, else false
#if FASTALLOC
bool
unload(void)
{
node *cur;
node *next;
// remove all nodes from the table
for (int i = 0; i < N; ++i) {
cur = table[i];
if (cur == NULL)
continue;
// put all nodes in hash table back on [cached] free list
for (; cur != NULL; cur = next) {
next = cur->next;
cur->next = freelist;
freelist = cur;
}
table[i] = NULL;
}
cur = freelist;
freelist = NULL;
// trim free list to allocation/arena heads
for (; cur != NULL; cur = next) {
next = cur->next;
if (cur->alloc) {
cur->next = freelist;
freelist = cur;
}
}
// free the allocated nodes
cur = freelist;
freelist = NULL;
for (; cur != NULL; cur = next) {
next = cur->next;
free(cur);
}
return true;
}
#endif
这是上面的清理版本(我 运行 通过 unifdef
):
// Implements a dictionary's functionality
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
//#include "dictionary.h"
// NOTE/BUG: dictionary.h was _not_ posted, so we have to guess
#define LENGTH 20
// Represents a node in a hash table
typedef struct node {
char word[LENGTH + 1];
struct node *next;
unsigned int hash;
int alloc;
} node;
//Prototypes
unsigned int hash(const char *word);
// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
enum {
N = 187751
};
// Hash table
node *table[N]; // may need to use calloc
//word count
int word_count = 0;
// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
// TODO
// debug
// int hashcode = hash(word);
// printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);
unsigned int hval = hash(word);
// iterate through bucket
for (node *tmp = table[hval]; tmp != NULL; tmp = tmp->next) {
// check hash value first -- limits number of str*cmp calls
if (tmp->hash != hval)
continue;
// compare strings
if (strcmp(tmp->word, word) == 0)
return true;
}
return false;
}
// Hashes word to a number
unsigned int
hash(const char *word)
{
// TODO: Improve this hash function
// Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
int hash = 0;
/* int w = 0; //convert to upper case while (word[w] != '[=12=]') { word[w] = toupper(word[w]); w++; } */
for (int chr = *word++; chr != 0; chr = *word++)
hash = (31 * hash + chr) % N; // 31 because math
return hash;
}
node *freelist = NULL;
int freecount = 0;
// adjust this value to suit
#define ARENASIZE 1000
// newnode -- allocate from pool
node *
newnode(void)
{
node *cur;
while (1) {
// get node from free list cache
cur = freelist;
// use the cached node if possible
if (cur != NULL) {
freelist = cur->next;
break;
}
// allocate a new arena
freelist = malloc(sizeof(node) * ARENASIZE);
// out of memory
if (freelist == NULL)
break;
// link all nodes in the arena
node *prev = NULL;
for (cur = freelist; cur < &freelist[ARENASIZE]; ++cur) {
cur->alloc = 0;
cur->next = prev;
prev = cur;
}
// mark the arena head
freelist->alloc = 1;
}
return cur;
}
// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{
// TODO
// open dictionary file
FILE *dict = fopen(dictionary, "r");
if (dict == NULL) {
// printf("Could not open file.\n");
return false;
}
// create temp string for word
// NOTE/BUG: no need to malloc this (and it is slightly slower)
char word[LENGTH + 1];
// read words from dictionary and write to hash table
// NOTE/BUG: this scanf is wrong (and may loop too much)
while (fscanf(dict, "%s", word) == 1) {
// node *tmp_word = NULL;
node *n = newnode();
if (n == NULL)
return false;
// NOTE/FIX: convert to uppercase _once_
int idx;
for (idx = 0; word[idx] != 0; ++idx)
n->word[idx] = toupper(word[idx] & 0xFF);
n->word[idx] = 0;
// run hash to find the bucket in table
// NOTE/BUG: pos should be unsigned
unsigned int pos = hash(n->word);
n->hash = pos;
// NOTE/BUG: no need to check table[pos]
n->next = table[pos];
table[pos] = n; // write node to table
word_count++; // count new word
}
fclose(dict);
// debug
/* int j = 0; while (j <= 11) { for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей { printf("%s\n", tmp->word); //забираем значение ноды
} printf("Bucket number: %i\n", j); j++; } */
// printf("word count:%i\n", word_count);
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
return word_count;
}
// Unloads dictionary from memory, returning true if successful, else false
bool
unload(void)
{
node *cur;
node *next;
// remove all nodes from the table
for (int i = 0; i < N; ++i) {
cur = table[i];
if (cur == NULL)
continue;
// put all nodes in hash table back on [cached] free list
for (; cur != NULL; cur = next) {
next = cur->next;
cur->next = freelist;
freelist = cur;
}
table[i] = NULL;
}
cur = freelist;
freelist = NULL;
// trim free list to allocation/arena heads
for (; cur != NULL; cur = next) {
next = cur->next;
if (cur->alloc) {
cur->next = freelist;
freelist = cur;
}
}
// free the allocated nodes
cur = freelist;
freelist = NULL;
for (; cur != NULL; cur = next) {
next = cur->next;
free(cur);
}
return true;
}
更新#2:
I've already submitted my version. Just trying to understand the subject better. – meotet
好的,因此,我制作了一个最终版本,这是我对速度的最大努力。如果你还在努力,我做了更多的重构。
注意事项...
为每个节点做一个 single malloc
是浪费。 malloc
比较慢。请注意,在您的基准计时中,主要是 load
阶段比员工实施慢。
此外,还有一个 hidden 结构,malloc
将它放在内存中的指针 returns 之前,用于跟踪拨款。此结构占用 space,对于像节点结构这样的小分配,额外的内存开销可能很大。
最好调用 malloc
并在连续块或数组中分配大量 node
结构。进一步的分配可以来自该块内。
这类似于“平板分配”、“对象池”或“内存池”:
在下面的代码中,节点以 1000 个为一组进行分配。这将 malloc
调用的次数从 100,000 次减少到大约 100 次。这个版本比我的上一个分配器提高了速度,因为它使用了一个额外的“段”(又名“竞技场”)结构来控制分配。它不再需要将每个节点结构预链接到“空闲列表”或跟踪节点是否是从 malloc
.
看字典不用toupper/tolower
。因为字典保证运行全部小写,每行只有一个字,所以用fgets
看字典速度更快。我们可以将字典单词 直接 读入给定 node
结构的 word
字段 [if 我们做节点在执行 fgets
].
在计算散列时,我们不需要在每次迭代中都执行 % N
。由于模运算的性质,我们可以在最后做一次取余运算一次。
我们还可以将 hash
值存储到 node
结构中。然后,在 check
中,我们可以比较哈希值并跳过一些 [expensive] strcmp
调用。在您的代码中,由于 N
太大,大多数哈希桶只有一两个条目,因此这种加速可能影响不大 [并且可能会稍微慢一些]。
在 check
中,最好将参数 word
复制到一个临时缓冲区中,在我们进行 一次 时小写。然后,我们可以使用较快的 strcmp
而不是较慢的 strcasecmp
。无论我们是否将哈希值存储在节点结构中,这都有效。同样,由于 N
值较大,此加速效果可能有限。
无论如何,这里是重构代码:
// fix2.c -- Implements a dictionary's functionality
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "dictionary.h"
#if _USE_ZPRT_
#define dbgprt(_fmt...) printf(_fmt)
#else
#define dbgprt(_fmt...) do { } while (0)
#endif
#ifndef NODEHASH
#define NODEHASH 1
#endif
#ifndef FASTNODE
#define FASTNODE 1
#endif
// Represents a node in a hash table
typedef struct node {
char word[LENGTH + 1 + 1];
struct node *next;
#if NODEHASH
unsigned int hash;
#endif
} node;
//Prototypes
unsigned int hash(const char *word);
// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
#ifndef N
#define N 187751
#endif
// Hash table
node *table[N]; // may need to use calloc
// word count
int word_count = 0;
// 31 because math
// modulus only really needed to clip hash to table index
// so do it _once_ by caller of hash/hash2
#if 0
#define HASH \
hash = (31 * hash + chr) % N
#else
#define HASH \
hash = (31 * hash) + chr
#endif
// Hashes word to a number and copies lowercase chars to output buffer
unsigned int
hash2(char *dest,const char *word)
{
unsigned int hash = 0;
for (int chr = *word++; chr != 0; chr = *word++) {
chr = tolower((unsigned char) chr);
HASH;
*dest++ = chr;
}
*dest = 0;
return hash;
}
// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
char tmp[LENGTH + 10];
unsigned int hval = hash2(tmp,word);
unsigned int hidx = hval % N;
// iterate through bucket
for (node *cur = table[hidx]; cur != NULL; cur = cur->next) {
// check hash value first -- limits number of str*cmp calls
#if NODEHASH
if (cur->hash != hval)
continue;
#endif
if (strcmp(cur->word, tmp) == 0)
return true;
}
return false;
}
// Hashes word to a number
unsigned int
hash(const char *word)
{
unsigned int hash = 0;
for (int chr = *word++; chr != 0; chr = *word++)
HASH;
return hash;
}
// adjust this value to suit
#ifndef ARENASIZE
#define ARENASIZE 1000
#endif
typedef struct seg seg_t;
struct seg {
seg_t *seg_next; // next segment
int seg_count; // number of used nodes in this segment
node seg_node[ARENASIZE]; // nodes in this segment
};
node *nodelist; // list of freed nodes
seg_t *seglist; // head of linked list of segments
// newnode -- allocate from pool
node *
newnode(void)
{
seg_t *seg;
node *cur;
while (1) {
// get node from free node cache
// use the cached node if possible
// this only happens if freenode were called [which it isn't]
cur = nodelist;
if (cur != NULL) {
nodelist = cur->next;
break;
}
// try to allocate from current segment
seg = seglist;
if (seg != NULL) {
if (seg->seg_count < ARENASIZE) {
cur = &seg->seg_node[seg->seg_count++];
break;
}
}
// allocate a new segment
seg = malloc(sizeof(*seg));
// out of memory
if (seg == NULL)
break;
// mark segment as completely unused
seg->seg_count = 0;
// attach to segment list
seg->seg_next = seglist;
seglist = seg;
}
return cur;
}
// freenode -- free a node
void
freenode(node *cur)
{
#if FASTNODE
cur->next = nodelist;
nodelist = cur;
#else
free(cur);
#endif
}
// segunload -- release all segments
void
segunload(void)
{
seg_t *seg;
seg_t *next;
seg = seglist;
seglist = NULL;
for (; seg != NULL; seg = next) {
next = seg->seg_next;
free(seg);
}
}
// Unloads dictionary from memory, returning true if successful, else false
bool
unload(void)
{
node *cur;
// remove all nodes from the table
for (int i = 0; i < N; ++i) {
cur = table[i];
// put all nodes in hash table back on [cached] free list
// NOTE: not really necessary
#if FASTNODE
word_count = 0;
cur = NULL;
#else
node *next;
for (; cur != NULL; cur = next, --word_count) {
next = cur->next;
freenode(cur);
}
#endif
table[i] = cur;
}
segunload();
return true;
}
// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{
// open dictionary file
FILE *dict = fopen(dictionary, "r");
if (dict == NULL)
return false;
// read words from dictionary and write to hash table
while (1) {
#if FASTNODE
node *n = newnode();
#else
node *n = malloc(sizeof(*n));
#endif
if (n == NULL)
return false;
char *word = n->word;
if (fgets(word,sizeof(n->word),dict) == NULL) {
freenode(n);
break;
}
// strip newline
#if 0
word += strlen(word);
if (--word >= n->word) {
if (*word == '\n');
*word = 0;
}
#else
word = strchr(word,'\n');
if (word != NULL)
*word = 0;
#endif
// run hash to find the bucket in table
unsigned int pos = hash(n->word);
#if NODEHASH
n->hash = pos;
#endif
// put node in hash table
pos %= N;
n->next = table[pos];
table[pos] = n;
word_count++;
}
fclose(dict);
// DEBUG: show the bucket counts
#if _USE_ZPRT_
node *cur;
int mincount = 0;
int maxcount = 0;
int buckcount = 0;
int totcount = 0;
for (int i = 0; i < N; ++i) {
cur = table[i];
if (cur == NULL)
continue;
int count = 0;
for (; cur != NULL; cur = cur->next)
++count;
printf("load: BUCKET/%d count=%d\n",i,count);
if (i == 0) {
mincount = count;
maxcount = count;
}
if (count < mincount)
mincount = count;
if (count > maxcount)
maxcount = count;
totcount += count;
++buckcount;
}
printf("load: TOTAL buckcnt=%d/%d mincount=%d maxcount=%d avgcount=%d/%d\n",
buckcount,N,mincount,maxcount,totcount / buckcount,totcount / N);
#endif
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
return word_count;
}