在 C 中实现拼写检查器:Valgrind 报告内存错误
Implementing a spell checker in C: Valgrind reports memory errors
我正在尝试实现哈佛 CS50 的第五题:C 中的拼写检查器。
我的程序本身确实有效。但是,我确实一直遇到内存分配错误。我已经使用 Valgrind 试图找出错误的确切位置,但没有成功。我所在大学的教学人员也查看了代码,但不幸的是也无法确定错误的来源。
下面的第一段代码实现了另一个程序的功能,该程序执行实际的拼写检查。第二个是 Valgrind 日志。在第三块中,显示了帮助文件。最后,还显示了主要的拼写检查器(为了完整性)。
// 实现字典的功能
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <strings.h>
// The struct 'node' is defined here
#include "dictionary.h"
// defining SIZE as a constant 26
#define SIZE 26
// Implementing the amount of 'buckets'
node *hashtable[SIZE];
// Implementing the hash function
int hash_function(char* word)
{
unsigned int hash = toupper(word[0]) - 'A';
return hash % SIZE;
}
// Implementing the counter for 'size'
int number = 0;
// Returns true if word is in dictionary else false
bool check(const char *word)
{
// Converts the word that is going to be checked from a const char* to a modifiable string
// Naming word to be checked 'wtbc' and converting it to all lower case letters
int length = strlen(word) + 1;
char wtbc[length];
for (int i = 0; i < (length - 1); i++)
{
wtbc[i] = tolower(word[i]);
}
wtbc[(length - 1)] = '[=10=]';
// Calls on the hashfunction to output the 'bucket' where the wtbc is compared to the dictionary
node* cursor = hashtable[hash_function(wtbc)];
int result = 0;
while (cursor != NULL)
{
result = strcasecmp(cursor->word, wtbc);
if (result == 0)
{
return true;
}
cursor = cursor -> next;
}
return false;
}
// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
// Making sure each 'head / bucket' points to NULL, using a variable named ptn (pointer to null)
for (int ptn = 0; ptn < SIZE; ptn++)
{
hashtable[ptn] = NULL;
}
// Iterate over each word in the dictionary
// Open the file (FILE I/O FUNCTIONS)
FILE *input = fopen(dictionary, "r");
if (input == NULL)
{
fprintf(stderr, "Could not open dictionary.\n");
return 1;
}
// Converting the const char *dictionary to a usable string
char dictionaryword[(LENGTH + 1)];
// Each new word should become a new node
while (fscanf(input, "%s", dictionaryword) != EOF)
{
// Hash the word: insert into hash table
int bucket = hash_function(dictionaryword);
// Creating a new node
node *new_node = malloc(sizeof(node));
if (new_node == NULL)
{
unload();
return false;
}
// Copy the new word into the new node
strcpy(new_node->word, dictionaryword);
// Creates a new node for the new word if the current head node is pointing to NULL
if (hashtable[bucket] == NULL)
{
// Pointing the head (hashtable[bucket] to point to the first new node)
hashtable[bucket] = new_node;
}
else
{
// Make sure the node points to the first node in the bucket
new_node->next = hashtable[bucket];
hashtable[bucket] = new_node;
}
number++;
}
fclose(input);
return true;
}
// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
return number;
}
// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
for (int j = 0; j < SIZE; j++)
{
node* cursorfree = hashtable[j];
while (cursorfree)
{
// create a temporary node to save the position of the next node
node* temp = cursorfree;
// free the current node
cursorfree = cursorfree->next;
free(temp);
}
}
return true;
}
Valgrind日志如下:
==3226== Memcheck, a memory error detector
==3226== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==3226== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==3226== Command: ./speller texts/lalaland.txt
==3226==
MISSPELLED WORDS
==3226== Conditional jump or move depends on uninitialised value(s)
==3226== at 0x4228DC: check (dictionary.c:46)
==3226== by 0x421443: main (speller.c:112)
==3226== Uninitialised value was created by a heap allocation
==3226== at 0x4C2AB80: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
==3226== by 0x422C23: load (dictionary.c:89)
==3226== by 0x420A72: main (speller.c:40)
==3226==
Chazelle
L
TECHNO
L
Thelonious
Prius
MIA
L
MIA
...
Sebastian's
Mia
Mia
Mia
MIA
Mia
Mia
Mia
Sebastian's
L
==3226== Conditional jump or move depends on uninitialised value(s)
==3226== at 0x4231FC: unload (dictionary.c:132)
==3226== by 0x421644: main (speller.c:152)
==3226== Uninitialised value was created by a heap allocation
==3226== at 0x4C2AB80: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
==3226== by 0x422C23: load (dictionary.c:89)
==3226== by 0x420A72: main (speller.c:40)
==3226==
WORDS MISSPELLED: 955
WORDS IN DICTIONARY: 143091
WORDS IN TEXT: 17756
TIME IN load: 2.07
TIME IN check: 140.60
TIME IN size: 0.00
TIME IN unload: 0.22
TIME IN TOTAL: 142.88
==3226==
==3226== HEAP SUMMARY:
==3226== in use at exit: 0 bytes in 0 blocks
==3226== total heap usage: 143,093 allocs, 143,093 frees, 8,014,232 bytes allocated
==3226==
==3226== All heap blocks were freed -- no leaks are possible
==3226==
==3226== For counts of detected and suppressed errors, rerun with: -v
==3226== ERROR SUMMARY: 981 errors from 2 contexts (suppressed: 0 from 0)
我的节点结构在以下文件中定义:
// Declares a dictionary's functionality
#ifndef DICTIONARY_H
#define DICTIONARY_H
#include <stdbool.h>
// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 46
typedef struct node
{
char word[LENGTH + 1];
struct node *next;
}
node;
node* head;
// Prototypes
bool check(const char *word);
bool load(const char *dictionary);
unsigned int size(void);
bool unload(void);
#endif // DICTIONARY_H
为了完整起见,这里是主要的拼写检查程序:
// Implements a spell-checker
#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include "dictionary.h"
// Undefine any definitions
#undef calculate
#undef getrusage
// Default dictionary
#define DICTIONARY "dictionaries/large"
// Prototype
double calculate(const struct rusage *b, const struct rusage *a);
int main(int argc, char *argv[])
{
// Check for correct number of args
if (argc != 2 && argc != 3)
{
printf("Usage: speller [dictionary] text\n");
return 1;
}
// Structures for timing data
struct rusage before, after;
// Benchmarks
double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;
// Determine dictionary to use
char *dictionary = (argc == 3) ? argv[1] : DICTIONARY;
// Load dictionary
getrusage(RUSAGE_SELF, &before);
bool loaded = load(dictionary);
getrusage(RUSAGE_SELF, &after);
// Exit if dictionary not loaded
if (!loaded)
{
printf("Could not load %s.\n", dictionary);
return 1;
}
// Calculate time to load dictionary
time_load = calculate(&before, &after);
// Try to open text
char *text = (argc == 3) ? argv[2] : argv[1];
FILE *file = fopen(text, "r");
if (file == NULL)
{
printf("Could not open %s.\n", text);
unload();
return 1;
}
// Prepare to report misspellings
printf("\nMISSPELLED WORDS\n\n");
// Prepare to spell-check
int index = 0, misspellings = 0, words = 0;
char word[LENGTH + 1];
// Spell-check each word in text
for (int c = fgetc(file); c != EOF; c = fgetc(file))
{
// Allow only alphabetical characters and apostrophes
if (isalpha(c) || (c == '\'' && index > 0))
{
// Append character to word
word[index] = c;
index++;
// Ignore alphabetical strings too long to be words
if (index > LENGTH)
{
// Consume remainder of alphabetical string
while ((c = fgetc(file)) != EOF && isalpha(c));
// Prepare for new word
index = 0;
}
}
// Ignore words with numbers (like MS Word can)
else if (isdigit(c))
{
// Consume remainder of alphanumeric string
while ((c = fgetc(file)) != EOF && isalnum(c));
// Prepare for new word
index = 0;
}
// We must have found a whole word
else if (index > 0)
{
// Terminate current word
word[index] = '[=13=]';
// Update counter
words++;
// Check word's spelling
getrusage(RUSAGE_SELF, &before);
bool misspelled = !check(word);
getrusage(RUSAGE_SELF, &after);
// Update benchmark
time_check += calculate(&before, &after);
// Print word if misspelled
if (misspelled)
{
printf("%s\n", word);
misspellings++;
}
// Prepare for next word
index = 0;
}
}
// Check whether there was an error
if (ferror(file))
{
fclose(file);
printf("Error reading %s.\n", text);
unload();
return 1;
}
// Close text
fclose(file);
// Determine dictionary's size
getrusage(RUSAGE_SELF, &before);
unsigned int n = size();
getrusage(RUSAGE_SELF, &after);
// Calculate time to determine dictionary's size
time_size = calculate(&before, &after);
// Unload dictionary
getrusage(RUSAGE_SELF, &before);
bool unloaded = unload();
getrusage(RUSAGE_SELF, &after);
// Abort if dictionary not unloaded
if (!unloaded)
{
printf("Could not unload %s.\n", dictionary);
return 1;
}
// Calculate time to unload dictionary
time_unload = calculate(&before, &after);
// Report benchmarks
printf("\nWORDS MISSPELLED: %d\n", misspellings);
printf("WORDS IN DICTIONARY: %d\n", n);
printf("WORDS IN TEXT: %d\n", words);
printf("TIME IN load: %.2f\n", time_load);
printf("TIME IN check: %.2f\n", time_check);
printf("TIME IN size: %.2f\n", time_size);
printf("TIME IN unload: %.2f\n", time_unload);
printf("TIME IN TOTAL: %.2f\n\n",
time_load + time_check + time_size + time_unload);
// Success
return 0;
}
// Returns number of seconds between b and a
double calculate(const struct rusage *b, const struct rusage *a)
{
if (b == NULL || a == NULL)
{
return 0.0;
}
else
{
return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
(b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
(b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
/ 1000000.0);
}
}
抱歉这么久 post。我不确定有多少代码与解决这个问题有关,我认为最好尽可能多地上传。我已经在这个问题上停留了一段时间了。如果有人知道导致此错误的原因,我将不胜感激!
当没有散列冲突时,您将让 next
字段未初始化。这会导致稍后在检查 next
时出现未定义的行为。
malloc
没有为您做任何零初始化。您必须将 NULL
分配给 next
。
事实上这整个片段
// Creates a new node for the new word if the current head node is pointing to NULL
if (hashtable[bucket] == NULL)
{
// Pointing the head (hashtable[bucket] to point to the first new node)
hashtable[bucket] = new_node;
new_node->next = NULL; // <--- this is missing
}
else
{
// Make sure the node points to the first node in the bucket
new_node->next = hashtable[bucket];
hashtable[bucket] = new_node;
}
可以减少到
new_node->next = hashtable[bucket];
hashtable[bucket] = new_node;
它总是做正确的事,无论 hashtable[bucket]
是否包含 NULL
。
我正在尝试实现哈佛 CS50 的第五题:C 中的拼写检查器。
我的程序本身确实有效。但是,我确实一直遇到内存分配错误。我已经使用 Valgrind 试图找出错误的确切位置,但没有成功。我所在大学的教学人员也查看了代码,但不幸的是也无法确定错误的来源。
下面的第一段代码实现了另一个程序的功能,该程序执行实际的拼写检查。第二个是 Valgrind 日志。在第三块中,显示了帮助文件。最后,还显示了主要的拼写检查器(为了完整性)。
// 实现字典的功能
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <strings.h>
// The struct 'node' is defined here
#include "dictionary.h"
// defining SIZE as a constant 26
#define SIZE 26
// Implementing the amount of 'buckets'
node *hashtable[SIZE];
// Implementing the hash function
int hash_function(char* word)
{
unsigned int hash = toupper(word[0]) - 'A';
return hash % SIZE;
}
// Implementing the counter for 'size'
int number = 0;
// Returns true if word is in dictionary else false
bool check(const char *word)
{
// Converts the word that is going to be checked from a const char* to a modifiable string
// Naming word to be checked 'wtbc' and converting it to all lower case letters
int length = strlen(word) + 1;
char wtbc[length];
for (int i = 0; i < (length - 1); i++)
{
wtbc[i] = tolower(word[i]);
}
wtbc[(length - 1)] = '[=10=]';
// Calls on the hashfunction to output the 'bucket' where the wtbc is compared to the dictionary
node* cursor = hashtable[hash_function(wtbc)];
int result = 0;
while (cursor != NULL)
{
result = strcasecmp(cursor->word, wtbc);
if (result == 0)
{
return true;
}
cursor = cursor -> next;
}
return false;
}
// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
// Making sure each 'head / bucket' points to NULL, using a variable named ptn (pointer to null)
for (int ptn = 0; ptn < SIZE; ptn++)
{
hashtable[ptn] = NULL;
}
// Iterate over each word in the dictionary
// Open the file (FILE I/O FUNCTIONS)
FILE *input = fopen(dictionary, "r");
if (input == NULL)
{
fprintf(stderr, "Could not open dictionary.\n");
return 1;
}
// Converting the const char *dictionary to a usable string
char dictionaryword[(LENGTH + 1)];
// Each new word should become a new node
while (fscanf(input, "%s", dictionaryword) != EOF)
{
// Hash the word: insert into hash table
int bucket = hash_function(dictionaryword);
// Creating a new node
node *new_node = malloc(sizeof(node));
if (new_node == NULL)
{
unload();
return false;
}
// Copy the new word into the new node
strcpy(new_node->word, dictionaryword);
// Creates a new node for the new word if the current head node is pointing to NULL
if (hashtable[bucket] == NULL)
{
// Pointing the head (hashtable[bucket] to point to the first new node)
hashtable[bucket] = new_node;
}
else
{
// Make sure the node points to the first node in the bucket
new_node->next = hashtable[bucket];
hashtable[bucket] = new_node;
}
number++;
}
fclose(input);
return true;
}
// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
return number;
}
// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
for (int j = 0; j < SIZE; j++)
{
node* cursorfree = hashtable[j];
while (cursorfree)
{
// create a temporary node to save the position of the next node
node* temp = cursorfree;
// free the current node
cursorfree = cursorfree->next;
free(temp);
}
}
return true;
}
Valgrind日志如下:
==3226== Memcheck, a memory error detector
==3226== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==3226== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==3226== Command: ./speller texts/lalaland.txt
==3226==
MISSPELLED WORDS
==3226== Conditional jump or move depends on uninitialised value(s)
==3226== at 0x4228DC: check (dictionary.c:46)
==3226== by 0x421443: main (speller.c:112)
==3226== Uninitialised value was created by a heap allocation
==3226== at 0x4C2AB80: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
==3226== by 0x422C23: load (dictionary.c:89)
==3226== by 0x420A72: main (speller.c:40)
==3226==
Chazelle
L
TECHNO
L
Thelonious
Prius
MIA
L
MIA
...
Sebastian's
Mia
Mia
Mia
MIA
Mia
Mia
Mia
Sebastian's
L
==3226== Conditional jump or move depends on uninitialised value(s)
==3226== at 0x4231FC: unload (dictionary.c:132)
==3226== by 0x421644: main (speller.c:152)
==3226== Uninitialised value was created by a heap allocation
==3226== at 0x4C2AB80: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
==3226== by 0x422C23: load (dictionary.c:89)
==3226== by 0x420A72: main (speller.c:40)
==3226==
WORDS MISSPELLED: 955
WORDS IN DICTIONARY: 143091
WORDS IN TEXT: 17756
TIME IN load: 2.07
TIME IN check: 140.60
TIME IN size: 0.00
TIME IN unload: 0.22
TIME IN TOTAL: 142.88
==3226==
==3226== HEAP SUMMARY:
==3226== in use at exit: 0 bytes in 0 blocks
==3226== total heap usage: 143,093 allocs, 143,093 frees, 8,014,232 bytes allocated
==3226==
==3226== All heap blocks were freed -- no leaks are possible
==3226==
==3226== For counts of detected and suppressed errors, rerun with: -v
==3226== ERROR SUMMARY: 981 errors from 2 contexts (suppressed: 0 from 0)
我的节点结构在以下文件中定义:
// Declares a dictionary's functionality
#ifndef DICTIONARY_H
#define DICTIONARY_H
#include <stdbool.h>
// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 46
typedef struct node
{
char word[LENGTH + 1];
struct node *next;
}
node;
node* head;
// Prototypes
bool check(const char *word);
bool load(const char *dictionary);
unsigned int size(void);
bool unload(void);
#endif // DICTIONARY_H
为了完整起见,这里是主要的拼写检查程序:
// Implements a spell-checker
#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include "dictionary.h"
// Undefine any definitions
#undef calculate
#undef getrusage
// Default dictionary
#define DICTIONARY "dictionaries/large"
// Prototype
double calculate(const struct rusage *b, const struct rusage *a);
int main(int argc, char *argv[])
{
// Check for correct number of args
if (argc != 2 && argc != 3)
{
printf("Usage: speller [dictionary] text\n");
return 1;
}
// Structures for timing data
struct rusage before, after;
// Benchmarks
double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;
// Determine dictionary to use
char *dictionary = (argc == 3) ? argv[1] : DICTIONARY;
// Load dictionary
getrusage(RUSAGE_SELF, &before);
bool loaded = load(dictionary);
getrusage(RUSAGE_SELF, &after);
// Exit if dictionary not loaded
if (!loaded)
{
printf("Could not load %s.\n", dictionary);
return 1;
}
// Calculate time to load dictionary
time_load = calculate(&before, &after);
// Try to open text
char *text = (argc == 3) ? argv[2] : argv[1];
FILE *file = fopen(text, "r");
if (file == NULL)
{
printf("Could not open %s.\n", text);
unload();
return 1;
}
// Prepare to report misspellings
printf("\nMISSPELLED WORDS\n\n");
// Prepare to spell-check
int index = 0, misspellings = 0, words = 0;
char word[LENGTH + 1];
// Spell-check each word in text
for (int c = fgetc(file); c != EOF; c = fgetc(file))
{
// Allow only alphabetical characters and apostrophes
if (isalpha(c) || (c == '\'' && index > 0))
{
// Append character to word
word[index] = c;
index++;
// Ignore alphabetical strings too long to be words
if (index > LENGTH)
{
// Consume remainder of alphabetical string
while ((c = fgetc(file)) != EOF && isalpha(c));
// Prepare for new word
index = 0;
}
}
// Ignore words with numbers (like MS Word can)
else if (isdigit(c))
{
// Consume remainder of alphanumeric string
while ((c = fgetc(file)) != EOF && isalnum(c));
// Prepare for new word
index = 0;
}
// We must have found a whole word
else if (index > 0)
{
// Terminate current word
word[index] = '[=13=]';
// Update counter
words++;
// Check word's spelling
getrusage(RUSAGE_SELF, &before);
bool misspelled = !check(word);
getrusage(RUSAGE_SELF, &after);
// Update benchmark
time_check += calculate(&before, &after);
// Print word if misspelled
if (misspelled)
{
printf("%s\n", word);
misspellings++;
}
// Prepare for next word
index = 0;
}
}
// Check whether there was an error
if (ferror(file))
{
fclose(file);
printf("Error reading %s.\n", text);
unload();
return 1;
}
// Close text
fclose(file);
// Determine dictionary's size
getrusage(RUSAGE_SELF, &before);
unsigned int n = size();
getrusage(RUSAGE_SELF, &after);
// Calculate time to determine dictionary's size
time_size = calculate(&before, &after);
// Unload dictionary
getrusage(RUSAGE_SELF, &before);
bool unloaded = unload();
getrusage(RUSAGE_SELF, &after);
// Abort if dictionary not unloaded
if (!unloaded)
{
printf("Could not unload %s.\n", dictionary);
return 1;
}
// Calculate time to unload dictionary
time_unload = calculate(&before, &after);
// Report benchmarks
printf("\nWORDS MISSPELLED: %d\n", misspellings);
printf("WORDS IN DICTIONARY: %d\n", n);
printf("WORDS IN TEXT: %d\n", words);
printf("TIME IN load: %.2f\n", time_load);
printf("TIME IN check: %.2f\n", time_check);
printf("TIME IN size: %.2f\n", time_size);
printf("TIME IN unload: %.2f\n", time_unload);
printf("TIME IN TOTAL: %.2f\n\n",
time_load + time_check + time_size + time_unload);
// Success
return 0;
}
// Returns number of seconds between b and a
double calculate(const struct rusage *b, const struct rusage *a)
{
if (b == NULL || a == NULL)
{
return 0.0;
}
else
{
return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
(b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
(b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
/ 1000000.0);
}
}
抱歉这么久 post。我不确定有多少代码与解决这个问题有关,我认为最好尽可能多地上传。我已经在这个问题上停留了一段时间了。如果有人知道导致此错误的原因,我将不胜感激!
当没有散列冲突时,您将让 next
字段未初始化。这会导致稍后在检查 next
时出现未定义的行为。
malloc
没有为您做任何零初始化。您必须将 NULL
分配给 next
。
事实上这整个片段
// Creates a new node for the new word if the current head node is pointing to NULL
if (hashtable[bucket] == NULL)
{
// Pointing the head (hashtable[bucket] to point to the first new node)
hashtable[bucket] = new_node;
new_node->next = NULL; // <--- this is missing
}
else
{
// Make sure the node points to the first node in the bucket
new_node->next = hashtable[bucket];
hashtable[bucket] = new_node;
}
可以减少到
new_node->next = hashtable[bucket];
hashtable[bucket] = new_node;
它总是做正确的事,无论 hashtable[bucket]
是否包含 NULL
。