在字符数组 C 中搜索

Question

我想在 bsearch 排序的文本中找到字符串（在代码中命名为 word）。为什么 bsearch 似乎不起作用？

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

int cmp(const void *a, const void *b) {
    const char *ia = *(const char **)a;
    const char *ib = *(const char **)b;
    return strcmp(ia, ib);
}

int main() {
    char *text = malloc(sizeof(char *) * 1000);
    char *word = malloc(sizeof(char *) * 30);
    char **all_words = malloc(sizeof(char *) * 1);
    int count_word = 0;

    fgets(text, 1000, stdin);
    fgets(word, 30, stdin);
    char *tok = strtok(text, " .\n");

    while (tok != NULL) {
        all_words[count_word] = malloc(sizeof(char) * (strlen(tok) + 1));
        all_words[count_word] = tok;
        all_words[count_word][strlen(tok)] = '[=10=]';

        count_word++;
        all_words = realloc(all_words, (count_word + 1) * sizeof(char *));

        tok = strtok(NULL, " .\n");
    }

    qsort(all_words, count_word, sizeof(char *), cmp);
    for (int i = 0; i < count_word; i++) {
        puts(all_words[i]);
    }

    void *pointer = bsearch(&word, all_words, count_word, sizeof(char*), cmp);

    if (pointer != NULL) {
        puts("exists");
    } else {
        puts("doesn't exist");
    }

    for (int i = 0; i < count_word; i++) {
        free(all_words[i]);    //---- error here
    }

    free(all_words);
    free(word);
    free(text);
}

为什么在使用 strtok 后释放 char* 数组时出现此错误？使用strtok后有问题吗？（由@wildplasser 和@JonathanLeffler tnx 解决）

还有一个问题是关于我的 cmp 函数的。我知道它可以正常工作，但我不记得为什么会这样……这段代码有什么区别？（也由@JonathanLeffler 解释）

int cmp(const void *a, const void *b) {
    const char *ia = (const char *)a;
    const char *ib = (const char *)b;
    puts(ia);
    return strcmp(ia, ib);
}

Answer 1

如评论中所述，主要问题是您在不使用 strcpy() 的情况下错误分配了令牌。作为 wildplasser noted in a , it is simpler to use the POSIX strdup() 函数（它还不是标准 C 的一部分，尽管在 C2x 中可能会改变）。

第二个问题是 word 包含换行符，而标记从不包含换行符。你需要删除它。

此代码进行了这些更改并添加了一个函数来转储字符串数组。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

static int cmp(const void *a, const void *b)
{
    const char *ia = *(const char **)a;
    const char *ib = *(const char **)b;
    return strcmp(ia, ib);
}

static void dump_strings(const char *tag, size_t num_words, char **words);

int main(void)
{
    char *text = malloc(sizeof(char *) * 1000);
    char *word = malloc(sizeof(char *) * 30);
    char **all_words = malloc(sizeof(char *) * 1);
    int count_word = 0;

    fgets(text, 1000, stdin);
    fgets(word, 30, stdin);
    word[strcspn(word, "\n")] = '[=10=]';
    char *tok = strtok(text, " .\n");

    while (tok != NULL)
    {
        all_words[count_word++] = strdup(tok);
        all_words = realloc(all_words, (count_word + 1) * sizeof(char *));
        tok = strtok(NULL, " .\n");
    }

    dump_strings("Before", count_word, all_words);

    qsort(all_words, count_word, sizeof(char *), cmp);
    dump_strings("After", count_word, all_words);

    void *pointer = bsearch(&word, all_words, count_word, sizeof(char *), cmp);

    if (pointer != NULL)
       printf("Word [%s] exists\n", word);
    else
       printf("Word [%s] does not exists\n", word);

    for (int i = 0; i < count_word; i++)
        free(all_words[i]);

    free(all_words);
    free(word);
    free(text);
    return 0;
}

static void dump_strings(const char *tag, size_t num_words, char **words)
{
    printf("%s (%zu):\n", tag, num_words);
    for (size_t i = 0; i < num_words; i++)
        printf("%zu: [%s]\n", i, words[i]);
}

那是在一个文件 cpystr89.c 中，编译成一个名为 cpystr89 的可执行文件。我使用以下 shell 脚本来运行命令：

./cpystr89 <<'EOF'
this is the text of the long string with many words in it and some repetition and so on.  of course, it is in mono-case for simplicity.  messieurs o'rourke and o'sullivan cause trouble; so does a semicolon
the
EOF

输出是：

Before (37):
0: [this]
1: [is]
2: [the]
3: [text]
4: [of]
5: [the]
6: [long]
7: [string]
8: [with]
9: [many]
10: [words]
11: [in]
12: [it]
13: [and]
14: [some]
15: [repetition]
16: [and]
17: [so]
18: [on]
19: [of]
20: [course,]
21: [it]
22: [is]
23: [in]
24: [mono-case]
25: [for]
26: [simplicity]
27: [messieurs]
28: [o'rourke]
29: [and]
30: [o'sullivan]
31: [cause]
32: [trouble;]
33: [so]
34: [does]
35: [a]
36: [semicolon]
After (37):
0: [a]
1: [and]
2: [and]
3: [and]
4: [cause]
5: [course,]
6: [does]
7: [for]
8: [in]
9: [in]
10: [is]
11: [is]
12: [it]
13: [it]
14: [long]
15: [many]
16: [messieurs]
17: [mono-case]
18: [o'rourke]
19: [o'sullivan]
20: [of]
21: [of]
22: [on]
23: [repetition]
24: [semicolon]
25: [simplicity]
26: [so]
27: [so]
28: [some]
29: [string]
30: [text]
31: [the]
32: [the]
33: [this]
34: [trouble;]
35: [with]
36: [words]
Word [the] exists

输入循环中的realloc()代码不理想。通常，您应该对每次迭代增加一个单元的内存分配的任何循环持怀疑态度。随着时间的推移，这会导致二次行为，因为旧的分配被复制到每个（或大多数）分配上。通常最好使用一种分配许多额外指针并一次分发一个的策略。在这种情况下，您可以简单地分配 strlen(text) / 2 + 1 个指针；最多每隔一个字符就是一个单词（例如，如果您键入 a b c d 作为文本）。通常，您会使用两个计数器：num_alloc 和 num_used。 num_alloc值记录分配了多少个指针； num_used 值记录正在使用的数字。您可以在每次调用时将 num_alloc 加倍。如果你担心输入完成后过度分配，你可以使用收缩 realloc() 来释放额外的 space.

您的代码没有检查每个内存分配是否成功。应该！

我还会观察到我不会将 malloc() 用于 text 和 word。您可以在堆栈上分配几千字节的内存。当您开始在堆栈上分配数百 KB 时，就会出现问题。 Windows 上的默认堆栈限制通常为 1 MiB；在大多数类 Unix 系统上，它是 8 MiB。在任何一种情况下，一旦您考虑分配数百 KB 的内存，就该考虑使用 malloc() 等进行动态内存分配。不分配这些可以简化错误清理工作。

此代码实现了这些更改。请注意对 bsearch().

调用的更改

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

static int cmp(const void *a, const void *b)
{
    const char *ia = *(const char **)a;
    const char *ib = *(const char **)b;
    return strcmp(ia, ib);
}

static void dump_strings(const char *tag, size_t num_words, char **words);
static void free_strings(size_t num_words, char **words);

int main(void)
{
    char text[1000];
    char word[30];
    char **all_words = NULL;
    int count_words = 0;
    int alloc_words = 0;

    fgets(text, 1000, stdin);
    fgets(word, 30, stdin);
    word[strcspn(word, "\n")] = '[=13=]';

    for (char *tok = strtok(text, " .\n"); tok != NULL; tok = strtok(NULL, " .\n"))
    {
        if (count_words >= alloc_words)
        {
            size_t new_count = alloc_words * 2 + 2;     // + 2 to get away from zero
            char **new_words = realloc(all_words, new_count * sizeof(*all_words));
            if (new_words == NULL)
            {
                fprintf(stderr, "Failed to reallocate %zu bytes memory\n",
                        new_count * sizeof(*all_words));
                free_strings(count_words, all_words);
                exit(EXIT_FAILURE);
            }
            all_words = new_words;
            alloc_words = new_count;
        }
        if ((all_words[count_words++] = strdup(tok)) == NULL)
        {
            fprintf(stderr, "Failed to allocate %zu bytes memory\n", strlen(tok) + 1);
            free_strings(count_words, all_words);
            exit(EXIT_FAILURE);
        }
    }

    dump_strings("Before", count_words, all_words);

    qsort(all_words, count_words, sizeof(char *), cmp);
    dump_strings("After", count_words, all_words);

    char *p_word = word;        // Irksome but necessary; using &word in call to bsearch() crashes
    void *pointer = bsearch(&p_word, all_words, count_words, sizeof(char *), cmp);

    if (pointer != NULL)
       printf("Word [%s] exists\n", word);
    else
       printf("Word [%s] does not exists\n", word);

    free_strings(count_words, all_words);

    return 0;
}

static void dump_strings(const char *tag, size_t num_words, char **words)
{
    printf("%s (%zu):\n", tag, num_words);
    for (size_t i = 0; i < num_words; i++)
        printf("%zu: [%s]\n", i, words[i]);
}

static void free_strings(size_t num_words, char **words)
{
    for (size_t i = 0; i < num_words; i++)
        free(words[i]);
    free(words);
}

输出——惊喜，惊喜——和以前一样。

我还将错误报告和退出代码封装到函数中。我要使用的代码可以在我的 SOQ (Stack Overflow Questions) repository on GitHub as files stderr.c and stderr.h in the src/libsoq 子目录中找到。

        if ((all_words[count_words++] = strdup(tok)) == NULL)
        {
            free_strings(count_words, all_words);
            err_syserr("Failed to allocate %zu bytes memory\n", strlen(tok) + 1);
        }

这不像有时因为调用 free_strings() 那样引人注目，但对于简单的情况，它将 4 行代码减少到 1 行，这让我不太愿意添加必要的错误检查。

Answer 2

bsearch 使用的比较函数考虑一个元素数组（您必须将其大小作为参数传递），然后通过引用进行比较（这意味着，您会收到指向字符的双指针，而不是单个指针）这是因为 bsearch 例程是通用的，可以传递给它的元素可以是结构或大型数组本身。唯一的要求是它们必须大小相等。

这就是您在调用比较器时取消引用指针然后使用取消引用的指针调用 strcmp 的原因。 Strcmp 只得到指向 char 的指针，而不是指向 char 的指针的引用，这就是 strcmp 本身不足以用来传递给 bsearch 的原因。同样的问题适用于qsort和朋友

最后一个提示：如果不需要，切勿使用强制转换。这适用于您在 ia 和 ib 的声明中所做的转换。你最好把你的比较函数写成：

int cmp(const void *a, const void *b) {
    const char * const *ia = a;
    const char * const *ib = b;
    return strcmp(*ia, *ib);
}

如果你犯了错误，你会收到编译器的投诉（就像我在准备这个例子时所做的那样；））。

在字符数组 C 中搜索

Bsearch in char array C

c

arrays

string

qsort