在 C 中查找数组中的 n 个重复数字

Question

我已经使用 C 中的哈希表库函数实现了下面的问题陈述。由于我从未在 C 中使用过标准库哈希表，所以我的问题是：

我是否正确使用哈希表函数（我相信获得输出并不意味着正确使用）？
是否有更好的方法来解决给定的问题陈述？

问题陈述：找到数组中出现次数最多的 n 个元素。

1 < N < 100000 [数组长度]
-1000000 < n < 1000000 [数组整数]

我已经在 SO 上解决了一些类似的问题 - 在其中一个 answers 中，我确实看到推荐的方法是使用哈希表。

#include <stdio.h>
#include <stdlib.h>
#include <search.h>
#include <stdbool.h>

#define REPEAT 3
#define BUFFERSIZE 10

void freqElement(int* arr, int len, int times);
int createHT(int* arr, int len);

int main(void)
{
    int arr[] = {2, 3, 5, 6, 10, 10, 2, 5, 2};
    int len = sizeof(arr)/sizeof(int);
    ENTRY e;
    ENTRY *ep;

    hcreate(len);

    if (!createHT(arr, len))
    {
        printf(" error in entering data \n");
    }

    freqElement(arr, len, REPEAT);

    hdestroy();
    return 0;
}

int createHT(int* arr, int len)
{
    ENTRY e, *ep;

    for(int i = 0; i < len; i++)
    {
        char buffer[BUFFERSIZE];
        snprintf(buffer, BUFFERSIZE, "%d", arr[i]);
        e.key = buffer;
        e.data = (void *)1;

        ep = hsearch(e, FIND);
        if (ep)
        {
            ep->data = (void *)((int)ep->data + (int)e.data);
        }
        ep = hsearch(e, ENTER);
        if (ep == NULL)
        {
            fprintf(stderr, "entry failed\n");
            exit(EXIT_FAILURE);
        }
    }
    return 1;
}

void freqElement(int* arr, int len, int times)
{
   ENTRY *ep, e;

   for (int i = 0; i < len; i++)
   {
       char buffer[BUFFERSIZE];
       snprintf(buffer, BUFFERSIZE, "%d", arr[i]);
       e.key = buffer;
       ep = hsearch(e, FIND);
       if(ep)
       {
           if((int)ep->data == times)
           {
               printf(" value %s is repeated %d times \n", ep->key, times);
               break;
           }
       }
   }

}

Answer 1

让我们从问题陈述开始，因为正如我在评论中提到的，我认为您当前的代码没有解决问题：

Find the n most frequent element in an array.

1 < N < 100000 [Length of Array]

-1000000 < n < 1000000 [Array integers]

我认为这意味着我们想要这样的函数：

size_t n_most_popular(int input[], size_t input_size, int output[], size_t output_size);

此函数采用整数（介于 -1000000 和 1000000 之间）的输入数组（大小最大为 100000），并使用输入中 N 个最常见的元素填充输出数组，其中 N 是 output_size.为了方便起见，我们可以规定将最常见的元素放在输出的前面，而不常见的元素放在后面。

一个直接的方法是首先对输入数组进行排序（可能就地，可能使用标准 qsort()）。然后你会有一个这样的数组：

[1,1,1,1,2,2,3,3,3,3,3,4,7,...]

然后，创建一个结构数组，其中每个结构都包含来自输入的唯一值，加上它出现的次数。它的最大长度是 input_size，并且从排序的输入中一次性构建它是微不足道的。

最后，使用标准 qsort() 按计数字段降序排列这个结构数组。将第一个 output_size 个元素复制到输出数组，return 输出数组的实际填充大小（如果输入数组中没有足够的唯一值，它可能小于 output_size ).

这里有一个工作的 C 程序可以做到这一点：

#include <string.h>
#include <stdio.h>
#include <stdlib.h>

size_t most_popular(int input[], size_t input_size, int output[], size_t output_size);

int main(void)
{
    int arr[] = {2, 3, 5, 6, 10, 10, 2, 5, 2};
    size_t len = sizeof(arr)/sizeof(int);

    int out[3];
    size_t outlen = sizeof(out)/sizeof(int);

    size_t count = most_popular(arr, len, out, outlen);

    for (size_t ii = 0; ii < count; ii++) {
        printf("most popular rank %lu: %d\n", ii+1, out[ii]);
    }

    return 0;
}

typedef struct
{
    int value;
    int count;
} value_count;

int value_count_greater(const void* lhs, const void* rhs)
{
    const value_count *vcl = lhs, *vcr = rhs;
    return vcr->count - vcl->count;
}

int int_less(const void *lhs, const void *rhs)
{
    const int *il = lhs, *ir = rhs;
    return *il - *ir;
}

// returns 0 if out of memory or input_size is 0, else returns valid portion of output                                                                                    
size_t most_popular(int input[], size_t input_size, int output[], size_t output_size)
{
    qsort(input, input_size, sizeof(input[0]), int_less);

    value_count* value_counts = malloc(input_size * sizeof(value_count));
    if (value_counts == NULL) {
        return 0;
    }

    // count how many times each value occurs in input                                                                                                                    
    size_t unique_count = 0;
    for (size_t ii = 0; ii < input_size; ii++) {
        if (ii == 0 || input[ii] != value_counts[unique_count-1].value) {
            value_counts[unique_count].value = input[ii];
            value_counts[unique_count].count = 1;
            unique_count++;
        } else {
            value_counts[unique_count-1].count++;
        }
    }

    // sort unique values by how often they occur, most popular first                                                                                                     
    qsort(value_counts, unique_count, sizeof(value_counts[0]), value_count_greater);

    size_t result_size = unique_count < output_size ? unique_count : output_size;
    for (size_t ii = 0; ii < result_size; ii++) {
        output[ii] = value_counts[ii].value;
    }

    free(value_counts);
    return result_size;
}

Answer 2

我不确定是否会为此任务使用 hcreate()、hsearch()、hdestroy() 三元组函数，但可以使用。 POSIX规范在某些问题上没有明确说明，例如htdestroy()释放密钥，但是Mac OS X手册说：

The hdestroy() function disposes of the search table, and may be followed by another call to hcreate(). After the call to hdestroy(), the data can no longer be considered accessible. The hdestroy() function calls free(3) for each comparison key in the search table but not the data item associated with the key.

（POSIX 没有提到 hdestroy() 在比较键上调用 free()。）

这是对您的代码的相对简单的改编，可以在 valgrind 下正常运行，至少在 Mac OS 上使用 GCC 6.1.0 和 Valgrind 3.12.0-SVN 是这样X 10.11.4.

$ gcc -O3 -g -std=c11 -Wall -Wextra -Wmissing-prototypes \
>     -Wstrict-prototypes -Wold-style-definition -Werror hs17.c -o hs17
$

代码

#include <search.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUFFERSIZE 10

void freqElement(int *arr, int len, int times);
int createHT(int *arr, int len);

int main(void)
{
    int arr[] = { 2, 3, 5, 6, 10, 10, 2, 5, 2, 8, 8, 7, 8, 7, 8, 7, };
    int len = sizeof(arr) / sizeof(int);

    if (hcreate(len) == 0)
        fprintf(stderr, "Failed to create hash table of size %d\n", len);
    else
    {
        if (!createHT(arr, len))
            fprintf(stderr, "error in entering data\n");
        else
        {
            for (int i = 1; i < len; i++)
                freqElement(arr, len, i);
        }

        hdestroy();
    }
    return 0;
}

int createHT(int *arr, int len)
{
    ENTRY e, *ep;

    for (int i = 0; i < len; i++)
    {
        char buffer[BUFFERSIZE];
        snprintf(buffer, sizeof(buffer), "%d", arr[i]);
        e.key = strdup(buffer);
        e.data = (void *)0;
        printf("Processing [%s]\n", e.key);

        ep = hsearch(e, ENTER);
        if (ep)
        {
            ep->data = (void *)((intptr_t)ep->data + 1);
            if (ep->key != e.key)
                free(e.key);
        }
        else
        {
            fprintf(stderr, "entry failed for [%s]\n", e.key);
            free(e.key);    // Not dreadfully important
            exit(EXIT_FAILURE);
        }
    }
    return 1;
}

// Check whether this number has been processed before
static bool processed_before(int *arr, int len, int value)
{
    for (int j = 0; j < len; j++)
    {
        if (value == arr[j])
            return true;
    }
    return false;
}

void freqElement(int *arr, int len, int times)
{
    ENTRY *ep, e;

    for (int i = 0; i < len; i++)
    {
        char buffer[BUFFERSIZE];
        snprintf(buffer, BUFFERSIZE, "%d", arr[i]);
        e.key = buffer;
        ep = hsearch(e, FIND);
        if (ep)
        {
            if ((intptr_t)ep->data == times && !processed_before(arr, i, arr[i]))
                printf(" value %s is repeated %d times\n", ep->key, times);
        }
    }
}

processed_before() 函数可防止多次打印具有多个条目的值 — 这是 freqElement() 函数更改的结果，该函数报告具有给定出现次数的所有条目，而不是不仅仅是第一个这样的条目。这不是完全可取的，但代码包含一些打印，以便可以监控进度，这有助于确保代码正常工作。

示例输出

Processing [2]
Processing [3]
Processing [5]
Processing [6]
Processing [10]
Processing [10]
Processing [2]
Processing [5]
Processing [2]
Processing [8]
Processing [8]
Processing [7]
Processing [8]
Processing [7]
Processing [8]
Processing [7]
 value 3 is repeated 1 times 
 value 6 is repeated 1 times 
 value 5 is repeated 2 times 
 value 10 is repeated 2 times 
 value 2 is repeated 3 times 
 value 7 is repeated 3 times 
 value 8 is repeated 4 times

在 C 中查找数组中的 n 个重复数字

Find n repeating numbers in an array in C

c

arrays

algorithm

hashtable

代码

示例输出