使用 2 个线程对 2 个数组进行排序比将 2 个数组一一排序需要更多时间

Question

我有 2 个未排序的数组和这些数组的 2 个副本。我正在使用两个不同的线程对两个数组进行排序，然后我将另外两个未排序的数组一一排序。本以为线程处理会更快，其实不然，线程怎么会占用更多时间呢？

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>

struct thread_data
{
    int count;
    unsigned int *arr;
};

struct thread_data thread_data_array[2];

void insertionSort(unsigned int arr[], int n)
{
   int i, key, j;
   for (i = 1; i < n; i++)
   {
       key = arr[i];
       j = i-1;

       while (j >= 0 && arr[j] > key)
       {
           arr[j+1] = arr[j];
           j = j-1;
       }
       arr[j+1] = key;
   }
}

void *sortAndMergeArrays(void *threadarg)
{
    int count;
    unsigned int *arr;
    struct thread_data *my_data;

    my_data = (struct thread_data *) threadarg;
    count = my_data->count;
    arr =  my_data->arr;

    insertionSort(arr, count);

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    int count, i, rc;
    clock_t start, end, total_t;
    pthread_t threads[2];

    //get the loop count. If loop count is not provided take 10000 as default loop count.
    if(argc == 2){
        count = atoi(argv[1]);
    }
    else{
        count = 10000;
    }

    unsigned int arr1[count], arr2[count], copyArr1[count], copyArr2[count];    

    srand(time(0));

    for(i = 0; i<count; i++){
        arr1[i] = rand();
        arr2[i] = rand();

        copyArr1[i] = arr1[i];
        copyArr2[i] = arr2[i];
    }

    start = clock();
    for(int t=0; t<2; t++) {
        thread_data_array[t].count = count;
        if(t==0)
            thread_data_array[t].arr = arr1;
        else
            thread_data_array[t].arr = arr2;    

        rc = pthread_create(&threads[t], NULL, sortAndMergeArrays, (void *) &thread_data_array[t]);
        if (rc) {
                printf("ERROR; return code from pthread_create() is %d\n", rc);
                exit(-1);
            }
    }

    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    end = clock();

    total_t = (double)(end - start);
    printf("Total time taken by CPU to sort using threads: %d\n", total_t);


    start = clock();
    insertionSort(copyArr1, count);
    insertionSort(copyArr2, count);
    end = clock();

    total_t = (double)(end - start);
    printf("Total time taken by CPU to sort sequentially: %d\n", total_t);

    pthread_exit(NULL);
}

我正在使用 Linux 服务器来执行代码。首先，我随机填充数组并将它们复制到两个单独的数组中。对于前两个数组，我使用 pthread 创建两个线程并将两个数组传递给它们，它使用插入排序对它们进行排序。对于其他两个数组，我只是一个一个地排序。

我预计通过使用线程我会减少执行时间，但实际上需要更多时间。

Answer 1

诊断

你得到几乎相同时间的原因——线程代码的时间比顺序代码的时间略多——是 clock() 测量 CPU 时间，并且两种排序方式需要CPU 时间几乎相同，因为他们在做同样的工作（并且由于设置和拆除线程的时间，线程数可能略大）。

The clock() function shall return the implementation's best approximation to the processor time used by the process since the beginning of an implementation-defined era related only to the process invocation.

BSD (macOS) 手册页：

The clock() function determines the amount of processor time used since the invocation of the calling process, measured in CLOCKS_PER_SECs of a second.

两个数组排序所花费的CPU时间量基本相同；不同之处在于线程的开销（或多或少）。

修改后的代码

我有一组函数可以使用clock_gettime() instead (code in timer.c and timer.h at GitHub)。这些测量挂钟时间 — 经过的时间，而不是 CPU 时间。

这是你的代码的一个稍微调整的版本——实质性的变化是将排序函数中 key 的类型从 int 更改为 unsigned int 以匹配数组中的数据, 并将 %d 的转换规范固定为 %ld 以匹配 GCC 识别为 clock_t 的类型。我稍微调整了参数处理和计时消息，使它们的长度保持一致，并添加了经过时间测量代码：

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include "timer.h"

struct thread_data
{
    int count;
    unsigned int *arr;
};

struct thread_data thread_data_array[2];

static
void insertionSort(unsigned int arr[], int n)
{
    for (int i = 1; i < n; i++)
    {
        unsigned int key = arr[i];
        int j = i - 1;

        while (j >= 0 && arr[j] > key)
        {
            arr[j + 1] = arr[j];
            j = j - 1;
        }
        arr[j + 1] = key;
    }
}

static
void *sortAndMergeArrays(void *threadarg)
{
    int count;
    unsigned int *arr;
    struct thread_data *my_data;

    my_data = (struct thread_data *)threadarg;
    count = my_data->count;
    arr =  my_data->arr;

    insertionSort(arr, count);

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    int count = 10000;
    int i, rc;
    clock_t start, end, total_t;
    pthread_t threads[2];

    // get the loop count. If loop count is not provided take 10000 as default loop count.
    if (argc == 2)
        count = atoi(argv[1]);

    unsigned int arr1[count], arr2[count], copyArr1[count], copyArr2[count];

    srand(time(0));

    for (i = 0; i < count; i++)
    {
        arr1[i] = rand();
        arr2[i] = rand();

        copyArr1[i] = arr1[i];
        copyArr2[i] = arr2[i];
    }

    Clock clk;
    clk_init(&clk);

    start = clock();
    clk_start(&clk);
    for (int t = 0; t < 2; t++)
    {
        thread_data_array[t].count = count;
        if (t == 0)
            thread_data_array[t].arr = arr1;
        else
            thread_data_array[t].arr = arr2;

        rc = pthread_create(&threads[t], NULL, sortAndMergeArrays, (void *)&thread_data_array[t]);
        if (rc)
        {
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }

    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    clk_stop(&clk);
    end = clock();

    char buffer[32];
    printf("Elapsed using threads:  %s s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));

    total_t = (double)(end - start);
    printf("CPU time using threads: %ld\n", total_t);

    start = clock();
    clk_start(&clk);
    insertionSort(copyArr1, count);
    insertionSort(copyArr2, count);
    clk_stop(&clk);
    end = clock();

    printf("Elapsed sequentially:   %s s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));
    total_t = (double)(end - start);
    printf("CPU time sequentially:  %ld\n", total_t);

    return 0;
}

结果

示例运行s（程序 inssortthread23）— 运行在配备 16 GiB RAM 和 2.7 GHz Intel Core i7 的 MacBook Pro（15" 2016）上 CPU , 运行ning macOS High Sierra 10.13，编译使用GCC 7.2.0。我有常规的后台程序运行ning——例如浏览器未被积极使用，没有音乐或视频播放，没有正在进行的下载等。（这些事情对基准测试很重要。）

$ inssortthread23 100000
Elapsed using threads:  1.060299 s
CPU time using threads: 2099441
Elapsed sequentially:   2.146059 s
CPU time sequentially:  2138465
$ inssortthread23 200000
Elapsed using threads:  4.332935 s
CPU time using threads: 8616953
Elapsed sequentially:   8.496348 s
CPU time sequentially:  8469327
$ inssortthread23 300000
Elapsed using threads:  9.984021 s
CPU time using threads: 19880539
Elapsed sequentially:   20.000900 s
CPU time sequentially:  19959341
$

结论

在这里，你可以清楚地看到：

非线程代码的运行时间大约是线程代码的两倍。
线程和非线程代码的CPU时间几乎相同。
总时间是排序行数的二次方。

所有这些都非常符合预期 — 一旦您意识到 clock() 测量的是 CPU 时间，而不是经过的时间。

小谜题

您还可以看到，我得到的线程化 CPU 时间略小于顺序排序的 CPU 时间，有时是这样。我对此没有任何解释——我认为它 'lost in the noise'，尽管效果是持久的：

$ inssortthread23 100000
Elapsed using threads:  1.051229 s
CPU time using threads: 2081847
Elapsed sequentially:   2.138538 s
CPU time sequentially:  2132083
$ inssortthread23 100000
Elapsed using threads:  1.053656 s
CPU time using threads: 2089886
Elapsed sequentially:   2.128908 s
CPU time sequentially:  2122983
$ inssortthread23 100000
Elapsed using threads:  1.058283 s
CPU time using threads: 2093644
Elapsed sequentially:   2.126402 s
CPU time sequentially:  2120625
$

$ inssortthread23 200000
Elapsed using threads:  4.259660 s
CPU time using threads: 8479978
Elapsed sequentially:   8.872929 s
CPU time sequentially:  8843207
$ inssortthread23 200000
Elapsed using threads:  4.463954 s
CPU time using threads: 8883267
Elapsed sequentially:   8.603401 s
CPU time sequentially:  8580240
$ inssortthread23 200000
Elapsed using threads:  4.227154 s
CPU time using threads: 8411582
Elapsed sequentially:   8.816412 s
CPU time sequentially:  8797965
$

使用 2 个线程对 2 个数组进行排序比将 2 个数组一一排序需要更多时间

Sorting 2 arrays using 2 threads takes more time than sorting the 2 arrays one by one

c

linux

sorting

pthreads

pthread-join

诊断

修改后的代码

结果

结论

小谜题