使用 2 个线程对 2 个数组进行排序比将 2 个数组一一排序需要更多时间

Sorting 2 arrays using 2 threads takes more time than sorting the 2 arrays one by one

我有 2 个未排序的数组和这些数组的 2 个副本。我正在使用两个不同的线程对两个数组进行排序,然后我将另外两个未排序的数组一一排序。本以为线程处理会更快,其实不然,线程怎么会占用更多时间呢?

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>

struct thread_data
{
    int count;
    unsigned int *arr;
};

struct thread_data thread_data_array[2];

void insertionSort(unsigned int arr[], int n)
{
   int i, key, j;
   for (i = 1; i < n; i++)
   {
       key = arr[i];
       j = i-1;

       while (j >= 0 && arr[j] > key)
       {
           arr[j+1] = arr[j];
           j = j-1;
       }
       arr[j+1] = key;
   }
}

void *sortAndMergeArrays(void *threadarg)
{
    int count;
    unsigned int *arr;
    struct thread_data *my_data;

    my_data = (struct thread_data *) threadarg;
    count = my_data->count;
    arr =  my_data->arr;

    insertionSort(arr, count);

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    int count, i, rc;
    clock_t start, end, total_t;
    pthread_t threads[2];

    //get the loop count. If loop count is not provided take 10000 as default loop count.
    if(argc == 2){
        count = atoi(argv[1]);
    }
    else{
        count = 10000;
    }

    unsigned int arr1[count], arr2[count], copyArr1[count], copyArr2[count];    

    srand(time(0));

    for(i = 0; i<count; i++){
        arr1[i] = rand();
        arr2[i] = rand();

        copyArr1[i] = arr1[i];
        copyArr2[i] = arr2[i];
    }

    start = clock();
    for(int t=0; t<2; t++) {
        thread_data_array[t].count = count;
        if(t==0)
            thread_data_array[t].arr = arr1;
        else
            thread_data_array[t].arr = arr2;    

        rc = pthread_create(&threads[t], NULL, sortAndMergeArrays, (void *) &thread_data_array[t]);
        if (rc) {
                printf("ERROR; return code from pthread_create() is %d\n", rc);
                exit(-1);
            }
    }

    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    end = clock();

    total_t = (double)(end - start);
    printf("Total time taken by CPU to sort using threads: %d\n", total_t);


    start = clock();
    insertionSort(copyArr1, count);
    insertionSort(copyArr2, count);
    end = clock();

    total_t = (double)(end - start);
    printf("Total time taken by CPU to sort sequentially: %d\n", total_t);

    pthread_exit(NULL);
}

我正在使用 Linux 服务器来执行代码。首先,我随机填充数组并将它们复制到两个单独的数组中。对于前两个数组,我使用 pthread 创建两个线程并将两个数组传递给它们,它使用插入排序对它们进行排序。对于其他两个数组,我只是一个一个地排序。

我预计通过使用线程我会减少执行时间,但实际上需要更多时间。

诊断

你得到几乎相同时间的原因——线程代码的时间比顺序代码的时间略多——是 clock() 测量 CPU 时间,并且两种排序方式需要CPU 时间几乎相同,因为他们在做同样的工作(并且由于设置和拆除线程的时间,线程数可能略大)。

The clock() function shall return the implementation's best approximation to the processor time used by the process since the beginning of an implementation-defined era related only to the process invocation.

BSD (macOS) 手册页:

The clock() function determines the amount of processor time used since the invocation of the calling process, measured in CLOCKS_PER_SECs of a second.

两个数组排序所花费的CPU时间量基本相同;不同之处在于线程的开销(或多或少)。

修改后的代码

我有一组函数可以使用clock_gettime() instead (code in timer.c and timer.h at GitHub)。这些测量挂钟时间 — 经过的时间,而不是 CPU 时间。

这是你的代码的一个稍微调整的版本——实质性的变化是将排序函数中 key 的类型从 int 更改为 unsigned int 以匹配数组中的数据, 并将 %d 的转换规范固定为 %ld 以匹配 GCC 识别为 clock_t 的类型。我稍微调整了参数处理和计时消息,使它们的长度保持一致,并添加了经过时间测量代码:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include "timer.h"

struct thread_data
{
    int count;
    unsigned int *arr;
};

struct thread_data thread_data_array[2];

static
void insertionSort(unsigned int arr[], int n)
{
    for (int i = 1; i < n; i++)
    {
        unsigned int key = arr[i];
        int j = i - 1;

        while (j >= 0 && arr[j] > key)
        {
            arr[j + 1] = arr[j];
            j = j - 1;
        }
        arr[j + 1] = key;
    }
}

static
void *sortAndMergeArrays(void *threadarg)
{
    int count;
    unsigned int *arr;
    struct thread_data *my_data;

    my_data = (struct thread_data *)threadarg;
    count = my_data->count;
    arr =  my_data->arr;

    insertionSort(arr, count);

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    int count = 10000;
    int i, rc;
    clock_t start, end, total_t;
    pthread_t threads[2];

    // get the loop count. If loop count is not provided take 10000 as default loop count.
    if (argc == 2)
        count = atoi(argv[1]);

    unsigned int arr1[count], arr2[count], copyArr1[count], copyArr2[count];

    srand(time(0));

    for (i = 0; i < count; i++)
    {
        arr1[i] = rand();
        arr2[i] = rand();

        copyArr1[i] = arr1[i];
        copyArr2[i] = arr2[i];
    }

    Clock clk;
    clk_init(&clk);

    start = clock();
    clk_start(&clk);
    for (int t = 0; t < 2; t++)
    {
        thread_data_array[t].count = count;
        if (t == 0)
            thread_data_array[t].arr = arr1;
        else
            thread_data_array[t].arr = arr2;

        rc = pthread_create(&threads[t], NULL, sortAndMergeArrays, (void *)&thread_data_array[t]);
        if (rc)
        {
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }

    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    clk_stop(&clk);
    end = clock();

    char buffer[32];
    printf("Elapsed using threads:  %s s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));

    total_t = (double)(end - start);
    printf("CPU time using threads: %ld\n", total_t);

    start = clock();
    clk_start(&clk);
    insertionSort(copyArr1, count);
    insertionSort(copyArr2, count);
    clk_stop(&clk);
    end = clock();

    printf("Elapsed sequentially:   %s s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));
    total_t = (double)(end - start);
    printf("CPU time sequentially:  %ld\n", total_t);

    return 0;
}

结果

示例 运行s(程序 inssortthread23)— 运行 在配备 16 GiB RAM 和 2.7 GHz Intel Core i7 的 MacBook Pro(15" 2016)上 CPU , 运行ning macOS High Sierra 10.13,编译使用GCC 7.2.0。 我有常规的后台程序 运行ning——例如浏览器未被积极使用,没有音乐或视频播放,没有正在进行的下载等。(这些事情对基准测试很重要。)

$ inssortthread23 100000
Elapsed using threads:  1.060299 s
CPU time using threads: 2099441
Elapsed sequentially:   2.146059 s
CPU time sequentially:  2138465
$ inssortthread23 200000
Elapsed using threads:  4.332935 s
CPU time using threads: 8616953
Elapsed sequentially:   8.496348 s
CPU time sequentially:  8469327
$ inssortthread23 300000
Elapsed using threads:  9.984021 s
CPU time using threads: 19880539
Elapsed sequentially:   20.000900 s
CPU time sequentially:  19959341
$

结论

在这里,你可以清楚地看到:

  1. 非线程代码的运行时间大约是线程代码的两倍。
  2. 线程和非线程代码的CPU时间几乎相同。
  3. 总时间是排序行数的二次方。

所有这些都非常符合预期 — 一旦您意识到 clock() 测量的是 CPU 时间,而不是经过的时间。

小谜题

您还可以看到,我得到的线程化 CPU 时间略小于顺序排序的 CPU 时间,有时是这样。我对此没有任何解释——我认为它 'lost in the noise',尽管效果是持久的:

$ inssortthread23 100000
Elapsed using threads:  1.051229 s
CPU time using threads: 2081847
Elapsed sequentially:   2.138538 s
CPU time sequentially:  2132083
$ inssortthread23 100000
Elapsed using threads:  1.053656 s
CPU time using threads: 2089886
Elapsed sequentially:   2.128908 s
CPU time sequentially:  2122983
$ inssortthread23 100000
Elapsed using threads:  1.058283 s
CPU time using threads: 2093644
Elapsed sequentially:   2.126402 s
CPU time sequentially:  2120625
$

$ inssortthread23 200000
Elapsed using threads:  4.259660 s
CPU time using threads: 8479978
Elapsed sequentially:   8.872929 s
CPU time sequentially:  8843207
$ inssortthread23 200000
Elapsed using threads:  4.463954 s
CPU time using threads: 8883267
Elapsed sequentially:   8.603401 s
CPU time sequentially:  8580240
$ inssortthread23 200000
Elapsed using threads:  4.227154 s
CPU time using threads: 8411582
Elapsed sequentially:   8.816412 s
CPU time sequentially:  8797965
$