多线程算法在数组中查找最大数的性能
Performance of multithreaded algorithm to find max number in array
我正在尝试学习多线程算法,所以我实现了一个简单的数组查找最大数函数。
我制作了一个基线程序 (findMax1.c),它从一个文件中加载大约 2.63 亿个 int 数字到内存中。
然后我简单地使用 for 循环来查找最大数量。然后我制作了另一个使用 4 个线程的程序 (findMax2.c)。
我选择了 4 个线程,因为我使用的 CPU (intel i5 4460) 有 4 个内核,每个内核有 1 个线程。所以我的猜测是
如果我为每个核心分配一个数组块来处理它会更有效率,因为那样我会有更少的缓存
错过。现在,每个线程从每个块中找到最大数量,然后我加入所有线程以最终找到最大数量
来自所有这些块。基准程序 findMax1.c 完成任务大约需要 660ms,所以我最初的想法是
findMax2.c(使用 4 个线程)大约需要 165 毫秒(660 毫秒/4)才能完成,因为现在我有 4 个线程 运行
所有并行执行相同的任务,但 findMax2.c 大约需要 610 毫秒。只比 findMax1.c 少了 50ms。
我错过了什么?线程程序的实现有问题吗?
findMax1.c
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
int main(void)
{
int i, *array, max = 0, position;
size_t array_size_in_bytes = 1024*1024*1024, elements_read, array_size;
FILE *f;
clock_t t;
double time;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is falsa
printf("Loading array...");
t = clock();
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
assert(elements_read == 1);
printf("done!\n");
printf("File load time: %f [s]\n", time);
fclose(f);
array_size = array_size_in_bytes / sizeof(int);
printf("Finding max...");
t = clock();
for(i = 0; i < array_size; i++)
if(array[i] > max)
{
max = array[i];
position = i;
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
return 0;
}
findMax2.c:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, elements_read;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is false
printf("Loading array...");
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
assert(elements_read == 1);
printf("done!\n");
fclose(f);
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
首先,你测错了时间。
clock()
测量进程 CPU 时间,即所有线程使用的时间。实际经过的时间将是其中的一小部分。 clock_gettime(CLOCK_MONOTONIC,...)
应该会产生更好的测量结果。
其次,您的核心循环根本没有可比性。
在多线程程序中,您在每次循环迭代中写入彼此非常接近的全局变量,这对于缓存争用来说是可怕的。
您可以 space 将全局内存分开(使每个数组项成为缓存对齐的结构(_Alignas(64)
)),这将有助于节省时间,但更好和更公平的方法是使用局部变量(这应该进入寄存器),复制第一个循环的方法,然后在循环结束时将块结果写入内存:
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
这是您修改后的测试程序,具有预期的加速(我在我的双核处理器上获得了大约 2 倍的加速)。
(我还冒昧地将文件加载替换为内存中初始化,以使其更易于测试。)
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#include <stdint.h>
struct timespec ts0,ts1;
uint64_t sc_timespec_diff(struct timespec Ts1, struct timespec Ts0) { return (Ts1.tv_sec - Ts0.tv_sec)*1000000000+(Ts1.tv_nsec - Ts0.tv_nsec); }
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
#if 1 //shouldn't make much difference
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
#endif
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
#if 0 //horrible for caches
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
#else
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
#endif
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, array_size=array_size_in_bytes/sizeof(int);
array = (int*) malloc(array_size_in_bytes);
if(array == NULL) abort(); // assert if condition is false
for(size_t i=0; i<array_size; i++) array[i]=i;
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
clock_gettime(CLOCK_MONOTONIC,&ts0);
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
clock_gettime(CLOCK_MONOTONIC,&ts1);
printf("Time2 %.6LF\n", sc_timespec_diff(ts1,ts0)/1E9L);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
我的时间:
- 0.188917 用于单线程版本
- 2.511590 用于原始多线程版本(用 clock_gettime(CLOCK_MONOTONIC,...)
测量
- 0.099802 带有修改后的螺纹版本(用 clock_gettime(CLOCK_MONOTONIC,...)
测量
运行 在配备 Intel(R) Core(TM) i7-2620M CPU @ 2.70GHz 的 Linux 机器上。
我正在尝试学习多线程算法,所以我实现了一个简单的数组查找最大数函数。 我制作了一个基线程序 (findMax1.c),它从一个文件中加载大约 2.63 亿个 int 数字到内存中。 然后我简单地使用 for 循环来查找最大数量。然后我制作了另一个使用 4 个线程的程序 (findMax2.c)。 我选择了 4 个线程,因为我使用的 CPU (intel i5 4460) 有 4 个内核,每个内核有 1 个线程。所以我的猜测是 如果我为每个核心分配一个数组块来处理它会更有效率,因为那样我会有更少的缓存 错过。现在,每个线程从每个块中找到最大数量,然后我加入所有线程以最终找到最大数量 来自所有这些块。基准程序 findMax1.c 完成任务大约需要 660ms,所以我最初的想法是 findMax2.c(使用 4 个线程)大约需要 165 毫秒(660 毫秒/4)才能完成,因为现在我有 4 个线程 运行 所有并行执行相同的任务,但 findMax2.c 大约需要 610 毫秒。只比 findMax1.c 少了 50ms。 我错过了什么?线程程序的实现有问题吗?
findMax1.c
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
int main(void)
{
int i, *array, max = 0, position;
size_t array_size_in_bytes = 1024*1024*1024, elements_read, array_size;
FILE *f;
clock_t t;
double time;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is falsa
printf("Loading array...");
t = clock();
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
assert(elements_read == 1);
printf("done!\n");
printf("File load time: %f [s]\n", time);
fclose(f);
array_size = array_size_in_bytes / sizeof(int);
printf("Finding max...");
t = clock();
for(i = 0; i < array_size; i++)
if(array[i] > max)
{
max = array[i];
position = i;
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
return 0;
}
findMax2.c:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, elements_read;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is false
printf("Loading array...");
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
assert(elements_read == 1);
printf("done!\n");
fclose(f);
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
首先,你测错了时间。
clock()
测量进程 CPU 时间,即所有线程使用的时间。实际经过的时间将是其中的一小部分。 clock_gettime(CLOCK_MONOTONIC,...)
应该会产生更好的测量结果。
其次,您的核心循环根本没有可比性。
在多线程程序中,您在每次循环迭代中写入彼此非常接近的全局变量,这对于缓存争用来说是可怕的。
您可以 space 将全局内存分开(使每个数组项成为缓存对齐的结构(_Alignas(64)
)),这将有助于节省时间,但更好和更公平的方法是使用局部变量(这应该进入寄存器),复制第一个循环的方法,然后在循环结束时将块结果写入内存:
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
这是您修改后的测试程序,具有预期的加速(我在我的双核处理器上获得了大约 2 倍的加速)。 (我还冒昧地将文件加载替换为内存中初始化,以使其更易于测试。)
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#include <stdint.h>
struct timespec ts0,ts1;
uint64_t sc_timespec_diff(struct timespec Ts1, struct timespec Ts0) { return (Ts1.tv_sec - Ts0.tv_sec)*1000000000+(Ts1.tv_nsec - Ts0.tv_nsec); }
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
#if 1 //shouldn't make much difference
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
#endif
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
#if 0 //horrible for caches
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
#else
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
#endif
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, array_size=array_size_in_bytes/sizeof(int);
array = (int*) malloc(array_size_in_bytes);
if(array == NULL) abort(); // assert if condition is false
for(size_t i=0; i<array_size; i++) array[i]=i;
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
clock_gettime(CLOCK_MONOTONIC,&ts0);
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
clock_gettime(CLOCK_MONOTONIC,&ts1);
printf("Time2 %.6LF\n", sc_timespec_diff(ts1,ts0)/1E9L);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
我的时间:
- 0.188917 用于单线程版本
- 2.511590 用于原始多线程版本(用 clock_gettime(CLOCK_MONOTONIC,...) 测量
- 0.099802 带有修改后的螺纹版本(用 clock_gettime(CLOCK_MONOTONIC,...) 测量
运行 在配备 Intel(R) Core(TM) i7-2620M CPU @ 2.70GHz 的 Linux 机器上。