为什么我的多线程程序比单线程运行得慢?
Why my multi-thread program runs slower than single thread?
我是线程的新手,我的老师要求编写一个使用 9 个线程的程序:有 3 个数组 A、B、C,各含 10^6 个元素,要把每个元素都设为 1,并且要用 9 个线程来更快地完成填充。我计划每个数组用 3 个线程来填充,因此把数组分成 3 段:i=0 到 i=333332、i=333333 到 i=666665、i=666666 到 i=999999。但是当我运行这个九线程程序时,它比单线程程序还慢(单线程程序只包含一个从 0 到 10^6 的 for 循环,用数字 1 填充 3 个数组)。
这是我的 9 线程代码:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>
// Arrays filled by the worker threads (fillA1 .. fillC3); 10^6 doubles each.
double A[1000000];
double B[1000000];
double C[1000000];
double D[1000000]; // declared but not used by the code shown
/* Thread entry point: stores 1.0 into A[0] .. A[333332].
 * The argument is ignored; always returns NULL. */
void * fillA1(void * tid){
    double *cur = &A[0];
    double *const stop = &A[333333];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into A[333333] .. A[666665].
 * The argument is ignored; always returns NULL. */
void * fillA2(void * tid){
    double *cur = &A[333333];
    double *const stop = &A[666666];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into A[666666] .. A[999999].
 * The argument is ignored; always returns NULL. */
void * fillA3(void * tid){
    double *cur = &A[666666];
    double *const stop = &A[1000000];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into B[0] .. B[333332].
 * The argument is ignored; always returns NULL. */
void * fillB1(void*tid){
    double *cur = &B[0];
    double *const stop = &B[333333];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into B[333333] .. B[666665].
 * The argument is ignored; always returns NULL. */
void * fillB2(void*tid){
    double *cur = &B[333333];
    double *const stop = &B[666666];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into B[666666] .. B[999999].
 * The argument is ignored; always returns NULL. */
void * fillB3 (void*tid){
    double *cur = &B[666666];
    double *const stop = &B[1000000];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into C[0] .. C[333332].
 * The argument is ignored; always returns NULL. */
void * fillC1(void*tid){
    double *cur = &C[0];
    double *const stop = &C[333333];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into C[333333] .. C[666665].
 * The argument is ignored; always returns NULL. */
void * fillC2(void*tid){
    double *cur = &C[333333];
    double *const stop = &C[666666];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into C[666666] .. C[999999].
 * The argument is ignored; always returns NULL. */
void * fillC3(void*tid){
    double *cur = &C[666666];
    double *const stop = &C[1000000];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/*
 * Fills A, B and C with nine worker threads (three per array) and prints
 * the elapsed wall-clock time.
 *
 * Timing uses clock_gettime(CLOCK_MONOTONIC) instead of clock(): clock()
 * reports the processor time consumed by the whole process, i.e. the sum
 * over all of its threads, so it cannot show a speed-up from running the
 * work in parallel.
 *
 * Returns 0 on success, EXIT_FAILURE if a thread could not be created.
 */
int main (void){
    // one entry point per (array, third) pair
    void *(*const workers[])(void *) = {
        fillA1, fillA2, fillA3,
        fillB1, fillB2, fillB3,
        fillC1, fillC2, fillC3,
    };
    enum { NWORKERS = sizeof workers / sizeof workers[0] };
    pthread_t tids[NWORKERS]; // pthread_t is a data type used to uniquely identify a thread.
    struct timespec begin;
    struct timespec end;
    int started = 0;
    int err = 0;

    clock_gettime(CLOCK_MONOTONIC, &begin);

    // use 9 threads to fill the 3 arrays simultaneously
    for (; started < NWORKERS; started++) {
        err = pthread_create(&tids[started], NULL, workers[started], NULL);
        if (err != 0) {
            break;
        }
    }

    // join only the threads that were really created: joining a
    // pthread_t that pthread_create never filled in is undefined
    for (int i = 0; i < started; i++) {
        pthread_join(tids[i], NULL);
    }

    clock_gettime(CLOCK_MONOTONIC, &end);

    if (err != 0) {
        fprintf(stderr, "pthread_create failed for thread %d (error %d)\n",
                started, err);
        return EXIT_FAILURE;
    }

    // seconds plus nanoseconds, converted to seconds
    double time_spent = (double)(end.tv_sec - begin.tv_sec)
                      + (double)(end.tv_nsec - begin.tv_nsec) / 1e9;
    printf("The elapsed time is %f seconds\n", time_spent);
    return 0;
}
这是我的单线程代码:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include<time.h>
// Arrays filled by the loop in main; 10^6 doubles each.
double A[1000000];
double B[1000000];
double C[1000000];
double D[1000000]; // declared but not used by the code shown
/*
 * Single-threaded baseline: sets every element of A, B and C to 1 in one
 * loop and prints the time measured with clock() around that loop.
 */
int main (void){
    const clock_t begin = clock();

    int i = 0;
    while (i < 1000000) {
        A[i] = 1;
        B[i] = 1;
        C[i] = 1;
        ++i;
    }

    const clock_t end = clock();

    // clock ticks -> seconds
    const double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("The elapsed time is %f seconds\n", time_spent);
    return 0;
}
每个数组中的每个段都有一个单独的线程函数,使程序不太灵活。通常的方法是有一个结构来描述每个线程的工作(例如,指向数组的指针和 start/end 索引)。
此外,clock_gettime 可能比 clock 更适合计时:clock 返回的是整个进程消耗的 CPU 时间(所有线程的 CPU 时间之和),而不是实际经过的时间,所以用它给多线程程序计时,结果不会变快,反而可能更慢。并且,不要将 printf 放入任何正在计时的内容中。
请注意,使用 3 个数组(例如 A/B/C
)可能与大小为 3 倍的单个数组没有任何不同(但我将其保留在其中)。
这是一个重构版本,可让您进行更多实验:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>
#define DATA_SIZE 1000000 // number of elements in array
#define NPER 3 // number of threads per array
#define NARRAY 3 // number of arrays
// integer division: main gives the remainder to the last thread of each array
#define CHUNK_SIZE (DATA_SIZE / NPER) // amount of data for each thread
#define NTHREAD (NARRAY * NPER) // number of threads
// arrays filled by the worker threads
double A[DATA_SIZE];
double B[DATA_SIZE];
double C[DATA_SIZE];
double D[DATA_SIZE]; // declared but not used by the code shown
// Describes the slice of an array that one thread fills.
// main also uses it as a per-array cursor (buflist), where start is the
// index of the next element not yet assigned to a thread.
struct work {
pthread_t tid; // thread ID, written by pthread_create
double *data; // base of the array to fill
int start; // index of the first element to fill
int count; // number of elements to fill
};
// tscgetf -- read the CLOCK_MONOTONIC clock
// RETURNS: the clock value in seconds, nanoseconds included as the fraction
double
tscgetf(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    return (double) now.tv_nsec / 1e9 + (double) now.tv_sec;
}
void *
fill(void *tid)
{
struct work *work = tid;
double *data = &work->data[work->start];
double *edata = &data[work->count];
for (; data < edata; ++data)
*data = 1.0;
return NULL;
}
// main -- split each of A, B and C into NPER chunks, fill every chunk in its
// own thread, and print the time spent creating and joining the threads
// RETURNS: 0 on success, EXIT_FAILURE if a thread could not be created
int
main(void)
{
    // per-array cursor: .start (zero-initialized) is the index of the next
    // element that has not been handed to a thread yet
    struct work buflist[NARRAY] = {
        { .data = A },
        { .data = B },
        { .data = C },
    };
    struct work threads[NTHREAD];
    int tidx = 0;

    // describe the work of every thread; this (and its printf) happens
    // before the timed region
    for (int ibuf = 0; ibuf < NARRAY; ++ibuf) {
        struct work *buf = &buflist[ibuf];

        for (int ichunk = 0; ichunk < NPER; ++ichunk, ++tidx) {
            struct work *work = &threads[tidx];

            work->data = buf->data;
            work->start = buf->start;

            // the last chunk also takes the remainder of DATA_SIZE / NPER
            if (ichunk == (NPER - 1))
                work->count = DATA_SIZE - buf->start;
            else
                work->count = CHUNK_SIZE;

            printf("%d: ibuf=%d start=%d count=%d (end=%d)\n",
                tidx,ibuf,work->start,work->count,work->start + work->count);

            buf->start += work->count;
        }
    }

    double begin = tscgetf();

    // start the workers, stopping at the first failure
    int nstarted = 0;
    int err = 0;
    for (; nstarted < NTHREAD; ++nstarted) {
        struct work *work = &threads[nstarted];

        err = pthread_create(&work->tid,NULL,fill,work);
        if (err != 0)
            break;
    }

    // join only what was started: joining a tid that pthread_create never
    // filled in is undefined
    for (tidx = 0; tidx < nstarted; ++tidx)
        pthread_join(threads[tidx].tid,NULL);

    double end = tscgetf();

    if (err != 0) {
        fprintf(stderr,"pthread_create failed for thread %d (error %d)\n",
            nstarted,err);
        return EXIT_FAILURE;
    }

    // calculate elapsed time
    double time_spent = end - begin;
    printf("The elapsed time is %.9f seconds\n", time_spent);

    return 0;
}
这是我的系统的输出:
0: ibuf=0 start=0 count=333333 (end=333333)
1: ibuf=0 start=333333 count=333333 (end=666666)
2: ibuf=0 start=666666 count=333334 (end=1000000)
3: ibuf=1 start=0 count=333333 (end=333333)
4: ibuf=1 start=333333 count=333333 (end=666666)
5: ibuf=1 start=666666 count=333334 (end=1000000)
6: ibuf=2 start=0 count=333333 (end=333333)
7: ibuf=2 start=333333 count=333333 (end=666666)
8: ibuf=2 start=666666 count=333334 (end=1000000)
The elapsed time is 0.004271893 seconds
更新:
对于您真正想要测量的内容:线程数对性能的影响,我会使用单个数组,因为线程数不必是数据数组数的倍数。
注意:为了消除系统负载、时间片等因素的影响,我会把每个基准测试(例如某个给定的线程数)重复运行多次(例如 10 次),并对每个线程数取其中最短的运行时间。(您可以使用脚本自动执行此操作)
这是我要使用的代码:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>
// Both values are guarded by #ifndef so they can be supplied from outside
// this file (e.g. on the compiler command line).
#ifndef DATA_SIZE
#define DATA_SIZE 4000000 // number of elements in array
#endif
#ifndef NTHREAD
#define NTHREAD 3 // total number of worker threads (there is only one array)
#endif
// integer division: main gives the remainder to the last thread
#define CHUNK_SIZE (DATA_SIZE / NTHREAD) // amount of data for each thread
double A[DATA_SIZE]; // the array filled by the worker threads
// Describes the slice of an array that one thread fills.
struct work {
pthread_t tid; // thread ID, written by pthread_create
double *data; // base of the array to fill
int start; // index of the first element to fill
int count; // number of elements to fill
};
// tscgetf -- read the CLOCK_MONOTONIC clock
// RETURNS: the clock value in seconds, nanoseconds included as the fraction
double
tscgetf(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    return (double) now.tv_nsec / 1e9 + (double) now.tv_sec;
}
void *
fill(void *tid)
{
struct work *work = tid;
double *data = &work->data[work->start];
double *edata = &data[work->count];
for (; data < edata; ++data)
*data = 1.0;
return NULL;
}
// main -- split A into NTHREAD chunks, fill every chunk in its own thread,
// and print the time spent creating and joining the threads together with
// the resulting fill rate in elements per second
// RETURNS: 0 on success, EXIT_FAILURE if a thread could not be created
int
main(void)
{
    struct work threads[NTHREAD];
    int start = 0;

    // describe the work of every thread; this (and its printf) happens
    // before the timed region
    for (int tidx = 0; tidx < NTHREAD; ++tidx) {
        struct work *work = &threads[tidx];

        work->data = A;
        work->start = start;

        // the last chunk also takes the remainder of DATA_SIZE / NTHREAD
        if (tidx == (NTHREAD - 1))
            work->count = DATA_SIZE - start;
        else
            work->count = CHUNK_SIZE;

        printf("%d: start=%d count=%d (end=%d)\n",
            tidx,work->start,work->count,work->start + work->count);

        start += work->count;
    }

    double begin = tscgetf();

    // start the workers, stopping at the first failure
    int nstarted = 0;
    int err = 0;
    for (; nstarted < NTHREAD; ++nstarted) {
        struct work *work = &threads[nstarted];

        err = pthread_create(&work->tid,NULL,fill,work);
        if (err != 0)
            break;
    }

    // join only what was started: joining a tid that pthread_create never
    // filled in is undefined
    for (int tidx = 0; tidx < nstarted; ++tidx)
        pthread_join(threads[tidx].tid,NULL);

    double end = tscgetf();

    if (err != 0) {
        fprintf(stderr,"pthread_create failed for thread %d (error %d)\n",
            nstarted,err);
        return EXIT_FAILURE;
    }

    // calculate elapsed time
    double time_spent = end - begin;
    printf("The elapsed time is %.9f seconds (RATE: %.3f/sec)\n",
        time_spent,(double) DATA_SIZE / time_spent);

    return 0;
}
这里是各种线程数的输出:
0: start=0 count=4000000 (end=4000000)
The elapsed time is 0.016544766 seconds (RATE: 241768306.067/sec)
0: start=0 count=2000000 (end=2000000)
1: start=2000000 count=2000000 (end=4000000)
The elapsed time is 0.008973760 seconds (RATE: 445744055.862/sec)
0: start=0 count=1333333 (end=1333333)
1: start=1333333 count=1333333 (end=2666666)
2: start=2666666 count=1333334 (end=4000000)
The elapsed time is 0.006181198 seconds (RATE: 647123775.983/sec)
0: start=0 count=1000000 (end=1000000)
1: start=1000000 count=1000000 (end=2000000)
2: start=2000000 count=1000000 (end=3000000)
3: start=3000000 count=1000000 (end=4000000)
The elapsed time is 0.006610514 seconds (RATE: 605096656.791/sec)
0: start=0 count=800000 (end=800000)
1: start=800000 count=800000 (end=1600000)
2: start=1600000 count=800000 (end=2400000)
3: start=2400000 count=800000 (end=3200000)
4: start=3200000 count=800000 (end=4000000)
The elapsed time is 0.005417253 seconds (RATE: 738381626.381/sec)
0: start=0 count=666666 (end=666666)
1: start=666666 count=666666 (end=1333332)
2: start=1333332 count=666666 (end=1999998)
3: start=1999998 count=666666 (end=2666664)
4: start=2666664 count=666666 (end=3333330)
5: start=3333330 count=666670 (end=4000000)
The elapsed time is 0.004728798 seconds (RATE: 845880951.049/sec)
0: start=0 count=571428 (end=571428)
1: start=571428 count=571428 (end=1142856)
2: start=1142856 count=571428 (end=1714284)
3: start=1714284 count=571428 (end=2285712)
4: start=2285712 count=571428 (end=2857140)
5: start=2857140 count=571428 (end=3428568)
6: start=3428568 count=571432 (end=4000000)
The elapsed time is 0.004353798 seconds (RATE: 918738077.652/sec)
0: start=0 count=500000 (end=500000)
1: start=500000 count=500000 (end=1000000)
2: start=1000000 count=500000 (end=1500000)
3: start=1500000 count=500000 (end=2000000)
4: start=2000000 count=500000 (end=2500000)
5: start=2500000 count=500000 (end=3000000)
6: start=3000000 count=500000 (end=3500000)
7: start=3500000 count=500000 (end=4000000)
The elapsed time is 0.006288737 seconds (RATE: 636057758.927/sec)
0: start=0 count=444444 (end=444444)
1: start=444444 count=444444 (end=888888)
2: start=888888 count=444444 (end=1333332)
3: start=1333332 count=444444 (end=1777776)
4: start=1777776 count=444444 (end=2222220)
5: start=2222220 count=444444 (end=2666664)
6: start=2666664 count=444444 (end=3111108)
7: start=3111108 count=444444 (end=3555552)
8: start=3555552 count=444448 (end=4000000)
The elapsed time is 0.005567823 seconds (RATE: 718413632.935/sec)
0: start=0 count=400000 (end=400000)
1: start=400000 count=400000 (end=800000)
2: start=800000 count=400000 (end=1200000)
3: start=1200000 count=400000 (end=1600000)
4: start=1600000 count=400000 (end=2000000)
5: start=2000000 count=400000 (end=2400000)
6: start=2400000 count=400000 (end=2800000)
7: start=2800000 count=400000 (end=3200000)
8: start=3200000 count=400000 (end=3600000)
9: start=3600000 count=400000 (end=4000000)
The elapsed time is 0.005202681 seconds (RATE: 768834332.840/sec)
0: start=0 count=363636 (end=363636)
1: start=363636 count=363636 (end=727272)
2: start=727272 count=363636 (end=1090908)
3: start=1090908 count=363636 (end=1454544)
4: start=1454544 count=363636 (end=1818180)
5: start=1818180 count=363636 (end=2181816)
6: start=2181816 count=363636 (end=2545452)
7: start=2545452 count=363636 (end=2909088)
8: start=2909088 count=363636 (end=3272724)
9: start=3272724 count=363636 (end=3636360)
10: start=3636360 count=363640 (end=4000000)
The elapsed time is 0.004945641 seconds (RATE: 808793049.562/sec)
0: start=0 count=333333 (end=333333)
1: start=333333 count=333333 (end=666666)
2: start=666666 count=333333 (end=999999)
3: start=999999 count=333333 (end=1333332)
4: start=1333332 count=333333 (end=1666665)
5: start=1666665 count=333333 (end=1999998)
6: start=1999998 count=333333 (end=2333331)
7: start=2333331 count=333333 (end=2666664)
8: start=2666664 count=333333 (end=2999997)
9: start=2999997 count=333333 (end=3333330)
10: start=3333330 count=333333 (end=3666663)
11: start=3666663 count=333337 (end=4000000)
The elapsed time is 0.005431988 seconds (RATE: 736378677.432/sec)
0: start=0 count=307692 (end=307692)
1: start=307692 count=307692 (end=615384)
2: start=615384 count=307692 (end=923076)
3: start=923076 count=307692 (end=1230768)
4: start=1230768 count=307692 (end=1538460)
5: start=1538460 count=307692 (end=1846152)
6: start=1846152 count=307692 (end=2153844)
7: start=2153844 count=307692 (end=2461536)
8: start=2461536 count=307692 (end=2769228)
9: start=2769228 count=307692 (end=3076920)
10: start=3076920 count=307692 (end=3384612)
11: start=3384612 count=307692 (end=3692304)
12: start=3692304 count=307696 (end=4000000)
The elapsed time is 0.005035344 seconds (RATE: 794384720.028/sec)
0: start=0 count=285714 (end=285714)
1: start=285714 count=285714 (end=571428)
2: start=571428 count=285714 (end=857142)
3: start=857142 count=285714 (end=1142856)
4: start=1142856 count=285714 (end=1428570)
5: start=1428570 count=285714 (end=1714284)
6: start=1714284 count=285714 (end=1999998)
7: start=1999998 count=285714 (end=2285712)
8: start=2285712 count=285714 (end=2571426)
9: start=2571426 count=285714 (end=2857140)
10: start=2857140 count=285714 (end=3142854)
11: start=3142854 count=285714 (end=3428568)
12: start=3428568 count=285714 (end=3714282)
13: start=3714282 count=285718 (end=4000000)
The elapsed time is 0.004831767 seconds (RATE: 827854458.801/sec)
0: start=0 count=266666 (end=266666)
1: start=266666 count=266666 (end=533332)
2: start=533332 count=266666 (end=799998)
3: start=799998 count=266666 (end=1066664)
4: start=1066664 count=266666 (end=1333330)
5: start=1333330 count=266666 (end=1599996)
6: start=1599996 count=266666 (end=1866662)
7: start=1866662 count=266666 (end=2133328)
8: start=2133328 count=266666 (end=2399994)
9: start=2399994 count=266666 (end=2666660)
10: start=2666660 count=266666 (end=2933326)
11: start=2933326 count=266666 (end=3199992)
12: start=3199992 count=266666 (end=3466658)
13: start=3466658 count=266666 (end=3733324)
14: start=3733324 count=266676 (end=4000000)
The elapsed time is 0.006333866 seconds (RATE: 631525817.104/sec)
0: start=0 count=250000 (end=250000)
1: start=250000 count=250000 (end=500000)
2: start=500000 count=250000 (end=750000)
3: start=750000 count=250000 (end=1000000)
4: start=1000000 count=250000 (end=1250000)
5: start=1250000 count=250000 (end=1500000)
6: start=1500000 count=250000 (end=1750000)
7: start=1750000 count=250000 (end=2000000)
8: start=2000000 count=250000 (end=2250000)
9: start=2250000 count=250000 (end=2500000)
10: start=2500000 count=250000 (end=2750000)
11: start=2750000 count=250000 (end=3000000)
12: start=3000000 count=250000 (end=3250000)
13: start=3250000 count=250000 (end=3500000)
14: start=3500000 count=250000 (end=3750000)
15: start=3750000 count=250000 (end=4000000)
The elapsed time is 0.005258777 seconds (RATE: 760633102.332/sec)
我是线程的新手,我的老师要求编写一个使用 9 个线程的程序:有 3 个数组 A、B、C,各含 10^6 个元素,要把每个元素都设为 1,并且要用 9 个线程来更快地完成填充。我计划每个数组用 3 个线程来填充,因此把数组分成 3 段:i=0 到 i=333332、i=333333 到 i=666665、i=666666 到 i=999999。但是当我运行这个九线程程序时,它比单线程程序还慢(单线程程序只包含一个从 0 到 10^6 的 for 循环,用数字 1 填充 3 个数组)。
这是我的 9 线程代码:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>
// Arrays filled by the worker threads (fillA1 .. fillC3); 10^6 doubles each.
double A[1000000];
double B[1000000];
double C[1000000];
double D[1000000]; // declared but not used by the code shown
/* Thread entry point: stores 1.0 into A[0] .. A[333332].
 * The argument is ignored; always returns NULL. */
void * fillA1(void * tid){
    double *cur = &A[0];
    double *const stop = &A[333333];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into A[333333] .. A[666665].
 * The argument is ignored; always returns NULL. */
void * fillA2(void * tid){
    double *cur = &A[333333];
    double *const stop = &A[666666];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into A[666666] .. A[999999].
 * The argument is ignored; always returns NULL. */
void * fillA3(void * tid){
    double *cur = &A[666666];
    double *const stop = &A[1000000];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into B[0] .. B[333332].
 * The argument is ignored; always returns NULL. */
void * fillB1(void*tid){
    double *cur = &B[0];
    double *const stop = &B[333333];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into B[333333] .. B[666665].
 * The argument is ignored; always returns NULL. */
void * fillB2(void*tid){
    double *cur = &B[333333];
    double *const stop = &B[666666];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into B[666666] .. B[999999].
 * The argument is ignored; always returns NULL. */
void * fillB3 (void*tid){
    double *cur = &B[666666];
    double *const stop = &B[1000000];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into C[0] .. C[333332].
 * The argument is ignored; always returns NULL. */
void * fillC1(void*tid){
    double *cur = &C[0];
    double *const stop = &C[333333];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into C[333333] .. C[666665].
 * The argument is ignored; always returns NULL. */
void * fillC2(void*tid){
    double *cur = &C[333333];
    double *const stop = &C[666666];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/* Thread entry point: stores 1.0 into C[666666] .. C[999999].
 * The argument is ignored; always returns NULL. */
void * fillC3(void*tid){
    double *cur = &C[666666];
    double *const stop = &C[1000000];

    (void) tid;
    while (cur != stop)
        *cur++ = 1.0;
    return NULL;
}
/*
 * Fills A, B and C with nine worker threads (three per array) and prints
 * the elapsed wall-clock time.
 *
 * Timing uses clock_gettime(CLOCK_MONOTONIC) instead of clock(): clock()
 * reports the processor time consumed by the whole process, i.e. the sum
 * over all of its threads, so it cannot show a speed-up from running the
 * work in parallel.
 *
 * Returns 0 on success, EXIT_FAILURE if a thread could not be created.
 */
int main (void){
    // one entry point per (array, third) pair
    void *(*const workers[])(void *) = {
        fillA1, fillA2, fillA3,
        fillB1, fillB2, fillB3,
        fillC1, fillC2, fillC3,
    };
    enum { NWORKERS = sizeof workers / sizeof workers[0] };
    pthread_t tids[NWORKERS]; // pthread_t is a data type used to uniquely identify a thread.
    struct timespec begin;
    struct timespec end;
    int started = 0;
    int err = 0;

    clock_gettime(CLOCK_MONOTONIC, &begin);

    // use 9 threads to fill the 3 arrays simultaneously
    for (; started < NWORKERS; started++) {
        err = pthread_create(&tids[started], NULL, workers[started], NULL);
        if (err != 0) {
            break;
        }
    }

    // join only the threads that were really created: joining a
    // pthread_t that pthread_create never filled in is undefined
    for (int i = 0; i < started; i++) {
        pthread_join(tids[i], NULL);
    }

    clock_gettime(CLOCK_MONOTONIC, &end);

    if (err != 0) {
        fprintf(stderr, "pthread_create failed for thread %d (error %d)\n",
                started, err);
        return EXIT_FAILURE;
    }

    // seconds plus nanoseconds, converted to seconds
    double time_spent = (double)(end.tv_sec - begin.tv_sec)
                      + (double)(end.tv_nsec - begin.tv_nsec) / 1e9;
    printf("The elapsed time is %f seconds\n", time_spent);
    return 0;
}
这是我的单线程代码:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include<time.h>
// Arrays filled by the loop in main; 10^6 doubles each.
double A[1000000];
double B[1000000];
double C[1000000];
double D[1000000]; // declared but not used by the code shown
/*
 * Single-threaded baseline: sets every element of A, B and C to 1 in one
 * loop and prints the time measured with clock() around that loop.
 */
int main (void){
    const clock_t begin = clock();

    int i = 0;
    while (i < 1000000) {
        A[i] = 1;
        B[i] = 1;
        C[i] = 1;
        ++i;
    }

    const clock_t end = clock();

    // clock ticks -> seconds
    const double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("The elapsed time is %f seconds\n", time_spent);
    return 0;
}
每个数组中的每个段都有一个单独的线程函数,使程序不太灵活。通常的方法是有一个结构来描述每个线程的工作(例如,指向数组的指针和 start/end 索引)。
此外,clock_gettime 可能比 clock 更适合计时:clock 返回的是整个进程消耗的 CPU 时间(所有线程的 CPU 时间之和),而不是实际经过的时间,所以用它给多线程程序计时,结果不会变快,反而可能更慢。并且,不要将 printf 放入任何正在计时的内容中。
请注意,使用 3 个数组(例如 A/B/C
)可能与大小为 3 倍的单个数组没有任何不同(但我将其保留在其中)。
这是一个重构版本,可让您进行更多实验:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>
#define DATA_SIZE 1000000 // number of elements in array
#define NPER 3 // number of threads per array
#define NARRAY 3 // number of arrays
// integer division: main gives the remainder to the last thread of each array
#define CHUNK_SIZE (DATA_SIZE / NPER) // amount of data for each thread
#define NTHREAD (NARRAY * NPER) // number of threads
// arrays filled by the worker threads
double A[DATA_SIZE];
double B[DATA_SIZE];
double C[DATA_SIZE];
double D[DATA_SIZE]; // declared but not used by the code shown
// Describes the slice of an array that one thread fills.
// main also uses it as a per-array cursor (buflist), where start is the
// index of the next element not yet assigned to a thread.
struct work {
pthread_t tid; // thread ID, written by pthread_create
double *data; // base of the array to fill
int start; // index of the first element to fill
int count; // number of elements to fill
};
// tscgetf -- read the CLOCK_MONOTONIC clock
// RETURNS: the clock value in seconds, nanoseconds included as the fraction
double
tscgetf(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    return (double) now.tv_nsec / 1e9 + (double) now.tv_sec;
}
void *
fill(void *tid)
{
struct work *work = tid;
double *data = &work->data[work->start];
double *edata = &data[work->count];
for (; data < edata; ++data)
*data = 1.0;
return NULL;
}
// main -- split each of A, B and C into NPER chunks, fill every chunk in its
// own thread, and print the time spent creating and joining the threads
// RETURNS: 0 on success, EXIT_FAILURE if a thread could not be created
int
main(void)
{
    // per-array cursor: .start (zero-initialized) is the index of the next
    // element that has not been handed to a thread yet
    struct work buflist[NARRAY] = {
        { .data = A },
        { .data = B },
        { .data = C },
    };
    struct work threads[NTHREAD];
    int tidx = 0;

    // describe the work of every thread; this (and its printf) happens
    // before the timed region
    for (int ibuf = 0; ibuf < NARRAY; ++ibuf) {
        struct work *buf = &buflist[ibuf];

        for (int ichunk = 0; ichunk < NPER; ++ichunk, ++tidx) {
            struct work *work = &threads[tidx];

            work->data = buf->data;
            work->start = buf->start;

            // the last chunk also takes the remainder of DATA_SIZE / NPER
            if (ichunk == (NPER - 1))
                work->count = DATA_SIZE - buf->start;
            else
                work->count = CHUNK_SIZE;

            printf("%d: ibuf=%d start=%d count=%d (end=%d)\n",
                tidx,ibuf,work->start,work->count,work->start + work->count);

            buf->start += work->count;
        }
    }

    double begin = tscgetf();

    // start the workers, stopping at the first failure
    int nstarted = 0;
    int err = 0;
    for (; nstarted < NTHREAD; ++nstarted) {
        struct work *work = &threads[nstarted];

        err = pthread_create(&work->tid,NULL,fill,work);
        if (err != 0)
            break;
    }

    // join only what was started: joining a tid that pthread_create never
    // filled in is undefined
    for (tidx = 0; tidx < nstarted; ++tidx)
        pthread_join(threads[tidx].tid,NULL);

    double end = tscgetf();

    if (err != 0) {
        fprintf(stderr,"pthread_create failed for thread %d (error %d)\n",
            nstarted,err);
        return EXIT_FAILURE;
    }

    // calculate elapsed time
    double time_spent = end - begin;
    printf("The elapsed time is %.9f seconds\n", time_spent);

    return 0;
}
这是我的系统的输出:
0: ibuf=0 start=0 count=333333 (end=333333)
1: ibuf=0 start=333333 count=333333 (end=666666)
2: ibuf=0 start=666666 count=333334 (end=1000000)
3: ibuf=1 start=0 count=333333 (end=333333)
4: ibuf=1 start=333333 count=333333 (end=666666)
5: ibuf=1 start=666666 count=333334 (end=1000000)
6: ibuf=2 start=0 count=333333 (end=333333)
7: ibuf=2 start=333333 count=333333 (end=666666)
8: ibuf=2 start=666666 count=333334 (end=1000000)
The elapsed time is 0.004271893 seconds
更新:
对于您真正想要测量的内容:线程数对性能的影响,我会使用单个数组,因为线程数不必是数据数组数的倍数。
注意:为了消除系统负载、时间片等因素的影响,我会把每个基准测试(例如某个给定的线程数)重复运行多次(例如 10 次),并对每个线程数取其中最短的运行时间。(您可以使用脚本自动执行此操作)
这是我要使用的代码:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>
// Both values are guarded by #ifndef so they can be supplied from outside
// this file (e.g. on the compiler command line).
#ifndef DATA_SIZE
#define DATA_SIZE 4000000 // number of elements in array
#endif
#ifndef NTHREAD
#define NTHREAD 3 // total number of worker threads (there is only one array)
#endif
// integer division: main gives the remainder to the last thread
#define CHUNK_SIZE (DATA_SIZE / NTHREAD) // amount of data for each thread
double A[DATA_SIZE]; // the array filled by the worker threads
// Describes the slice of an array that one thread fills.
struct work {
pthread_t tid; // thread ID, written by pthread_create
double *data; // base of the array to fill
int start; // index of the first element to fill
int count; // number of elements to fill
};
// tscgetf -- read the CLOCK_MONOTONIC clock
// RETURNS: the clock value in seconds, nanoseconds included as the fraction
double
tscgetf(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    return (double) now.tv_nsec / 1e9 + (double) now.tv_sec;
}
void *
fill(void *tid)
{
struct work *work = tid;
double *data = &work->data[work->start];
double *edata = &data[work->count];
for (; data < edata; ++data)
*data = 1.0;
return NULL;
}
// main -- split A into NTHREAD chunks, fill every chunk in its own thread,
// and print the time spent creating and joining the threads together with
// the resulting fill rate in elements per second
// RETURNS: 0 on success, EXIT_FAILURE if a thread could not be created
int
main(void)
{
    struct work threads[NTHREAD];
    int start = 0;

    // describe the work of every thread; this (and its printf) happens
    // before the timed region
    for (int tidx = 0; tidx < NTHREAD; ++tidx) {
        struct work *work = &threads[tidx];

        work->data = A;
        work->start = start;

        // the last chunk also takes the remainder of DATA_SIZE / NTHREAD
        if (tidx == (NTHREAD - 1))
            work->count = DATA_SIZE - start;
        else
            work->count = CHUNK_SIZE;

        printf("%d: start=%d count=%d (end=%d)\n",
            tidx,work->start,work->count,work->start + work->count);

        start += work->count;
    }

    double begin = tscgetf();

    // start the workers, stopping at the first failure
    int nstarted = 0;
    int err = 0;
    for (; nstarted < NTHREAD; ++nstarted) {
        struct work *work = &threads[nstarted];

        err = pthread_create(&work->tid,NULL,fill,work);
        if (err != 0)
            break;
    }

    // join only what was started: joining a tid that pthread_create never
    // filled in is undefined
    for (int tidx = 0; tidx < nstarted; ++tidx)
        pthread_join(threads[tidx].tid,NULL);

    double end = tscgetf();

    if (err != 0) {
        fprintf(stderr,"pthread_create failed for thread %d (error %d)\n",
            nstarted,err);
        return EXIT_FAILURE;
    }

    // calculate elapsed time
    double time_spent = end - begin;
    printf("The elapsed time is %.9f seconds (RATE: %.3f/sec)\n",
        time_spent,(double) DATA_SIZE / time_spent);

    return 0;
}
这里是各种线程数的输出:
0: start=0 count=4000000 (end=4000000)
The elapsed time is 0.016544766 seconds (RATE: 241768306.067/sec)
0: start=0 count=2000000 (end=2000000)
1: start=2000000 count=2000000 (end=4000000)
The elapsed time is 0.008973760 seconds (RATE: 445744055.862/sec)
0: start=0 count=1333333 (end=1333333)
1: start=1333333 count=1333333 (end=2666666)
2: start=2666666 count=1333334 (end=4000000)
The elapsed time is 0.006181198 seconds (RATE: 647123775.983/sec)
0: start=0 count=1000000 (end=1000000)
1: start=1000000 count=1000000 (end=2000000)
2: start=2000000 count=1000000 (end=3000000)
3: start=3000000 count=1000000 (end=4000000)
The elapsed time is 0.006610514 seconds (RATE: 605096656.791/sec)
0: start=0 count=800000 (end=800000)
1: start=800000 count=800000 (end=1600000)
2: start=1600000 count=800000 (end=2400000)
3: start=2400000 count=800000 (end=3200000)
4: start=3200000 count=800000 (end=4000000)
The elapsed time is 0.005417253 seconds (RATE: 738381626.381/sec)
0: start=0 count=666666 (end=666666)
1: start=666666 count=666666 (end=1333332)
2: start=1333332 count=666666 (end=1999998)
3: start=1999998 count=666666 (end=2666664)
4: start=2666664 count=666666 (end=3333330)
5: start=3333330 count=666670 (end=4000000)
The elapsed time is 0.004728798 seconds (RATE: 845880951.049/sec)
0: start=0 count=571428 (end=571428)
1: start=571428 count=571428 (end=1142856)
2: start=1142856 count=571428 (end=1714284)
3: start=1714284 count=571428 (end=2285712)
4: start=2285712 count=571428 (end=2857140)
5: start=2857140 count=571428 (end=3428568)
6: start=3428568 count=571432 (end=4000000)
The elapsed time is 0.004353798 seconds (RATE: 918738077.652/sec)
0: start=0 count=500000 (end=500000)
1: start=500000 count=500000 (end=1000000)
2: start=1000000 count=500000 (end=1500000)
3: start=1500000 count=500000 (end=2000000)
4: start=2000000 count=500000 (end=2500000)
5: start=2500000 count=500000 (end=3000000)
6: start=3000000 count=500000 (end=3500000)
7: start=3500000 count=500000 (end=4000000)
The elapsed time is 0.006288737 seconds (RATE: 636057758.927/sec)
0: start=0 count=444444 (end=444444)
1: start=444444 count=444444 (end=888888)
2: start=888888 count=444444 (end=1333332)
3: start=1333332 count=444444 (end=1777776)
4: start=1777776 count=444444 (end=2222220)
5: start=2222220 count=444444 (end=2666664)
6: start=2666664 count=444444 (end=3111108)
7: start=3111108 count=444444 (end=3555552)
8: start=3555552 count=444448 (end=4000000)
The elapsed time is 0.005567823 seconds (RATE: 718413632.935/sec)
0: start=0 count=400000 (end=400000)
1: start=400000 count=400000 (end=800000)
2: start=800000 count=400000 (end=1200000)
3: start=1200000 count=400000 (end=1600000)
4: start=1600000 count=400000 (end=2000000)
5: start=2000000 count=400000 (end=2400000)
6: start=2400000 count=400000 (end=2800000)
7: start=2800000 count=400000 (end=3200000)
8: start=3200000 count=400000 (end=3600000)
9: start=3600000 count=400000 (end=4000000)
The elapsed time is 0.005202681 seconds (RATE: 768834332.840/sec)
0: start=0 count=363636 (end=363636)
1: start=363636 count=363636 (end=727272)
2: start=727272 count=363636 (end=1090908)
3: start=1090908 count=363636 (end=1454544)
4: start=1454544 count=363636 (end=1818180)
5: start=1818180 count=363636 (end=2181816)
6: start=2181816 count=363636 (end=2545452)
7: start=2545452 count=363636 (end=2909088)
8: start=2909088 count=363636 (end=3272724)
9: start=3272724 count=363636 (end=3636360)
10: start=3636360 count=363640 (end=4000000)
The elapsed time is 0.004945641 seconds (RATE: 808793049.562/sec)
0: start=0 count=333333 (end=333333)
1: start=333333 count=333333 (end=666666)
2: start=666666 count=333333 (end=999999)
3: start=999999 count=333333 (end=1333332)
4: start=1333332 count=333333 (end=1666665)
5: start=1666665 count=333333 (end=1999998)
6: start=1999998 count=333333 (end=2333331)
7: start=2333331 count=333333 (end=2666664)
8: start=2666664 count=333333 (end=2999997)
9: start=2999997 count=333333 (end=3333330)
10: start=3333330 count=333333 (end=3666663)
11: start=3666663 count=333337 (end=4000000)
The elapsed time is 0.005431988 seconds (RATE: 736378677.432/sec)
0: start=0 count=307692 (end=307692)
1: start=307692 count=307692 (end=615384)
2: start=615384 count=307692 (end=923076)
3: start=923076 count=307692 (end=1230768)
4: start=1230768 count=307692 (end=1538460)
5: start=1538460 count=307692 (end=1846152)
6: start=1846152 count=307692 (end=2153844)
7: start=2153844 count=307692 (end=2461536)
8: start=2461536 count=307692 (end=2769228)
9: start=2769228 count=307692 (end=3076920)
10: start=3076920 count=307692 (end=3384612)
11: start=3384612 count=307692 (end=3692304)
12: start=3692304 count=307696 (end=4000000)
The elapsed time is 0.005035344 seconds (RATE: 794384720.028/sec)
0: start=0 count=285714 (end=285714)
1: start=285714 count=285714 (end=571428)
2: start=571428 count=285714 (end=857142)
3: start=857142 count=285714 (end=1142856)
4: start=1142856 count=285714 (end=1428570)
5: start=1428570 count=285714 (end=1714284)
6: start=1714284 count=285714 (end=1999998)
7: start=1999998 count=285714 (end=2285712)
8: start=2285712 count=285714 (end=2571426)
9: start=2571426 count=285714 (end=2857140)
10: start=2857140 count=285714 (end=3142854)
11: start=3142854 count=285714 (end=3428568)
12: start=3428568 count=285714 (end=3714282)
13: start=3714282 count=285718 (end=4000000)
The elapsed time is 0.004831767 seconds (RATE: 827854458.801/sec)
0: start=0 count=266666 (end=266666)
1: start=266666 count=266666 (end=533332)
2: start=533332 count=266666 (end=799998)
3: start=799998 count=266666 (end=1066664)
4: start=1066664 count=266666 (end=1333330)
5: start=1333330 count=266666 (end=1599996)
6: start=1599996 count=266666 (end=1866662)
7: start=1866662 count=266666 (end=2133328)
8: start=2133328 count=266666 (end=2399994)
9: start=2399994 count=266666 (end=2666660)
10: start=2666660 count=266666 (end=2933326)
11: start=2933326 count=266666 (end=3199992)
12: start=3199992 count=266666 (end=3466658)
13: start=3466658 count=266666 (end=3733324)
14: start=3733324 count=266676 (end=4000000)
The elapsed time is 0.006333866 seconds (RATE: 631525817.104/sec)
0: start=0 count=250000 (end=250000)
1: start=250000 count=250000 (end=500000)
2: start=500000 count=250000 (end=750000)
3: start=750000 count=250000 (end=1000000)
4: start=1000000 count=250000 (end=1250000)
5: start=1250000 count=250000 (end=1500000)
6: start=1500000 count=250000 (end=1750000)
7: start=1750000 count=250000 (end=2000000)
8: start=2000000 count=250000 (end=2250000)
9: start=2250000 count=250000 (end=2500000)
10: start=2500000 count=250000 (end=2750000)
11: start=2750000 count=250000 (end=3000000)
12: start=3000000 count=250000 (end=3250000)
13: start=3250000 count=250000 (end=3500000)
14: start=3500000 count=250000 (end=3750000)
15: start=3750000 count=250000 (end=4000000)
The elapsed time is 0.005258777 seconds (RATE: 760633102.332/sec)