Wrong argument cast in pthread_create
What I want to do is send the integer value 0 to the function so it can be used as the index into my array. But instead of writing to patients[0], it writes to patients[1]. Any idea why?
I simply loop from 0 to 1 just to see whether it passes the value 0 correctly: I pass i (0) to the function and assign something to myArr[0], but it assigns to myArr[1] instead.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
typedef struct patient_info {
pthread_t thread;
char treatment;
char department[20];
} patient;
patient patients[1000];
void* registration(void* arg)
{
int p_num = *((int*)arg); // my array index that supposed to be 0
if (rand() % 2 == 0)
{
patients[p_num].treatment = 'M';
}
else
{
patients[p_num].treatment = 'S';
}
return NULL;
}
int main(void)
{
srand(time(NULL));
for (size_t i = 0; i < 1; i++) // simple for loop to create my thread
{
if (pthread_create(&patients[i].thread, NULL, &registration, (void*)&i) != 0)
{
perror("There has been an error with pthread_create().");
return 1;
}
}
for (size_t j = 0; j < 1; j++)
{
if (pthread_join(patients[j].thread, NULL) != 0)
{
perror("There has been an error with the pthread_join().");
return 2;
}
}
for (size_t i = 0; i < 1000; i++) // make this loop to see where it is writing.
{
if (patients[i].treatment == 'M' || patients[i].treatment == 'S')
{
printf("Treatment is: %c %d\n", patients[i].treatment, i);
}
}
return 0;
}
You are passing a pointer to i, so every thread points to the same i variable.
So, the threads race to get their values. (e.g.) threadA wants 0 and threadB wants 1. But, if the main task is fast enough, both may see 0 or 1. Hence, the conflict.
Also, in main, i is a size_t, but in registration the argument is dereferenced as a pointer to int. They are [probably] different sizes.
The solution is to pass i by value:
pthread_create(&patients[i].thread, NULL, &registration, (void *) i)
And, in registration, we accept it by value:
void *
registration(void *arg)
{
size_t p_num = (size_t) arg;
// ...
return (void *) 0;
}
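As an aside, another common way to avoid the race on &i is to give each thread its own heap-allocated copy of the index and let the thread free it. This is only a sketch of that alternative (registration_heap is an illustrative name; the corrected program below does not use this approach):
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
static void *
registration_heap(void *arg)
{
    int p_num = *(int *) arg;           // each thread gets its own private copy
    free(arg);                          // the thread now owns the allocation
    printf("thread got index %d\n", p_num);
    return NULL;
}
int
main(void)
{
    pthread_t thr[2];
    for (int i = 0; i < 2; i++) {
        int *argp = malloc(sizeof *argp);
        if (argp == NULL)
            return 1;
        *argp = i;                      // snapshot of i taken before the thread starts
        if (pthread_create(&thr[i], NULL, registration_heap, argp) != 0)
            return 1;
    }
    for (int i = 0; i < 2; i++)
        pthread_join(thr[i], NULL);
    return 0;
}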
Here is the corrected code:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
typedef struct patient_info {
pthread_t thread;
char treatment;
char department[20];
} patient;
patient patients[1000];
void *
registration(void *arg)
{
// my array index that supposed to be 0
// NOTE/BUG: this uses the wrong size pointer and to prevent the race condition
// we want to accept by value
#if 0
int p_num = *((int *) arg);
#else
size_t p_num = (size_t) arg;
#endif
if (rand() % 2 == 0) {
patients[p_num].treatment = 'M';
}
else {
patients[p_num].treatment = 'S';
}
return NULL;
}
int
main(void)
{
srand(time(NULL));
// simple for loop to create my thread
for (size_t i = 0; i < 1; i++) {
if (pthread_create(&patients[i].thread, NULL, &registration,
#if 0
(void *) &i) != 0) {
#else
(void *) i) != 0) {
#endif
perror("There has been an error with pthread_create().");
return 1;
}
}
for (size_t j = 0; j < 1; j++) {
if (pthread_join(patients[j].thread, NULL) != 0) {
perror("There has been an error with the pthread_join().");
return 2;
}
}
// make this loop to see where it is writing.
for (size_t i = 0; i < 1000; i++) {
if (patients[i].treatment == 'M' || patients[i].treatment == 'S') {
printf("Treatment is: %c %d\n", patients[i].treatment, i);
}
}
return 0;
}
Now that you've gone to the trouble of creating a patient struct, we can clean up the code a bit by using and passing around some pointers to that struct:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
typedef struct patient_info {
pthread_t thread;
char treatment;
char department[20];
} patient;
patient patients[1000];
void *
registration(void *arg)
{
patient *pt = arg;
if (rand() % 2 == 0) {
pt->treatment = 'M';
}
else {
pt->treatment = 'S';
}
return NULL;
}
int
main(void)
{
srand(time(NULL));
patient *pt;
// simple for loop to create my thread
for (size_t i = 0; i < 1; i++) {
pt = &patients[i];
if (pthread_create(&pt->thread, NULL, &registration, pt) != 0) {
perror("There has been an error with pthread_create().");
return 1;
}
}
for (size_t j = 0; j < 1; j++) {
pt = &patients[j];
if (pthread_join(pt->thread, NULL) != 0) {
perror("There has been an error with the pthread_join().");
return 2;
}
}
// make this loop to see where it is writing.
for (size_t i = 0; i < 1000; i++) {
pt = &patients[i];
if (pt->treatment == 'M' || pt->treatment == 'S') {
printf("Treatment is: %c %d\n", pt->treatment, i);
}
}
return 0;
}
Note that we define the patients array to have 1000 elements.
At present, we only create a single thread.
Presumably, we want to process all 1000 records.
But, creating 1000 threads is problematic and doesn't scale very well. If we had 100,000 patients, we [probably] would not create 100,000 threads in parallel.
And, even if we could, the system would spend most of its time switching between threads and things would slow down.
Better to have a "pool" of "worker" threads and feed them a few records at a time.
If we do that, there's no reason to put the pthread_t into the patient record. We can have two separate arrays: one for the patients and another [smaller] one for the "active" threads.
There are a number of ways to do this. Ideally, we'd monitor thread completion and add new threads dynamically. But, that's a bit complicated for a first attempt.
Here is a version that breaks things up into limited chunks. It's the "good enough for now" solution:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
typedef struct patient_info {
char treatment;
char department[20];
} patient;
#define NPATIENT 1000
patient patients[NPATIENT];
#define NWORKER 10
pthread_t threads[NWORKER];
void *
registration(void *arg)
{
patient *pt = arg;
if (rand() % 2 == 0) {
pt->treatment = 'M';
}
else {
pt->treatment = 'S';
}
return NULL;
}
int
main(void)
{
srand(time(NULL));
patient *pt;
for (size_t patlo = 0; patlo < NPATIENT; patlo += NWORKER) {
size_t pathi = patlo + NWORKER;
if (pathi > NPATIENT)
pathi = NPATIENT;
size_t itsk;
// simple for loop to create my thread
itsk = 0;
for (size_t ipat = patlo; ipat < pathi; ipat++, itsk++) {
pt = &patients[ipat];
if (pthread_create(&threads[itsk], NULL, &registration, pt) != 0) {
perror("There has been an error with pthread_create().");
return 1;
}
}
// join this chunk of threads
itsk = 0;
for (size_t ipat = patlo; ipat < pathi; ipat++, itsk++) {
pt = &patients[ipat];
if (pthread_join(threads[itsk], NULL) != 0) {
perror("There has been an error with the pthread_join().");
return 2;
}
}
}
// make this loop to see where it is writing.
for (size_t ipat = 0; ipat < NPATIENT; ipat++) {
pt = &patients[ipat];
if (pt->treatment == 'M' || pt->treatment == 'S') {
printf("Treatment is: %c %zu\n", pt->treatment, ipat);
}
}
return 0;
}
UPDATE:
But why is it necessary to use pointer to struct in the below example you gave?
It's not strictly necessary, but it's a cleaner, more scalable option. And, without compiler optimization, it generates faster code.
Doing pt->whatever everywhere is simpler than patients[i].whatever.
And how can 2 thread race for 0 or 1 when I only loop for once (create only 1 thread)? – covenant
With only one thread, they don't race. But, if we switch to a larger number (e.g.) 2, they will race.
Remember, we fixed two problems:
- the race condition
- the size mismatch: i in main is a size_t, which is 8 bytes, while p_num in the thread function was an int, which is 4 bytes
UPDATE #2:
Thank you so much again. Can you please expend the names of patlo, pathi, ipat and itsk?
Well, itsk is the easiest. If I didn't know this code and had to analyze it, I'd look at all the places it's used. It is used only as an index into the threads array.
The "tsk" is a sort of "signature" style of mine (think: "task"). I frequently use three-character abbreviations/acronyms. The threads array is just an array of pthread_t. But, if we needed more per-task (i.e. per-thread) information, I'd create a per-task struct (e.g.):
typedef struct {
pthread_t tsk_pthr; // the thread ID
int tsk_patdone; // number of patients processed
long long tsk_elap; // elapsed time of task
} tsk_t;
And, a pointer to that struct would be (e.g.): tsk_t *tskcur;
As for ipat, it is the index into the patients array. When we split the patients array into chunks of NWORKER, patlo is the first index of the current chunk and pathi is the index just past the end of the current chunk. So, if NWORKER is 10, then patlo,pathi would be: 0,10 10,20 20,30 ...
And yes, what I want was working with 1000 threads at once, but as you said above it is problematic and I have 4 CPU only. Is it a better idea to change NWORKER to 4? – covenant
Generally, using the number of CPUs is a good starting point. I've had luck with 2x the number of CPUs. It's a tuning parameter. You have to try, measure, and adjust. The "best" number depends on the type of work being done.
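If you want to pick that starting point at run time instead of hard-coding NWORKER, the CPU count can be queried with sysconf(). This is a minimal sketch, assuming a POSIX/glibc system where _SC_NPROCESSORS_ONLN is available:
#include <stdio.h>
#include <unistd.h>
int
main(void)
{
    long ncpu = sysconf(_SC_NPROCESSORS_ONLN);  // CPUs currently online
    if (ncpu < 1)
        ncpu = 1;                               // fall back if the query fails
    long nworker = ncpu * 2;                    // the "2x the CPUs" rule of thumb -- tune it
    printf("online CPUs: %ld, starting worker count: %ld\n", ncpu, nworker);
    return 0;
}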
Can this be done by semaphores or mutex_locks? Let's say I can only let 10 thread inside of my registration function. – covenant
A more advanced implementation (vs. the "good enough for now" one I did above) would start NWORKER threads at the beginning. New work would then be fed to the various threads, and the threads would only be joined at the very end [i.e. not after each chunk].
To make this more dynamic, semaphores can help, if used carefully -- otherwise, they just "serialize" on waiting for the semaphore: correct logic, but reduced parallelism.
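For the "only let 10 threads inside my registration function" case from the comment, a counting semaphore is the natural fit. This is a minimal sketch, assuming POSIX semaphores from <semaphore.h> (registration_limited is just an illustrative name, not part of the programs above):
#include <pthread.h>
#include <semaphore.h>
#define NINSIDE 10                      // at most 10 threads inside at once
static sem_t reg_gate;                  // counting semaphore, initialized to NINSIDE
static void *
registration_limited(void *arg)
{
    (void) arg;                         // unused in this sketch
    sem_wait(&reg_gate);                // blocks if NINSIDE threads are already inside
    // ... the real registration work would go here ...
    sem_post(&reg_gate);                // let the next waiting thread in
    return NULL;
}
int
main(void)
{
    sem_init(&reg_gate, 0, NINSIDE);    // 0 => shared between threads, not processes
    pthread_t thr;
    if (pthread_create(&thr, NULL, registration_limited, NULL) == 0)
        pthread_join(thr, NULL);
    sem_destroy(&reg_gate);
    return 0;
}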
Or, condition variables may help (e.g.) pthread_cond_signal, etc.
Or, we could use atomic operations (from stdatomic.h). Each thread runs independently and "atomically" grabs the "next" index into the patients array. For the simple use case here, this is [probably] the most performant.
Here is a version that does that:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
#include <stdatomic.h>
typedef struct patient_info {
char treatment;
char department[20];
} patient;
#define NPATIENT 1000
patient patients[NPATIENT];
size_t patidx = 0;
#define NWORKER 10
pthread_t threads[NWORKER];
void *
registration(void *arg)
{
size_t ipat;
patient *pt;
while (1) {
// _atomically_ grab the next index to use
// NOTE: these next two lines are functionally equivalent, but ...
#if 0
// ordinary code -- has race condition
ipat = patidx++;
#else
// atomic code -- works correctly
ipat = atomic_fetch_add(&patidx,1);
#endif
// stop if we are done
if (ipat >= NPATIENT)
break;
pt = &patients[ipat];
if (rand() % 2 == 0) {
pt->treatment = 'M';
}
else {
pt->treatment = 'S';
}
}
return NULL;
}
int
main(void)
{
srand(time(NULL));
patient *pt;
// start all threads
for (size_t itsk = 0; itsk < NWORKER; ++itsk) {
if (pthread_create(&threads[itsk], NULL, &registration, (void *) itsk)
!= 0) {
perror("There has been an error with pthread_create().");
return 1;
}
}
// wait for all threads to complete
for (size_t itsk = 0; itsk < NWORKER; ++itsk) {
pthread_join(threads[itsk], NULL);
}
// make this loop to see where it is writing.
for (size_t ipat = 0; ipat < NPATIENT; ipat++) {
pt = &patients[ipat];
if (pt->treatment == 'M' || pt->treatment == 'S') {
printf("Treatment is: %c %zu\n", pt->treatment, ipat);
}
}
return 0;
}
UPDATE #3:
In the code examples above, I ignored the fact that rand is not thread-safe and that rand_r should be used instead.
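The fix is simply to give each thread its own seed and pass its address to rand_r(). Here is a tiny sketch, assuming a POSIX system (worker_state and pick_treatment are illustrative names that mirror the tsk_seed field used below):
#include <stdio.h>
#include <stdlib.h>
typedef struct {
    unsigned int seed;                  // per-thread seed, set once at startup
} worker_state;
static char
pick_treatment(worker_state *ws)
{
    // rand_r() reads and updates only the caller's seed -- no shared state
    return (rand_r(&ws->seed) % 2 == 0) ? 'M' : 'S';
}
int
main(void)
{
    worker_state ws = { .seed = 12345u };
    printf("%c\n", pick_treatment(&ws));
    return 0;
}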
Also, I talked [briefly] about performance and how it should be measured in order to tune the application.
So, I've created a [hopefully final :-)] version that combines the original chunked ("good enough for now") version and the "atomic" version, using the tsk_t struct, with additional use of pointers, macros, and performance measurement.
I had to move everything into subfunctions. It's a good example of the kind of thing a good programmer has to do.
Anyway, here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
#include <stdatomic.h>
unsigned int seed; // random seed
typedef struct patient_info {
char treatment;
char department[20];
} patient;
#ifndef NPATIENT
#define NPATIENT 100000
#endif
int npatient = 1000;
patient patients[NPATIENT];
size_t patidx; // current patient index
typedef struct {
int tsk_active; // 1=task active/joinable
pthread_t tsk_pthr; // the thread ID
unsigned int tsk_seed; // random seed
patient *tsk_pat; // pointer to patient record
void *tsk_ret; // thread return value
} tsk_t;
#ifndef NWORKER
#define NWORKER 100
#endif
int nworker;
tsk_t threads[NWORKER];
#define TSKFORCUR(_tsk) \
tsk_t *_tsk = &threads[0]; _tsk < &threads[nworker]; ++_tsk
#define TSKFORALL(_tsk) \
tsk_t *_tsk = &threads[0]; _tsk < &threads[NWORKER]; ++_tsk
typedef struct {
int (*fnc_ptr)(void); // pointer to function
const char *fnc_who; // name of function
double fnc_tscbest; // best time
int fnc_nworker; // best number of workers
} fnc_t;
int
joinall(void)
{
for (TSKFORCUR(tsk)) {
if (! tsk->tsk_active)
continue;
if (pthread_join(tsk->tsk_pthr, &tsk->tsk_ret) != 0) {
perror("There has been an error with the pthread_join().");
return 2;
}
}
return 0;
}
// registration_chunked -- "chunked" thread function
void *
registration_chunked(void *arg)
{
tsk_t *tsk = arg;
patient *pt = tsk->tsk_pat;
if (rand_r(&tsk->tsk_seed) % 2 == 0)
pt->treatment = 'M';
else
pt->treatment = 'S';
return NULL;
}
// perform_chunked -- do separate create/join on threads
int
perform_chunked(void)
{
int code = 0;
for (size_t patlo = 0; patlo < npatient; patlo += nworker) {
for (TSKFORALL(tsk))
tsk->tsk_active = 0;
size_t pathi = patlo + nworker;
if (pathi > npatient)
pathi = npatient;
// simple for loop to create my thread
tsk_t *tsk = &threads[0];
for (size_t ipat = patlo; ipat < pathi; ++ipat, ++tsk) {
tsk->tsk_active = 1;
tsk->tsk_pat = &patients[ipat];
if (pthread_create(&tsk->tsk_pthr, NULL, registration_chunked,
tsk) != 0) {
perror("There has been an error with pthread_create().");
return 1;
}
}
// join this chunk of threads
code = joinall();
if (code)
break;
}
return code;
}
// registration_atomic -- atomic thread function
void *
registration_atomic(void *arg)
{
tsk_t *tsk = arg;
size_t ipat;
patient *pt;
while (1) {
// _atomically_ grab the next index to use
// NOTE: these next two lines are functionally equivalent, but ...
#if 0
// ordinary code -- has race condition
ipat = patidx++;
#else
// atomic code -- works correctly
ipat = atomic_fetch_add(&patidx,1);
#endif
// stop if we are done
if (ipat >= npatient)
break;
pt = &patients[ipat];
if (rand_r(&tsk->tsk_seed) % 2 == 0) {
pt->treatment = 'M';
}
else {
pt->treatment = 'S';
}
}
return NULL;
}
// perform_atomic -- do all work with atomic primitives
int
perform_atomic(void)
{
atomic_store(&patidx,0);
// start all threads
for (TSKFORCUR(tsk)) {
tsk->tsk_active = 1;
if (pthread_create(&tsk->tsk_pthr, NULL, &registration_atomic, tsk)
!= 0) {
perror("There has been an error with pthread_create().");
return 1;
}
}
// wait for all threads to complete
int code = joinall();
return code;
}
// patshow -- show patient data
void
patshow(void)
{
const patient *pt;
for (size_t ipat = 0; ipat < npatient; ipat++) {
pt = &patients[ipat];
if (pt->treatment == 'M' || pt->treatment == 'S') {
printf("Treatment is: %c %zu\n", pt->treatment, ipat);
}
}
}
// tscgetf -- get hires timestamp
double
tscgetf(void)
{
struct timespec ts;
double sec;
clock_gettime(CLOCK_MONOTONIC,&ts);
sec = ts.tv_nsec;
sec /= 1e9;
sec += ts.tv_sec;
return sec;
}
// NOTE: this uses "designated initializers"
fnc_t fnclist[] = {
{ .fnc_ptr = perform_chunked, .fnc_who = "chunked" },
{ .fnc_ptr = perform_atomic, .fnc_who = "atomic" },
{ .fnc_ptr = NULL }
};
// dofnc -- benchmark a given method
double
dofnc(fnc_t *fnc,double tsclast)
{
double tscbeg;
double tscdif;
double tscbest = 1e6;
patient *pt;
// do multiple trials and take the fastest (best) one
for (int iter = 1; iter <= 5; ++iter) {
// reset the random seed
for (TSKFORALL(tsk))
tsk->tsk_seed = seed;
// reset records and heat up the cache
for (size_t ipat = 0; ipat < npatient; ipat++) {
pt = &patients[ipat];
pt->treatment = 0;
}
tscbeg = tscgetf();
fnc->fnc_ptr();
tscdif = tscgetf();
// get elapsed time
tscdif -= tscbeg;
// take the best time to account for system delays and timeslicing
if (tscdif < tscbest)
tscbest = tscdif;
}
printf(" ELAPSED=(%.9f) RATE=(%.3f p/s) -- %s",
tscbest,(double) npatient / tscbest,fnc->fnc_who);
do {
if (tsclast == 0)
break;
printf(" --");
double ratio;
if (tsclast > tscbest) {
ratio = tsclast / tscbest;
printf(" %.3fx faster",ratio);
}
else {
ratio = tscbest / tsclast;
printf(" %.3fx slower",ratio);
}
} while (0);
printf("\n");
if ((fnc->fnc_nworker <= 0) || (tscbest < fnc->fnc_tscbest)) {
fnc->fnc_nworker = nworker;
fnc->fnc_tscbest = tscbest;
}
// remember this so we can take a ratio
return tscbest;
}
void
dosize(int nwork,size_t npat)
{
static int sep = 0;
if (sep)
printf("\n");
sep = 1;
if (nwork < 1)
nwork = 1;
if (nwork > NWORKER)
nwork = NWORKER;
nworker = nwork;
if (npat < 1)
npat = 1;
if (npat > NPATIENT)
npat = NPATIENT;
npatient = npat;
printf("NWORKER=%d NPATIENT=%d\n",nworker,npatient);
double tscnow = 0;
for (fnc_t *fnc = fnclist; fnc->fnc_ptr != NULL; ++fnc)
tscnow = dofnc(fnc,tscnow);
}
int
main(void)
{
seed = time(NULL);
for (size_t nwork = 1; nwork < 40; ++nwork)
dosize(nwork,1000);
// show the best number of workers to use
printf("\n");
printf("best nworkers:\n");
for (fnc_t *fnc = fnclist; fnc->fnc_ptr != NULL; ++fnc)
printf("fnc_nworker=%d fnc_tscbest=%.9f -- %s\n",
fnc->fnc_nworker,fnc->fnc_tscbest,fnc->fnc_who);
return 0;
}
Here is the program output. Have a look at the statistics. The best number of workers to use is at the bottom. A surprise awaits you!
NWORKER=1 NPATIENT=1000
ELAPSED=(0.032663233) RATE=(30615.463 p/s) -- chunked
ELAPSED=(0.000046097) RATE=(21693397.459 p/s) -- atomic -- 708.576x faster
NWORKER=2 NPATIENT=1000
ELAPSED=(0.021753732) RATE=(45969.124 p/s) -- chunked
ELAPSED=(0.000059036) RATE=(16938829.638 p/s) -- atomic -- 368.483x faster
NWORKER=3 NPATIENT=1000
ELAPSED=(0.021092976) RATE=(47409.147 p/s) -- chunked
ELAPSED=(0.000083985) RATE=(11906898.974 p/s) -- atomic -- 251.152x faster
NWORKER=4 NPATIENT=1000
ELAPSED=(0.024977652) RATE=(40035.789 p/s) -- chunked
ELAPSED=(0.000083009) RATE=(12046901.359 p/s) -- atomic -- 300.903x faster
NWORKER=5 NPATIENT=1000
ELAPSED=(0.038758768) RATE=(25800.614 p/s) -- chunked
ELAPSED=(0.000139154) RATE=(7186281.370 p/s) -- atomic -- 278.531x faster
NWORKER=6 NPATIENT=1000
ELAPSED=(0.029736476) RATE=(33628.733 p/s) -- chunked
ELAPSED=(0.000191748) RATE=(5215177.552 p/s) -- atomic -- 155.081x faster
NWORKER=7 NPATIENT=1000
ELAPSED=(0.026535172) RATE=(37685.831 p/s) -- chunked
ELAPSED=(0.000234081) RATE=(4272024.389 p/s) -- atomic -- 113.359x faster
NWORKER=8 NPATIENT=1000
ELAPSED=(0.025485060) RATE=(39238.676 p/s) -- chunked
ELAPSED=(0.000285933) RATE=(3497322.469 p/s) -- atomic -- 89.129x faster
NWORKER=9 NPATIENT=1000
ELAPSED=(0.026013032) RATE=(38442.270 p/s) -- chunked
ELAPSED=(0.000263240) RATE=(3798813.732 p/s) -- atomic -- 98.819x faster
NWORKER=10 NPATIENT=1000
ELAPSED=(0.029725359) RATE=(33641.309 p/s) -- chunked
ELAPSED=(0.000261056) RATE=(3830595.674 p/s) -- atomic -- 113.866x faster
NWORKER=11 NPATIENT=1000
ELAPSED=(0.026881332) RATE=(37200.538 p/s) -- chunked
ELAPSED=(0.000271164) RATE=(3687805.203 p/s) -- atomic -- 99.133x faster
NWORKER=12 NPATIENT=1000
ELAPSED=(0.030074292) RATE=(33250.991 p/s) -- chunked
ELAPSED=(0.000394198) RATE=(2536796.256 p/s) -- atomic -- 76.292x faster
NWORKER=13 NPATIENT=1000
ELAPSED=(0.030961288) RATE=(32298.398 p/s) -- chunked
ELAPSED=(0.000345326) RATE=(2895815.125 p/s) -- atomic -- 89.658x faster
NWORKER=14 NPATIENT=1000
ELAPSED=(0.027436778) RATE=(36447.428 p/s) -- chunked
ELAPSED=(0.000587254) RATE=(1702840.830 p/s) -- atomic -- 46.720x faster
NWORKER=15 NPATIENT=1000
ELAPSED=(0.032111215) RATE=(31141.768 p/s) -- chunked
ELAPSED=(0.000391190) RATE=(2556302.194 p/s) -- atomic -- 82.086x faster
NWORKER=16 NPATIENT=1000
ELAPSED=(0.027765346) RATE=(36016.119 p/s) -- chunked
ELAPSED=(0.000475762) RATE=(2101891.519 p/s) -- atomic -- 58.360x faster
NWORKER=17 NPATIENT=1000
ELAPSED=(0.026204446) RATE=(38161.463 p/s) -- chunked
ELAPSED=(0.000951203) RATE=(1051300.372 p/s) -- atomic -- 27.549x faster
NWORKER=18 NPATIENT=1000
ELAPSED=(0.030340088) RATE=(32959.694 p/s) -- chunked
ELAPSED=(0.000467318) RATE=(2139870.524 p/s) -- atomic -- 64.924x faster
NWORKER=19 NPATIENT=1000
ELAPSED=(0.028912229) RATE=(34587.440 p/s) -- chunked
ELAPSED=(0.000553825) RATE=(1805624.340 p/s) -- atomic -- 52.205x faster
NWORKER=20 NPATIENT=1000
ELAPSED=(0.029094981) RATE=(34370.189 p/s) -- chunked
ELAPSED=(0.000505824) RATE=(1976972.262 p/s) -- atomic -- 57.520x faster
NWORKER=21 NPATIENT=1000
ELAPSED=(0.031570002) RATE=(31675.639 p/s) -- chunked
ELAPSED=(0.000901482) RATE=(1109284.549 p/s) -- atomic -- 35.020x faster
NWORKER=22 NPATIENT=1000
ELAPSED=(0.033848829) RATE=(29543.120 p/s) -- chunked
ELAPSED=(0.000575106) RATE=(1738809.862 p/s) -- atomic -- 58.857x faster
NWORKER=23 NPATIENT=1000
ELAPSED=(0.029385494) RATE=(34030.396 p/s) -- chunked
ELAPSED=(0.000793229) RATE=(1260669.853 p/s) -- atomic -- 37.045x faster
NWORKER=24 NPATIENT=1000
ELAPSED=(0.031210263) RATE=(32040.742 p/s) -- chunked
ELAPSED=(0.000643074) RATE=(1555030.879 p/s) -- atomic -- 48.533x faster
NWORKER=25 NPATIENT=1000
ELAPSED=(0.029140703) RATE=(34316.262 p/s) -- chunked
ELAPSED=(0.000715511) RATE=(1397602.482 p/s) -- atomic -- 40.727x faster
NWORKER=26 NPATIENT=1000
ELAPSED=(0.032022561) RATE=(31227.983 p/s) -- chunked
ELAPSED=(0.000705709) RATE=(1417014.463 p/s) -- atomic -- 45.376x faster
NWORKER=27 NPATIENT=1000
ELAPSED=(0.029134086) RATE=(34324.056 p/s) -- chunked
ELAPSED=(0.000724864) RATE=(1379569.210 p/s) -- atomic -- 40.192x faster
NWORKER=28 NPATIENT=1000
ELAPSED=(0.035466630) RATE=(28195.518 p/s) -- chunked
ELAPSED=(0.000987683) RATE=(1012470.644 p/s) -- atomic -- 35.909x faster
NWORKER=29 NPATIENT=1000
ELAPSED=(0.035837240) RATE=(27903.935 p/s) -- chunked
ELAPSED=(0.001032722) RATE=(968314.850 p/s) -- atomic -- 34.702x faster
NWORKER=30 NPATIENT=1000
ELAPSED=(0.036233530) RATE=(27598.746 p/s) -- chunked
ELAPSED=(0.001048557) RATE=(953691.602 p/s) -- atomic -- 34.556x faster
NWORKER=31 NPATIENT=1000
ELAPSED=(0.034758216) RATE=(28770.176 p/s) -- chunked
ELAPSED=(0.000810737) RATE=(1233445.583 p/s) -- atomic -- 42.872x faster
NWORKER=32 NPATIENT=1000
ELAPSED=(0.032050096) RATE=(31201.155 p/s) -- chunked
ELAPSED=(0.001110657) RATE=(900368.073 p/s) -- atomic -- 28.857x faster
NWORKER=33 NPATIENT=1000
ELAPSED=(0.028196867) RATE=(35464.933 p/s) -- chunked
ELAPSED=(0.000948129) RATE=(1054708.812 p/s) -- atomic -- 29.739x faster
NWORKER=34 NPATIENT=1000
ELAPSED=(0.036432115) RATE=(27448.310 p/s) -- chunked
ELAPSED=(0.000938635) RATE=(1065376.884 p/s) -- atomic -- 38.814x faster
NWORKER=35 NPATIENT=1000
ELAPSED=(0.029211664) RATE=(34232.901 p/s) -- chunked
ELAPSED=(0.001254896) RATE=(796878.827 p/s) -- atomic -- 23.278x faster
NWORKER=36 NPATIENT=1000
ELAPSED=(0.035125977) RATE=(28468.959 p/s) -- chunked
ELAPSED=(0.001015229) RATE=(984999.410 p/s) -- atomic -- 34.599x faster
NWORKER=37 NPATIENT=1000
ELAPSED=(0.027013535) RATE=(37018.480 p/s) -- chunked
ELAPSED=(0.000971639) RATE=(1029188.881 p/s) -- atomic -- 27.802x faster
NWORKER=38 NPATIENT=1000
ELAPSED=(0.027284315) RATE=(36651.094 p/s) -- chunked
ELAPSED=(0.001343600) RATE=(744269.135 p/s) -- atomic -- 20.307x faster
NWORKER=39 NPATIENT=1000
ELAPSED=(0.026986172) RATE=(37056.015 p/s) -- chunked
ELAPSED=(0.001386600) RATE=(721188.537 p/s) -- atomic -- 19.462x faster
best nworkers:
fnc_nworker=3 fnc_tscbest=0.021092976 -- chunked
fnc_nworker=1 fnc_tscbest=0.000046097 -- atomic