Thread/warp cuda 中的本地锁
Thread/warp local lock in cuda
我想在cuda中实现临界区。我看了很多关于这个主题的问答,答案经常涉及atomicCAS和atomicExch。
但是,这在 warp 级别不起作用,因为 warp 中的所有线程在 atomicCAS 之后获取相同的锁,从而导致死锁。
我认为有一种方法可以通过使用 warp __ballot 或 __any 指令在 cuda 中实现真正的锁定。
然而,经过多次尝试,我没有找到令人满意的(阅读有效的)解决方案。
这里有人对此有好的答案吗?
ps:我知道扭曲发散很糟糕,所以不要告诉我改变我的算法。
原子可以在曲速层工作。对锁进行内部 warp 协商的危险与获取锁的方式以及释放锁的方式有关。 this and this and 之类的问题指出了这种危险,但您不应将其混为一谈使用原子的限制。由于条件代码在 GPU 上的执行方式 (*) 以及条件构造中两条路径之间的依赖性,会产生危险。
为了在 warp 中避免上述危险,有必要打破条件构造中 2 条路径之间的依赖关系。锁的获取必须能够独立于临界区和锁的释放进行。
但是,由于您只希望在扭曲内进行协调,因此原子学就有点过分了。我们已经拥有许多强大的 warp 通信和协调机制,例如您提到的 __ballot()
。
接下来是一个在机制中使用 __ballot()
的工作示例,以确保只有 1 个线程(每个 warp)将 "acquire a lock"。由于这一切只发生在 warp 级别,我们可以再次利用它来完全免除锁的释放。我们可以知道当关键部分完成时锁被释放(因为我们只需要跟踪整个 warp 的完成)。
此示例在 warp 级别执行流压缩,使用 __ballot()
机制确保一次只有 1 个线程进入 "critical section":
$ cat t398.cu
#include <stdio.h>
#include <stdlib.h>
const int num_threads = 1024; // should be 1024 or less for this demo
// returns true if the thread acquired the lock
// will return true for at most 1 thread per warp
__device__ bool warp_lock(int req){
return ((__ffs(__ballot(req))) == ((threadIdx.x & 31)+1));
}
__global__ void test_lock(int *pattern, int *result){
__shared__ int done;
__shared__ int warp_index[32];
int myreq = pattern[threadIdx.x];
done = 1;
if ((threadIdx.x & 31) == 0) warp_index[threadIdx.x>>5] = 0;
do {
done = 1;
__syncthreads();
// attempt to "acquire lock"
bool mylock = warp_lock(myreq);
// if lock acquired, do "critical section"
if (mylock){
done = 0;
int my_index = (warp_index[threadIdx.x>>5]++) + ((threadIdx.x>>5)*32);
result[my_index] = threadIdx.x;
myreq = 0;
}
__syncthreads();
} while (!done);
}
int main(){
int *h_pattern, *d_pattern, *h_result, *d_result;
h_pattern = (int *)malloc(num_threads*sizeof(int));
h_result = (int *)malloc(num_threads*sizeof(int));
cudaMalloc(&d_pattern, num_threads*sizeof(int));
cudaMalloc(&d_result, num_threads*sizeof(int));
for (int i = 0; i < num_threads; i++) {h_pattern[i] = rand()&1;}
cudaMemcpy(d_pattern, h_pattern, num_threads*sizeof(int), cudaMemcpyHostToDevice);
cudaMemset(d_result, 0, num_threads*sizeof(int));
test_lock<<<1, num_threads>>>(d_pattern, d_result);
cudaMemcpy(h_result, d_result, num_threads*sizeof(int), cudaMemcpyDeviceToHost);
printf("index, in, out\n");
for (int i = 0; i < num_threads; i++)
printf("%4d, %d, %4d\n", i, h_pattern[i], h_result[i]);
return 0;
}
$ nvcc -arch=sm_61 -o t398 t398.cu
$ cuda-memcheck ./t398
========= CUDA-MEMCHECK
index, in, out
0, 1, 0
1, 0, 2
2, 1, 3
3, 1, 4
4, 1, 5
5, 1, 8
6, 0, 9
7, 0, 11
8, 1, 13
9, 1, 14
10, 0, 20
11, 1, 22
12, 0, 23
13, 1, 27
14, 1, 28
15, 0, 29
16, 0, 30
17, 0, 0
18, 0, 0
19, 0, 0
20, 1, 0
21, 0, 0
22, 1, 0
23, 1, 0
24, 0, 0
25, 0, 0
26, 0, 0
27, 1, 0
28, 1, 0
29, 1, 0
30, 1, 0
31, 0, 0
32, 0, 34
33, 0, 35
34, 1, 36
35, 1, 38
36, 1, 40
37, 0, 41
38, 1, 42
39, 0, 43
40, 1, 45
41, 1, 48
42, 1, 50
43, 1, 52
44, 0, 55
45, 1, 59
46, 0, 60
47, 0, 61
48, 1, 63
49, 0, 0
50, 1, 0
51, 0, 0
52, 1, 0
53, 0, 0
54, 0, 0
55, 1, 0
56, 0, 0
57, 0, 0
58, 0, 0
59, 1, 0
60, 1, 0
61, 1, 0
62, 0, 0
63, 1, 0
64, 0, 65
65, 1, 67
66, 0, 68
67, 1, 69
68, 1, 71
69, 1, 73
70, 0, 75
71, 1, 78
72, 0, 80
73, 1, 86
74, 0, 87
75, 1, 89
76, 0, 94
77, 0, 0
78, 1, 0
79, 0, 0
80, 1, 0
81, 0, 0
82, 0, 0
83, 0, 0
84, 0, 0
85, 0, 0
86, 1, 0
87, 1, 0
88, 0, 0
89, 1, 0
90, 0, 0
91, 0, 0
92, 0, 0
93, 0, 0
94, 1, 0
95, 0, 0
96, 0, 99
97, 0, 100
98, 0, 104
99, 1, 105
100, 1, 106
101, 0, 108
102, 0, 112
103, 0, 115
104, 1, 116
105, 1, 117
106, 1, 119
107, 0, 121
108, 1, 122
109, 0, 123
110, 0, 124
111, 0, 125
112, 1, 126
113, 0, 127
114, 0, 0
115, 1, 0
116, 1, 0
117, 1, 0
118, 0, 0
119, 1, 0
120, 0, 0
121, 1, 0
122, 1, 0
123, 1, 0
124, 1, 0
125, 1, 0
126, 1, 0
127, 1, 0
128, 1, 128
129, 1, 129
130, 1, 130
131, 0, 132
132, 1, 133
133, 1, 135
134, 0, 137
135, 1, 138
136, 0, 141
137, 1, 143
138, 1, 144
139, 0, 145
140, 0, 149
141, 1, 151
142, 0, 153
143, 1, 155
144, 1, 157
145, 1, 158
146, 0, 159
147, 0, 0
148, 0, 0
149, 1, 0
150, 0, 0
151, 1, 0
152, 0, 0
153, 1, 0
154, 0, 0
155, 1, 0
156, 0, 0
157, 1, 0
158, 1, 0
159, 1, 0
160, 1, 160
161, 0, 164
162, 0, 166
163, 0, 172
164, 1, 174
165, 0, 175
166, 1, 176
167, 0, 177
168, 0, 178
169, 0, 181
170, 0, 182
171, 0, 184
172, 1, 185
173, 0, 187
174, 1, 189
175, 1, 190
176, 1, 191
177, 1, 0
178, 1, 0
179, 0, 0
180, 0, 0
181, 1, 0
182, 1, 0
183, 0, 0
184, 1, 0
185, 1, 0
186, 0, 0
187, 1, 0
188, 0, 0
189, 1, 0
190, 1, 0
191, 1, 0
192, 1, 192
193, 1, 193
194, 0, 196
195, 0, 197
196, 1, 199
197, 1, 202
198, 0, 203
199, 1, 204
200, 0, 212
201, 0, 213
202, 1, 214
203, 1, 217
204, 1, 219
205, 0, 221
206, 0, 223
207, 0, 0
208, 0, 0
209, 0, 0
210, 0, 0
211, 0, 0
212, 1, 0
213, 1, 0
214, 1, 0
215, 0, 0
216, 0, 0
217, 1, 0
218, 0, 0
219, 1, 0
220, 0, 0
221, 1, 0
222, 0, 0
223, 1, 0
224, 0, 226
225, 0, 227
226, 1, 230
227, 1, 234
228, 0, 235
229, 0, 236
230, 1, 239
231, 0, 242
232, 0, 243
233, 0, 248
234, 1, 250
235, 1, 251
236, 1, 252
237, 0, 255
238, 0, 0
239, 1, 0
240, 0, 0
241, 0, 0
242, 1, 0
243, 1, 0
244, 0, 0
245, 0, 0
246, 0, 0
247, 0, 0
248, 1, 0
249, 0, 0
250, 1, 0
251, 1, 0
252, 1, 0
253, 0, 0
254, 0, 0
255, 1, 0
256, 0, 261
257, 0, 267
258, 0, 273
259, 0, 274
260, 0, 277
261, 1, 278
262, 0, 279
263, 0, 280
264, 0, 282
265, 0, 285
266, 0, 286
267, 1, 287
268, 0, 0
269, 0, 0
270, 0, 0
271, 0, 0
272, 0, 0
273, 1, 0
274, 1, 0
275, 0, 0
276, 0, 0
277, 1, 0
278, 1, 0
279, 1, 0
280, 1, 0
281, 0, 0
282, 1, 0
283, 0, 0
284, 0, 0
285, 1, 0
286, 1, 0
287, 1, 0
288, 1, 288
289, 0, 290
290, 1, 291
291, 1, 292
292, 1, 295
293, 0, 305
294, 0, 306
295, 1, 308
296, 0, 310
297, 0, 314
298, 0, 315
299, 0, 319
300, 0, 0
301, 0, 0
302, 0, 0
303, 0, 0
304, 0, 0
305, 1, 0
306, 1, 0
307, 0, 0
308, 1, 0
309, 0, 0
310, 1, 0
311, 0, 0
312, 0, 0
313, 0, 0
314, 1, 0
315, 1, 0
316, 0, 0
317, 0, 0
318, 0, 0
319, 1, 0
320, 0, 321
321, 1, 322
322, 1, 324
323, 0, 325
324, 1, 326
325, 1, 327
326, 1, 328
327, 1, 330
328, 1, 331
329, 0, 333
330, 1, 334
331, 1, 336
332, 0, 343
333, 1, 345
334, 1, 0
335, 0, 0
336, 1, 0
337, 0, 0
338, 0, 0
339, 0, 0
340, 0, 0
341, 0, 0
342, 0, 0
343, 1, 0
344, 0, 0
345, 1, 0
346, 0, 0
347, 0, 0
348, 0, 0
349, 0, 0
350, 0, 0
351, 0, 0
352, 1, 352
353, 1, 353
354, 0, 355
355, 1, 359
356, 0, 364
357, 0, 365
358, 0, 366
359, 1, 369
360, 0, 372
361, 0, 373
362, 0, 374
363, 0, 377
364, 1, 380
365, 1, 382
366, 1, 383
367, 0, 0
368, 0, 0
369, 1, 0
370, 0, 0
371, 0, 0
372, 1, 0
373, 1, 0
374, 1, 0
375, 0, 0
376, 0, 0
377, 1, 0
378, 0, 0
379, 0, 0
380, 1, 0
381, 0, 0
382, 1, 0
383, 1, 0
384, 1, 384
385, 1, 385
386, 0, 387
387, 1, 388
388, 1, 391
389, 0, 397
390, 0, 401
391, 1, 408
392, 0, 410
393, 0, 411
394, 0, 412
395, 0, 0
396, 0, 0
397, 1, 0
398, 0, 0
399, 0, 0
400, 0, 0
401, 1, 0
402, 0, 0
403, 0, 0
404, 0, 0
405, 0, 0
406, 0, 0
407, 0, 0
408, 1, 0
409, 0, 0
410, 1, 0
411, 1, 0
412, 1, 0
413, 0, 0
414, 0, 0
415, 0, 0
416, 1, 416
417, 0, 419
418, 0, 423
419, 1, 424
420, 0, 426
421, 0, 427
422, 0, 428
423, 1, 429
424, 1, 430
425, 0, 433
426, 1, 436
427, 1, 438
428, 1, 439
429, 1, 443
430, 1, 447
431, 0, 0
432, 0, 0
433, 1, 0
434, 0, 0
435, 0, 0
436, 1, 0
437, 0, 0
438, 1, 0
439, 1, 0
440, 0, 0
441, 0, 0
442, 0, 0
443, 1, 0
444, 0, 0
445, 0, 0
446, 0, 0
447, 1, 0
448, 0, 453
449, 0, 454
450, 0, 455
451, 0, 456
452, 0, 459
453, 1, 461
454, 1, 462
455, 1, 464
456, 1, 465
457, 0, 466
458, 0, 468
459, 1, 470
460, 0, 471
461, 1, 473
462, 1, 474
463, 0, 476
464, 1, 477
465, 1, 478
466, 1, 479
467, 0, 0
468, 1, 0
469, 0, 0
470, 1, 0
471, 1, 0
472, 0, 0
473, 1, 0
474, 1, 0
475, 0, 0
476, 1, 0
477, 1, 0
478, 1, 0
479, 1, 0
480, 1, 480
481, 0, 483
482, 0, 484
483, 1, 485
484, 1, 486
485, 1, 489
486, 1, 490
487, 0, 492
488, 0, 496
489, 1, 497
490, 1, 500
491, 0, 503
492, 1, 504
493, 0, 505
494, 0, 506
495, 0, 507
496, 1, 509
497, 1, 511
498, 0, 0
499, 0, 0
500, 1, 0
501, 0, 0
502, 0, 0
503, 1, 0
504, 1, 0
505, 1, 0
506, 1, 0
507, 1, 0
508, 0, 0
509, 1, 0
510, 0, 0
511, 1, 0
512, 1, 512
513, 0, 517
514, 0, 520
515, 0, 531
516, 0, 534
517, 1, 536
518, 0, 538
519, 0, 539
520, 1, 540
521, 0, 541
522, 0, 0
523, 0, 0
524, 0, 0
525, 0, 0
526, 0, 0
527, 0, 0
528, 0, 0
529, 0, 0
530, 0, 0
531, 1, 0
532, 0, 0
533, 0, 0
534, 1, 0
535, 0, 0
536, 1, 0
537, 0, 0
538, 1, 0
539, 1, 0
540, 1, 0
541, 1, 0
542, 0, 0
543, 0, 0
544, 0, 545
545, 1, 549
546, 0, 551
547, 0, 552
548, 0, 554
549, 1, 555
550, 0, 557
551, 1, 558
552, 1, 565
553, 0, 568
554, 1, 569
555, 1, 570
556, 0, 571
557, 1, 572
558, 1, 574
559, 0, 575
560, 0, 0
561, 0, 0
562, 0, 0
563, 0, 0
564, 0, 0
565, 1, 0
566, 0, 0
567, 0, 0
568, 1, 0
569, 1, 0
570, 1, 0
571, 1, 0
572, 1, 0
573, 0, 0
574, 1, 0
575, 1, 0
576, 1, 576
577, 0, 578
578, 1, 579
579, 1, 580
580, 1, 581
581, 1, 582
582, 1, 584
583, 0, 586
584, 1, 587
585, 0, 589
586, 1, 592
587, 1, 595
588, 0, 596
589, 1, 597
590, 0, 598
591, 0, 601
592, 1, 602
593, 0, 603
594, 0, 604
595, 1, 605
596, 1, 0
597, 1, 0
598, 1, 0
599, 0, 0
600, 0, 0
601, 1, 0
602, 1, 0
603, 1, 0
604, 1, 0
605, 1, 0
606, 0, 0
607, 0, 0
608, 1, 608
609, 1, 609
610, 1, 610
611, 0, 615
612, 0, 616
613, 0, 617
614, 0, 618
615, 1, 619
616, 1, 621
617, 1, 622
618, 1, 623
619, 1, 624
620, 0, 625
621, 1, 626
622, 1, 631
623, 1, 632
624, 1, 636
625, 1, 637
626, 1, 0
627, 0, 0
628, 0, 0
629, 0, 0
630, 0, 0
631, 1, 0
632, 1, 0
633, 0, 0
634, 0, 0
635, 0, 0
636, 1, 0
637, 1, 0
638, 0, 0
639, 0, 0
640, 0, 643
641, 0, 647
642, 0, 648
643, 1, 649
644, 0, 653
645, 0, 654
646, 0, 655
647, 1, 656
648, 1, 658
649, 1, 659
650, 0, 660
651, 0, 665
652, 0, 666
653, 1, 667
654, 1, 669
655, 1, 670
656, 1, 0
657, 0, 0
658, 1, 0
659, 1, 0
660, 1, 0
661, 0, 0
662, 0, 0
663, 0, 0
664, 0, 0
665, 1, 0
666, 1, 0
667, 1, 0
668, 0, 0
669, 1, 0
670, 1, 0
671, 0, 0
672, 1, 672
673, 1, 673
674, 1, 674
675, 0, 676
676, 1, 678
677, 0, 680
678, 1, 684
679, 0, 686
680, 1, 689
681, 0, 691
682, 0, 692
683, 0, 695
684, 1, 696
685, 0, 697
686, 1, 699
687, 0, 701
688, 0, 0
689, 1, 0
690, 0, 0
691, 1, 0
692, 1, 0
693, 0, 0
694, 0, 0
695, 1, 0
696, 1, 0
697, 1, 0
698, 0, 0
699, 1, 0
700, 0, 0
701, 1, 0
702, 0, 0
703, 0, 0
704, 0, 705
705, 1, 707
706, 0, 708
707, 1, 709
708, 1, 710
709, 1, 711
710, 1, 712
711, 1, 714
712, 1, 715
713, 0, 718
714, 1, 720
715, 1, 721
716, 0, 726
717, 0, 727
718, 1, 728
719, 0, 729
720, 1, 730
721, 1, 731
722, 0, 733
723, 0, 734
724, 0, 0
725, 0, 0
726, 1, 0
727, 1, 0
728, 1, 0
729, 1, 0
730, 1, 0
731, 1, 0
732, 0, 0
733, 1, 0
734, 1, 0
735, 0, 0
736, 0, 737
737, 1, 738
738, 1, 740
739, 0, 741
740, 1, 742
741, 1, 744
742, 1, 746
743, 0, 747
744, 1, 749
745, 0, 750
746, 1, 753
747, 1, 756
748, 0, 760
749, 1, 761
750, 1, 765
751, 0, 0
752, 0, 0
753, 1, 0
754, 0, 0
755, 0, 0
756, 1, 0
757, 0, 0
758, 0, 0
759, 0, 0
760, 1, 0
761, 1, 0
762, 0, 0
763, 0, 0
764, 0, 0
765, 1, 0
766, 0, 0
767, 0, 0
768, 1, 768
769, 0, 772
770, 0, 773
771, 0, 776
772, 1, 778
773, 1, 780
774, 0, 783
775, 0, 787
776, 1, 792
777, 0, 795
778, 1, 796
779, 0, 798
780, 1, 0
781, 0, 0
782, 0, 0
783, 1, 0
784, 0, 0
785, 0, 0
786, 0, 0
787, 1, 0
788, 0, 0
789, 0, 0
790, 0, 0
791, 0, 0
792, 1, 0
793, 0, 0
794, 0, 0
795, 1, 0
796, 1, 0
797, 0, 0
798, 1, 0
799, 0, 0
800, 0, 803
801, 0, 804
802, 0, 805
803, 1, 806
804, 1, 807
805, 1, 808
806, 1, 810
807, 1, 812
808, 1, 813
809, 0, 814
810, 1, 815
811, 0, 816
812, 1, 817
813, 1, 820
814, 1, 826
815, 1, 829
816, 1, 831
817, 1, 0
818, 0, 0
819, 0, 0
820, 1, 0
821, 0, 0
822, 0, 0
823, 0, 0
824, 0, 0
825, 0, 0
826, 1, 0
827, 0, 0
828, 0, 0
829, 1, 0
830, 0, 0
831, 1, 0
832, 1, 832
833, 1, 833
834, 0, 838
835, 0, 839
836, 0, 842
837, 0, 843
838, 1, 844
839, 1, 847
840, 0, 849
841, 0, 850
842, 1, 851
843, 1, 852
844, 1, 853
845, 0, 856
846, 0, 857
847, 1, 859
848, 0, 863
849, 1, 0
850, 1, 0
851, 1, 0
852, 1, 0
853, 1, 0
854, 0, 0
855, 0, 0
856, 1, 0
857, 1, 0
858, 0, 0
859, 1, 0
860, 0, 0
861, 0, 0
862, 0, 0
863, 1, 0
864, 1, 864
865, 1, 865
866, 0, 867
867, 1, 868
868, 1, 869
869, 1, 871
870, 0, 873
871, 1, 874
872, 0, 875
873, 1, 877
874, 1, 881
875, 1, 882
876, 0, 885
877, 1, 887
878, 0, 888
879, 0, 890
880, 0, 891
881, 1, 893
882, 1, 894
883, 0, 0
884, 0, 0
885, 1, 0
886, 0, 0
887, 1, 0
888, 1, 0
889, 0, 0
890, 1, 0
891, 1, 0
892, 0, 0
893, 1, 0
894, 1, 0
895, 0, 0
896, 0, 897
897, 1, 898
898, 1, 899
899, 1, 902
900, 0, 904
901, 0, 907
902, 1, 908
903, 0, 910
904, 1, 913
905, 0, 919
906, 0, 921
907, 1, 922
908, 1, 927
909, 0, 0
910, 1, 0
911, 0, 0
912, 0, 0
913, 1, 0
914, 0, 0
915, 0, 0
916, 0, 0
917, 0, 0
918, 0, 0
919, 1, 0
920, 0, 0
921, 1, 0
922, 1, 0
923, 0, 0
924, 0, 0
925, 0, 0
926, 0, 0
927, 1, 0
928, 1, 928
929, 0, 931
930, 0, 933
931, 1, 935
932, 0, 936
933, 1, 938
934, 0, 939
935, 1, 942
936, 1, 944
937, 0, 945
938, 1, 946
939, 1, 948
940, 0, 949
941, 0, 950
942, 1, 951
943, 0, 954
944, 1, 958
945, 1, 959
946, 1, 0
947, 0, 0
948, 1, 0
949, 1, 0
950, 1, 0
951, 1, 0
952, 0, 0
953, 0, 0
954, 1, 0
955, 0, 0
956, 0, 0
957, 0, 0
958, 1, 0
959, 1, 0
960, 0, 962
961, 0, 964
962, 1, 965
963, 0, 966
964, 1, 967
965, 1, 968
966, 1, 971
967, 1, 972
968, 1, 973
969, 0, 977
970, 0, 979
971, 1, 985
972, 1, 987
973, 1, 988
974, 0, 991
975, 0, 0
976, 0, 0
977, 1, 0
978, 0, 0
979, 1, 0
980, 0, 0
981, 0, 0
982, 0, 0
983, 0, 0
984, 0, 0
985, 1, 0
986, 0, 0
987, 1, 0
988, 1, 0
989, 0, 0
990, 0, 0
991, 1, 0
992, 0, 993
993, 1, 994
994, 1, 995
995, 1, 997
996, 0, 999
997, 1, 1000
998, 0, 1002
999, 1, 1004
1000, 1, 1005
1001, 0, 1006
1002, 1, 1007
1003, 0, 1009
1004, 1, 1012
1005, 1, 1018
1006, 1, 1019
1007, 1, 1021
1008, 0, 1022
1009, 1, 0
1010, 0, 0
1011, 0, 0
1012, 1, 0
1013, 0, 0
1014, 0, 0
1015, 0, 0
1016, 0, 0
1017, 0, 0
1018, 1, 0
1019, 1, 0
1020, 0, 0
1021, 1, 0
1022, 1, 0
1023, 0, 0
========= ERROR SUMMARY: 0 errors
$
请注意,所有线程都可以通过 "acquire lock" 部分进行处理而没有任何线程间依赖性(尽管一次只有一个线程会 "win"),并注意所有线程都可以通过 "win" 进行处理"critical section" 没有任何线程间依赖。
(*)注意这种情况是mitigated in Volta
我想在cuda中实现临界区。我看了很多关于这个主题的问答,答案经常涉及atomicCAS和atomicExch。
但是,这在 warp 级别不起作用,因为 warp 中的所有线程在 atomicCAS 之后获取相同的锁,从而导致死锁。
我认为有一种方法可以通过使用 warp __ballot 或 __any 指令在 cuda 中实现真正的锁定。
然而,经过多次尝试,我没有找到令人满意的(阅读有效的)解决方案。
这里有人对此有好的答案吗?
ps:我知道扭曲发散很糟糕,所以不要告诉我改变我的算法。
原子可以在曲速层工作。对锁进行内部 warp 协商的危险与获取锁的方式以及释放锁的方式有关。 this and this and
为了在 warp 中避免上述危险,有必要打破条件构造中 2 条路径之间的依赖关系。锁的获取必须能够独立于临界区和锁的释放进行。
但是,由于您只希望在扭曲内进行协调,因此原子学就有点过分了。我们已经拥有许多强大的 warp 通信和协调机制,例如您提到的 __ballot()
。
接下来是一个在机制中使用 __ballot()
的工作示例,以确保只有 1 个线程(每个 warp)将 "acquire a lock"。由于这一切只发生在 warp 级别,我们可以再次利用它来完全免除锁的释放。我们可以知道当关键部分完成时锁被释放(因为我们只需要跟踪整个 warp 的完成)。
此示例在 warp 级别执行流压缩,使用 __ballot()
机制确保一次只有 1 个线程进入 "critical section":
$ cat t398.cu
#include <stdio.h>
#include <stdlib.h>
const int num_threads = 1024; // should be 1024 or less for this demo
// returns true if the thread acquired the lock
// will return true for at most 1 thread per warp
__device__ bool warp_lock(int req){
return ((__ffs(__ballot(req))) == ((threadIdx.x & 31)+1));
}
__global__ void test_lock(int *pattern, int *result){
__shared__ int done;
__shared__ int warp_index[32];
int myreq = pattern[threadIdx.x];
done = 1;
if ((threadIdx.x & 31) == 0) warp_index[threadIdx.x>>5] = 0;
do {
done = 1;
__syncthreads();
// attempt to "acquire lock"
bool mylock = warp_lock(myreq);
// if lock acquired, do "critical section"
if (mylock){
done = 0;
int my_index = (warp_index[threadIdx.x>>5]++) + ((threadIdx.x>>5)*32);
result[my_index] = threadIdx.x;
myreq = 0;
}
__syncthreads();
} while (!done);
}
int main(){
int *h_pattern, *d_pattern, *h_result, *d_result;
h_pattern = (int *)malloc(num_threads*sizeof(int));
h_result = (int *)malloc(num_threads*sizeof(int));
cudaMalloc(&d_pattern, num_threads*sizeof(int));
cudaMalloc(&d_result, num_threads*sizeof(int));
for (int i = 0; i < num_threads; i++) {h_pattern[i] = rand()&1;}
cudaMemcpy(d_pattern, h_pattern, num_threads*sizeof(int), cudaMemcpyHostToDevice);
cudaMemset(d_result, 0, num_threads*sizeof(int));
test_lock<<<1, num_threads>>>(d_pattern, d_result);
cudaMemcpy(h_result, d_result, num_threads*sizeof(int), cudaMemcpyDeviceToHost);
printf("index, in, out\n");
for (int i = 0; i < num_threads; i++)
printf("%4d, %d, %4d\n", i, h_pattern[i], h_result[i]);
return 0;
}
$ nvcc -arch=sm_61 -o t398 t398.cu
$ cuda-memcheck ./t398
========= CUDA-MEMCHECK
index, in, out
0, 1, 0
1, 0, 2
2, 1, 3
3, 1, 4
4, 1, 5
5, 1, 8
6, 0, 9
7, 0, 11
8, 1, 13
9, 1, 14
10, 0, 20
11, 1, 22
12, 0, 23
13, 1, 27
14, 1, 28
15, 0, 29
16, 0, 30
17, 0, 0
18, 0, 0
19, 0, 0
20, 1, 0
21, 0, 0
22, 1, 0
23, 1, 0
24, 0, 0
25, 0, 0
26, 0, 0
27, 1, 0
28, 1, 0
29, 1, 0
30, 1, 0
31, 0, 0
32, 0, 34
33, 0, 35
34, 1, 36
35, 1, 38
36, 1, 40
37, 0, 41
38, 1, 42
39, 0, 43
40, 1, 45
41, 1, 48
42, 1, 50
43, 1, 52
44, 0, 55
45, 1, 59
46, 0, 60
47, 0, 61
48, 1, 63
49, 0, 0
50, 1, 0
51, 0, 0
52, 1, 0
53, 0, 0
54, 0, 0
55, 1, 0
56, 0, 0
57, 0, 0
58, 0, 0
59, 1, 0
60, 1, 0
61, 1, 0
62, 0, 0
63, 1, 0
64, 0, 65
65, 1, 67
66, 0, 68
67, 1, 69
68, 1, 71
69, 1, 73
70, 0, 75
71, 1, 78
72, 0, 80
73, 1, 86
74, 0, 87
75, 1, 89
76, 0, 94
77, 0, 0
78, 1, 0
79, 0, 0
80, 1, 0
81, 0, 0
82, 0, 0
83, 0, 0
84, 0, 0
85, 0, 0
86, 1, 0
87, 1, 0
88, 0, 0
89, 1, 0
90, 0, 0
91, 0, 0
92, 0, 0
93, 0, 0
94, 1, 0
95, 0, 0
96, 0, 99
97, 0, 100
98, 0, 104
99, 1, 105
100, 1, 106
101, 0, 108
102, 0, 112
103, 0, 115
104, 1, 116
105, 1, 117
106, 1, 119
107, 0, 121
108, 1, 122
109, 0, 123
110, 0, 124
111, 0, 125
112, 1, 126
113, 0, 127
114, 0, 0
115, 1, 0
116, 1, 0
117, 1, 0
118, 0, 0
119, 1, 0
120, 0, 0
121, 1, 0
122, 1, 0
123, 1, 0
124, 1, 0
125, 1, 0
126, 1, 0
127, 1, 0
128, 1, 128
129, 1, 129
130, 1, 130
131, 0, 132
132, 1, 133
133, 1, 135
134, 0, 137
135, 1, 138
136, 0, 141
137, 1, 143
138, 1, 144
139, 0, 145
140, 0, 149
141, 1, 151
142, 0, 153
143, 1, 155
144, 1, 157
145, 1, 158
146, 0, 159
147, 0, 0
148, 0, 0
149, 1, 0
150, 0, 0
151, 1, 0
152, 0, 0
153, 1, 0
154, 0, 0
155, 1, 0
156, 0, 0
157, 1, 0
158, 1, 0
159, 1, 0
160, 1, 160
161, 0, 164
162, 0, 166
163, 0, 172
164, 1, 174
165, 0, 175
166, 1, 176
167, 0, 177
168, 0, 178
169, 0, 181
170, 0, 182
171, 0, 184
172, 1, 185
173, 0, 187
174, 1, 189
175, 1, 190
176, 1, 191
177, 1, 0
178, 1, 0
179, 0, 0
180, 0, 0
181, 1, 0
182, 1, 0
183, 0, 0
184, 1, 0
185, 1, 0
186, 0, 0
187, 1, 0
188, 0, 0
189, 1, 0
190, 1, 0
191, 1, 0
192, 1, 192
193, 1, 193
194, 0, 196
195, 0, 197
196, 1, 199
197, 1, 202
198, 0, 203
199, 1, 204
200, 0, 212
201, 0, 213
202, 1, 214
203, 1, 217
204, 1, 219
205, 0, 221
206, 0, 223
207, 0, 0
208, 0, 0
209, 0, 0
210, 0, 0
211, 0, 0
212, 1, 0
213, 1, 0
214, 1, 0
215, 0, 0
216, 0, 0
217, 1, 0
218, 0, 0
219, 1, 0
220, 0, 0
221, 1, 0
222, 0, 0
223, 1, 0
224, 0, 226
225, 0, 227
226, 1, 230
227, 1, 234
228, 0, 235
229, 0, 236
230, 1, 239
231, 0, 242
232, 0, 243
233, 0, 248
234, 1, 250
235, 1, 251
236, 1, 252
237, 0, 255
238, 0, 0
239, 1, 0
240, 0, 0
241, 0, 0
242, 1, 0
243, 1, 0
244, 0, 0
245, 0, 0
246, 0, 0
247, 0, 0
248, 1, 0
249, 0, 0
250, 1, 0
251, 1, 0
252, 1, 0
253, 0, 0
254, 0, 0
255, 1, 0
256, 0, 261
257, 0, 267
258, 0, 273
259, 0, 274
260, 0, 277
261, 1, 278
262, 0, 279
263, 0, 280
264, 0, 282
265, 0, 285
266, 0, 286
267, 1, 287
268, 0, 0
269, 0, 0
270, 0, 0
271, 0, 0
272, 0, 0
273, 1, 0
274, 1, 0
275, 0, 0
276, 0, 0
277, 1, 0
278, 1, 0
279, 1, 0
280, 1, 0
281, 0, 0
282, 1, 0
283, 0, 0
284, 0, 0
285, 1, 0
286, 1, 0
287, 1, 0
288, 1, 288
289, 0, 290
290, 1, 291
291, 1, 292
292, 1, 295
293, 0, 305
294, 0, 306
295, 1, 308
296, 0, 310
297, 0, 314
298, 0, 315
299, 0, 319
300, 0, 0
301, 0, 0
302, 0, 0
303, 0, 0
304, 0, 0
305, 1, 0
306, 1, 0
307, 0, 0
308, 1, 0
309, 0, 0
310, 1, 0
311, 0, 0
312, 0, 0
313, 0, 0
314, 1, 0
315, 1, 0
316, 0, 0
317, 0, 0
318, 0, 0
319, 1, 0
320, 0, 321
321, 1, 322
322, 1, 324
323, 0, 325
324, 1, 326
325, 1, 327
326, 1, 328
327, 1, 330
328, 1, 331
329, 0, 333
330, 1, 334
331, 1, 336
332, 0, 343
333, 1, 345
334, 1, 0
335, 0, 0
336, 1, 0
337, 0, 0
338, 0, 0
339, 0, 0
340, 0, 0
341, 0, 0
342, 0, 0
343, 1, 0
344, 0, 0
345, 1, 0
346, 0, 0
347, 0, 0
348, 0, 0
349, 0, 0
350, 0, 0
351, 0, 0
352, 1, 352
353, 1, 353
354, 0, 355
355, 1, 359
356, 0, 364
357, 0, 365
358, 0, 366
359, 1, 369
360, 0, 372
361, 0, 373
362, 0, 374
363, 0, 377
364, 1, 380
365, 1, 382
366, 1, 383
367, 0, 0
368, 0, 0
369, 1, 0
370, 0, 0
371, 0, 0
372, 1, 0
373, 1, 0
374, 1, 0
375, 0, 0
376, 0, 0
377, 1, 0
378, 0, 0
379, 0, 0
380, 1, 0
381, 0, 0
382, 1, 0
383, 1, 0
384, 1, 384
385, 1, 385
386, 0, 387
387, 1, 388
388, 1, 391
389, 0, 397
390, 0, 401
391, 1, 408
392, 0, 410
393, 0, 411
394, 0, 412
395, 0, 0
396, 0, 0
397, 1, 0
398, 0, 0
399, 0, 0
400, 0, 0
401, 1, 0
402, 0, 0
403, 0, 0
404, 0, 0
405, 0, 0
406, 0, 0
407, 0, 0
408, 1, 0
409, 0, 0
410, 1, 0
411, 1, 0
412, 1, 0
413, 0, 0
414, 0, 0
415, 0, 0
416, 1, 416
417, 0, 419
418, 0, 423
419, 1, 424
420, 0, 426
421, 0, 427
422, 0, 428
423, 1, 429
424, 1, 430
425, 0, 433
426, 1, 436
427, 1, 438
428, 1, 439
429, 1, 443
430, 1, 447
431, 0, 0
432, 0, 0
433, 1, 0
434, 0, 0
435, 0, 0
436, 1, 0
437, 0, 0
438, 1, 0
439, 1, 0
440, 0, 0
441, 0, 0
442, 0, 0
443, 1, 0
444, 0, 0
445, 0, 0
446, 0, 0
447, 1, 0
448, 0, 453
449, 0, 454
450, 0, 455
451, 0, 456
452, 0, 459
453, 1, 461
454, 1, 462
455, 1, 464
456, 1, 465
457, 0, 466
458, 0, 468
459, 1, 470
460, 0, 471
461, 1, 473
462, 1, 474
463, 0, 476
464, 1, 477
465, 1, 478
466, 1, 479
467, 0, 0
468, 1, 0
469, 0, 0
470, 1, 0
471, 1, 0
472, 0, 0
473, 1, 0
474, 1, 0
475, 0, 0
476, 1, 0
477, 1, 0
478, 1, 0
479, 1, 0
480, 1, 480
481, 0, 483
482, 0, 484
483, 1, 485
484, 1, 486
485, 1, 489
486, 1, 490
487, 0, 492
488, 0, 496
489, 1, 497
490, 1, 500
491, 0, 503
492, 1, 504
493, 0, 505
494, 0, 506
495, 0, 507
496, 1, 509
497, 1, 511
498, 0, 0
499, 0, 0
500, 1, 0
501, 0, 0
502, 0, 0
503, 1, 0
504, 1, 0
505, 1, 0
506, 1, 0
507, 1, 0
508, 0, 0
509, 1, 0
510, 0, 0
511, 1, 0
512, 1, 512
513, 0, 517
514, 0, 520
515, 0, 531
516, 0, 534
517, 1, 536
518, 0, 538
519, 0, 539
520, 1, 540
521, 0, 541
522, 0, 0
523, 0, 0
524, 0, 0
525, 0, 0
526, 0, 0
527, 0, 0
528, 0, 0
529, 0, 0
530, 0, 0
531, 1, 0
532, 0, 0
533, 0, 0
534, 1, 0
535, 0, 0
536, 1, 0
537, 0, 0
538, 1, 0
539, 1, 0
540, 1, 0
541, 1, 0
542, 0, 0
543, 0, 0
544, 0, 545
545, 1, 549
546, 0, 551
547, 0, 552
548, 0, 554
549, 1, 555
550, 0, 557
551, 1, 558
552, 1, 565
553, 0, 568
554, 1, 569
555, 1, 570
556, 0, 571
557, 1, 572
558, 1, 574
559, 0, 575
560, 0, 0
561, 0, 0
562, 0, 0
563, 0, 0
564, 0, 0
565, 1, 0
566, 0, 0
567, 0, 0
568, 1, 0
569, 1, 0
570, 1, 0
571, 1, 0
572, 1, 0
573, 0, 0
574, 1, 0
575, 1, 0
576, 1, 576
577, 0, 578
578, 1, 579
579, 1, 580
580, 1, 581
581, 1, 582
582, 1, 584
583, 0, 586
584, 1, 587
585, 0, 589
586, 1, 592
587, 1, 595
588, 0, 596
589, 1, 597
590, 0, 598
591, 0, 601
592, 1, 602
593, 0, 603
594, 0, 604
595, 1, 605
596, 1, 0
597, 1, 0
598, 1, 0
599, 0, 0
600, 0, 0
601, 1, 0
602, 1, 0
603, 1, 0
604, 1, 0
605, 1, 0
606, 0, 0
607, 0, 0
608, 1, 608
609, 1, 609
610, 1, 610
611, 0, 615
612, 0, 616
613, 0, 617
614, 0, 618
615, 1, 619
616, 1, 621
617, 1, 622
618, 1, 623
619, 1, 624
620, 0, 625
621, 1, 626
622, 1, 631
623, 1, 632
624, 1, 636
625, 1, 637
626, 1, 0
627, 0, 0
628, 0, 0
629, 0, 0
630, 0, 0
631, 1, 0
632, 1, 0
633, 0, 0
634, 0, 0
635, 0, 0
636, 1, 0
637, 1, 0
638, 0, 0
639, 0, 0
640, 0, 643
641, 0, 647
642, 0, 648
643, 1, 649
644, 0, 653
645, 0, 654
646, 0, 655
647, 1, 656
648, 1, 658
649, 1, 659
650, 0, 660
651, 0, 665
652, 0, 666
653, 1, 667
654, 1, 669
655, 1, 670
656, 1, 0
657, 0, 0
658, 1, 0
659, 1, 0
660, 1, 0
661, 0, 0
662, 0, 0
663, 0, 0
664, 0, 0
665, 1, 0
666, 1, 0
667, 1, 0
668, 0, 0
669, 1, 0
670, 1, 0
671, 0, 0
672, 1, 672
673, 1, 673
674, 1, 674
675, 0, 676
676, 1, 678
677, 0, 680
678, 1, 684
679, 0, 686
680, 1, 689
681, 0, 691
682, 0, 692
683, 0, 695
684, 1, 696
685, 0, 697
686, 1, 699
687, 0, 701
688, 0, 0
689, 1, 0
690, 0, 0
691, 1, 0
692, 1, 0
693, 0, 0
694, 0, 0
695, 1, 0
696, 1, 0
697, 1, 0
698, 0, 0
699, 1, 0
700, 0, 0
701, 1, 0
702, 0, 0
703, 0, 0
704, 0, 705
705, 1, 707
706, 0, 708
707, 1, 709
708, 1, 710
709, 1, 711
710, 1, 712
711, 1, 714
712, 1, 715
713, 0, 718
714, 1, 720
715, 1, 721
716, 0, 726
717, 0, 727
718, 1, 728
719, 0, 729
720, 1, 730
721, 1, 731
722, 0, 733
723, 0, 734
724, 0, 0
725, 0, 0
726, 1, 0
727, 1, 0
728, 1, 0
729, 1, 0
730, 1, 0
731, 1, 0
732, 0, 0
733, 1, 0
734, 1, 0
735, 0, 0
736, 0, 737
737, 1, 738
738, 1, 740
739, 0, 741
740, 1, 742
741, 1, 744
742, 1, 746
743, 0, 747
744, 1, 749
745, 0, 750
746, 1, 753
747, 1, 756
748, 0, 760
749, 1, 761
750, 1, 765
751, 0, 0
752, 0, 0
753, 1, 0
754, 0, 0
755, 0, 0
756, 1, 0
757, 0, 0
758, 0, 0
759, 0, 0
760, 1, 0
761, 1, 0
762, 0, 0
763, 0, 0
764, 0, 0
765, 1, 0
766, 0, 0
767, 0, 0
768, 1, 768
769, 0, 772
770, 0, 773
771, 0, 776
772, 1, 778
773, 1, 780
774, 0, 783
775, 0, 787
776, 1, 792
777, 0, 795
778, 1, 796
779, 0, 798
780, 1, 0
781, 0, 0
782, 0, 0
783, 1, 0
784, 0, 0
785, 0, 0
786, 0, 0
787, 1, 0
788, 0, 0
789, 0, 0
790, 0, 0
791, 0, 0
792, 1, 0
793, 0, 0
794, 0, 0
795, 1, 0
796, 1, 0
797, 0, 0
798, 1, 0
799, 0, 0
800, 0, 803
801, 0, 804
802, 0, 805
803, 1, 806
804, 1, 807
805, 1, 808
806, 1, 810
807, 1, 812
808, 1, 813
809, 0, 814
810, 1, 815
811, 0, 816
812, 1, 817
813, 1, 820
814, 1, 826
815, 1, 829
816, 1, 831
817, 1, 0
818, 0, 0
819, 0, 0
820, 1, 0
821, 0, 0
822, 0, 0
823, 0, 0
824, 0, 0
825, 0, 0
826, 1, 0
827, 0, 0
828, 0, 0
829, 1, 0
830, 0, 0
831, 1, 0
832, 1, 832
833, 1, 833
834, 0, 838
835, 0, 839
836, 0, 842
837, 0, 843
838, 1, 844
839, 1, 847
840, 0, 849
841, 0, 850
842, 1, 851
843, 1, 852
844, 1, 853
845, 0, 856
846, 0, 857
847, 1, 859
848, 0, 863
849, 1, 0
850, 1, 0
851, 1, 0
852, 1, 0
853, 1, 0
854, 0, 0
855, 0, 0
856, 1, 0
857, 1, 0
858, 0, 0
859, 1, 0
860, 0, 0
861, 0, 0
862, 0, 0
863, 1, 0
864, 1, 864
865, 1, 865
866, 0, 867
867, 1, 868
868, 1, 869
869, 1, 871
870, 0, 873
871, 1, 874
872, 0, 875
873, 1, 877
874, 1, 881
875, 1, 882
876, 0, 885
877, 1, 887
878, 0, 888
879, 0, 890
880, 0, 891
881, 1, 893
882, 1, 894
883, 0, 0
884, 0, 0
885, 1, 0
886, 0, 0
887, 1, 0
888, 1, 0
889, 0, 0
890, 1, 0
891, 1, 0
892, 0, 0
893, 1, 0
894, 1, 0
895, 0, 0
896, 0, 897
897, 1, 898
898, 1, 899
899, 1, 902
900, 0, 904
901, 0, 907
902, 1, 908
903, 0, 910
904, 1, 913
905, 0, 919
906, 0, 921
907, 1, 922
908, 1, 927
909, 0, 0
910, 1, 0
911, 0, 0
912, 0, 0
913, 1, 0
914, 0, 0
915, 0, 0
916, 0, 0
917, 0, 0
918, 0, 0
919, 1, 0
920, 0, 0
921, 1, 0
922, 1, 0
923, 0, 0
924, 0, 0
925, 0, 0
926, 0, 0
927, 1, 0
928, 1, 928
929, 0, 931
930, 0, 933
931, 1, 935
932, 0, 936
933, 1, 938
934, 0, 939
935, 1, 942
936, 1, 944
937, 0, 945
938, 1, 946
939, 1, 948
940, 0, 949
941, 0, 950
942, 1, 951
943, 0, 954
944, 1, 958
945, 1, 959
946, 1, 0
947, 0, 0
948, 1, 0
949, 1, 0
950, 1, 0
951, 1, 0
952, 0, 0
953, 0, 0
954, 1, 0
955, 0, 0
956, 0, 0
957, 0, 0
958, 1, 0
959, 1, 0
960, 0, 962
961, 0, 964
962, 1, 965
963, 0, 966
964, 1, 967
965, 1, 968
966, 1, 971
967, 1, 972
968, 1, 973
969, 0, 977
970, 0, 979
971, 1, 985
972, 1, 987
973, 1, 988
974, 0, 991
975, 0, 0
976, 0, 0
977, 1, 0
978, 0, 0
979, 1, 0
980, 0, 0
981, 0, 0
982, 0, 0
983, 0, 0
984, 0, 0
985, 1, 0
986, 0, 0
987, 1, 0
988, 1, 0
989, 0, 0
990, 0, 0
991, 1, 0
992, 0, 993
993, 1, 994
994, 1, 995
995, 1, 997
996, 0, 999
997, 1, 1000
998, 0, 1002
999, 1, 1004
1000, 1, 1005
1001, 0, 1006
1002, 1, 1007
1003, 0, 1009
1004, 1, 1012
1005, 1, 1018
1006, 1, 1019
1007, 1, 1021
1008, 0, 1022
1009, 1, 0
1010, 0, 0
1011, 0, 0
1012, 1, 0
1013, 0, 0
1014, 0, 0
1015, 0, 0
1016, 0, 0
1017, 0, 0
1018, 1, 0
1019, 1, 0
1020, 0, 0
1021, 1, 0
1022, 1, 0
1023, 0, 0
========= ERROR SUMMARY: 0 errors
$
请注意,所有线程都可以通过 "acquire lock" 部分进行处理而没有任何线程间依赖性(尽管一次只有一个线程会 "win"),并注意所有线程都可以通过 "win" 进行处理"critical section" 没有任何线程间依赖。
(*)注意这种情况是mitigated in Volta