CUDA parallel scan algorithm shared memory race condition
I am reading the book "Programming Massively Parallel Processors" (3rd edition), which presents an implementation of the Kogge-Stone parallel scan algorithm. The algorithm is meant to run in a single block (this is just a preliminary simplification). The implementation is below.
// X is the input array, Y is the output array, InputSize is the size of the input array
__global__ void Kogge_Stone_scan_kernel(float* X, float* Y, int InputSize) {
    __shared__ float XY[SECTION_SIZE]; // SECTION_SIZE is basically blockDim.x
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < InputSize)
        XY[threadIdx.x] = X[i];
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        __syncthreads();
        if (threadIdx.x >= stride)
            XY[threadIdx.x] += XY[threadIdx.x - stride]; // Race condition here?
    }
    Y[i] = XY[threadIdx.x];
}
Regardless of how the algorithm works, I am a bit confused by the line XY[threadIdx.x] += XY[threadIdx.x - stride]. Say stride = 1; then the thread with threadIdx.x = 6 will perform the operation XY[6] += XY[5]. However, at the same time, the thread with threadIdx.x = 5 will perform XY[5] += XY[4]. The question is: is there any guarantee that thread 6 will read the original value of XY[5] rather than XY[5] + XY[4]? Note that this is not limited to a single warp, where lockstep execution might prevent the race condition.
Thanks
"is there any guarantee that the thread 6 will read the original value of XY[5] instead of XY[5] + XY[4]"

No, CUDA makes no guarantee of thread execution order (lockstep or otherwise), and nothing in the code sorts this out for you.
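To see why, note that the += on shared memory is not atomic: it involves a separate load, an add, and a store, and nothing orders thread 6's load of XY[5] relative to thread 5's store to XY[5]. Roughly (a sketch of the decomposition, not the actual generated code):

// What XY[threadIdx.x] += XY[threadIdx.x - stride] roughly expands to:
float lhs = XY[threadIdx.x];          // load this thread's own element
float rhs = XY[threadIdx.x - stride]; // load the neighbor: may observe either the
                                      // old value or the neighbor's updated value
XY[threadIdx.x] = lhs + rhs;          // store the sum back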
As an aside, cuda-memcheck and compute-sanitizer are quite good at identifying shared memory race conditions:
$ cat t2.cu
const int SECTION_SIZE = 256;
__global__ void Kogge_Stone_scan_kernel(float* X, float* Y, int InputSize) {
    __shared__ float XY[SECTION_SIZE]; // SECTION_SIZE is basically blockDim.x
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < InputSize)
        XY[threadIdx.x] = X[i];
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        __syncthreads();
        if (threadIdx.x >= stride)
            XY[threadIdx.x] += XY[threadIdx.x - stride]; // Race condition here?
    }
    Y[i] = XY[threadIdx.x];
}
int main(){
    const int nblk = 1;
    const int sz = nblk*SECTION_SIZE;
    const int bsz = sz*sizeof(float);
    float *X, *Y;
    cudaMallocManaged(&X, bsz);
    cudaMallocManaged(&Y, bsz);
    Kogge_Stone_scan_kernel<<<nblk, SECTION_SIZE>>>(X, Y, sz);
    cudaDeviceSynchronize();
}
$ nvcc -o t2 t2.cu -lineinfo
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck --tool racecheck ./t2
========= CUDA-MEMCHECK
========= ERROR: Race reported between Read access at 0x000001a0 in /home/user2/misc/junk/t2.cu:12:Kogge_Stone_scan_kernel(float*, float*, int)
========= and Write access at 0x000001c0 in /home/user2/misc/junk/t2.cu:12:Kogge_Stone_scan_kernel(float*, float*, int) [6152 hazards]
=========
========= RACECHECK SUMMARY: 1 hazard displayed (1 error, 0 warnings)
$
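On newer CUDA toolkits, where cuda-memcheck is deprecated, the equivalent check runs under compute-sanitizer with the same tool selection (the output formatting may differ slightly):

$ compute-sanitizer --tool racecheck ./t2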
As you may have guessed, you can fix this by breaking the read and write operations in the offending line into separate steps, with a barrier in between:
$ cat t2.cu
const int SECTION_SIZE = 256;
__global__ void Kogge_Stone_scan_kernel(float* X, float* Y, int InputSize) {
    __shared__ float XY[SECTION_SIZE]; // SECTION_SIZE is basically blockDim.x
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < InputSize)
        XY[threadIdx.x] = X[i];
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        __syncthreads();
        float val;
        if (threadIdx.x >= stride)
            val = XY[threadIdx.x - stride];
        __syncthreads();
        if (threadIdx.x >= stride)
            XY[threadIdx.x] += val;
    }
    Y[i] = XY[threadIdx.x];
}
int main(){
    const int nblk = 1;
    const int sz = nblk*SECTION_SIZE;
    const int bsz = sz*sizeof(float);
    float *X, *Y;
    cudaMallocManaged(&X, bsz);
    cudaMallocManaged(&Y, bsz);
    Kogge_Stone_scan_kernel<<<nblk, SECTION_SIZE>>>(X, Y, sz);
    cudaDeviceSynchronize();
}
$ nvcc -o t2 t2.cu -lineinfo
$ cuda-memcheck --tool racecheck ./t2
========= CUDA-MEMCHECK
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$
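For completeness: another common way to remove the hazard is double buffering, where each iteration reads from one shared array and writes to the other, so no thread ever reads an element another thread is concurrently writing. This is only a sketch (it is not the book's code, and the kernel name is made up), trading roughly double the shared memory for one __syncthreads() per iteration instead of two:

const int SECTION_SIZE = 256;
__global__ void Kogge_Stone_scan_double_buffer(float* X, float* Y, int InputSize) {
    // Two shared buffers: each iteration reads buf[src] and writes buf[1-src]
    __shared__ float buf[2][SECTION_SIZE];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    buf[0][threadIdx.x] = (i < InputSize) ? X[i] : 0.0f;
    int src = 0;
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        __syncthreads(); // make all writes from the previous iteration visible
        int dst = 1 - src;
        if (threadIdx.x >= stride)
            buf[dst][threadIdx.x] = buf[src][threadIdx.x] + buf[src][threadIdx.x - stride];
        else
            buf[dst][threadIdx.x] = buf[src][threadIdx.x]; // carry unchanged elements forward
        src = dst;
    }
    if (i < InputSize)
        Y[i] = buf[src][threadIdx.x]; // each thread reads only its own element, so no barrier needed
}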