cuda 内核通过增加网格大小给出不正确的结果
cuda kernel gives incorrect results by grid size increase
我正在测试一个简单的 CUDA 计时算法,我遇到了一个案例,当我增加内核的网格大小时,它给出了错误的结果:
#include <unistd.h>
#include <stdio.h>
#include <assert.h>
/* we need these includes for CUDA's random number stuff */
#include <curand.h>
#include <curand_kernel.h>
#define MAX 10
#ifdef GRID
#define REPEAT GRID
#else
#define REPEAT 65535
#endif
#ifdef VECSIZE
#define SIZE VECSIZE
#else
#define SIZE 1024
#endif
__global__ void random(int *result) {
curandState_t state;
curand_init(100, 0, threadIdx.x, &state);
result[threadIdx.x] = curand(&state) % MAX;
//printf("th %d random %d\n", threadIdx.x, *result);
}
__global__ void myadd(const int *in, int *sum) {
sum[blockIdx.x] = 0;
//printf("thread %d value %d\n",threadIdx.x, in[threadIdx.x]);
atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
//atomicAdd(sum, in[threadIdx.x]);
}
int main() {
int check = 0;
/* allocate an int on the GPU */
int *x = new int[SIZE];
int *sum = new int[REPEAT];
int *d_x, *d_sum;
cudaMalloc(&d_x, sizeof(int) * SIZE);
cudaMalloc(&d_sum, sizeof(int) * REPEAT);
/* invoke the GPU to initialize all of the random states */
random<<<1, SIZE>>>(d_x);
myadd<<<REPEAT, SIZE>>>(d_x, d_sum);
cudaDeviceSynchronize();
/* copy the random number back */
cudaMemcpy(x, d_x, sizeof(int) * SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy(sum, d_sum, sizeof(int)* REPEAT, cudaMemcpyDeviceToHost);
for (int i = 0; i < SIZE; ++i) {
check += x[i];
//printf("Random[%d] = %d\n", i, x[i]);
}
cudaError_t err = cudaGetLastError(); // Get error code
if (err != cudaSuccess) {
printf("CUDA Error: %s\n", cudaGetErrorString(err));
exit(-1);
}
for (int i = 0; i < REPEAT; ++i) {
printf("i %d check %d sum[i] %d\n", i, check, sum[i]);
assert(check == sum[i]);
}
/* free the memory we allocated */
cudaFree(d_x);
cudaFree(d_sum);
delete[] x;
delete[] sum;
return 0;
}
我的显卡是V100,计算能力7.0。如您所见,我可以使用 nvcc test.cu -arch=sm_70 -O3 -g -G -DGRID=1024 -DVECSIZE=512
编译具有不同网格和矢量大小的上述代码,对于小矢量和网格大小,一切看起来都不错,但是当我将网格大小增加到最大 (65535) 时,有时计算的总和值为不正确。例如:
.
.
.
i 511 check 2331 sum[i] 2331
i 512 check 2331 sum[i] 2331
i 513 check 2331 sum[i] 2188
a.out: test.cu:87: int main(): Assertion `check == sum[i]' failed.
内核中存在竞争条件 myadd
。总和只能设置为 0 一次。并且在其他一些线程向其添加值后,不应将其设置为 0。
__global__ void myadd(const int *in, int *sum) {
if(threadIdx.x == 0){
sum[blockIdx.x] = 0;
}
__syncthreads(); // all threads wait until sum is initialized with 0
atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
}
如果你想正确地为你的代码计时,你应该删除 -G
编译器标志。
我正在测试一个简单的 CUDA 计时算法,我遇到了一个案例,当我增加内核的网格大小时,它给出了错误的结果:
#include <unistd.h>
#include <stdio.h>
#include <assert.h>
/* we need these includes for CUDA's random number stuff */
#include <curand.h>
#include <curand_kernel.h>
#define MAX 10
#ifdef GRID
#define REPEAT GRID
#else
#define REPEAT 65535
#endif
#ifdef VECSIZE
#define SIZE VECSIZE
#else
#define SIZE 1024
#endif
__global__ void random(int *result) {
curandState_t state;
curand_init(100, 0, threadIdx.x, &state);
result[threadIdx.x] = curand(&state) % MAX;
//printf("th %d random %d\n", threadIdx.x, *result);
}
__global__ void myadd(const int *in, int *sum) {
sum[blockIdx.x] = 0;
//printf("thread %d value %d\n",threadIdx.x, in[threadIdx.x]);
atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
//atomicAdd(sum, in[threadIdx.x]);
}
int main() {
int check = 0;
/* allocate an int on the GPU */
int *x = new int[SIZE];
int *sum = new int[REPEAT];
int *d_x, *d_sum;
cudaMalloc(&d_x, sizeof(int) * SIZE);
cudaMalloc(&d_sum, sizeof(int) * REPEAT);
/* invoke the GPU to initialize all of the random states */
random<<<1, SIZE>>>(d_x);
myadd<<<REPEAT, SIZE>>>(d_x, d_sum);
cudaDeviceSynchronize();
/* copy the random number back */
cudaMemcpy(x, d_x, sizeof(int) * SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy(sum, d_sum, sizeof(int)* REPEAT, cudaMemcpyDeviceToHost);
for (int i = 0; i < SIZE; ++i) {
check += x[i];
//printf("Random[%d] = %d\n", i, x[i]);
}
cudaError_t err = cudaGetLastError(); // Get error code
if (err != cudaSuccess) {
printf("CUDA Error: %s\n", cudaGetErrorString(err));
exit(-1);
}
for (int i = 0; i < REPEAT; ++i) {
printf("i %d check %d sum[i] %d\n", i, check, sum[i]);
assert(check == sum[i]);
}
/* free the memory we allocated */
cudaFree(d_x);
cudaFree(d_sum);
delete[] x;
delete[] sum;
return 0;
}
我的显卡是V100,计算能力7.0。如您所见,我可以使用 nvcc test.cu -arch=sm_70 -O3 -g -G -DGRID=1024 -DVECSIZE=512
编译具有不同网格和矢量大小的上述代码,对于小矢量和网格大小,一切看起来都不错,但是当我将网格大小增加到最大 (65535) 时,有时计算的总和值为不正确。例如:
.
.
.
i 511 check 2331 sum[i] 2331
i 512 check 2331 sum[i] 2331
i 513 check 2331 sum[i] 2188
a.out: test.cu:87: int main(): Assertion `check == sum[i]' failed.
内核中存在竞争条件 myadd
。总和只能设置为 0 一次。并且在其他一些线程向其添加值后,不应将其设置为 0。
__global__ void myadd(const int *in, int *sum) {
if(threadIdx.x == 0){
sum[blockIdx.x] = 0;
}
__syncthreads(); // all threads wait until sum is initialized with 0
atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
}
如果你想正确地为你的代码计时,你应该删除 -G
编译器标志。