大于 16 位的线程块的无效配置参数

Question

这段代码工作正常：

#include <stdio.h>
#define N 1000 // <-- Works for values < 2^16

// Writes b[k] = 2 * a[k], where k is the x-index of the calling thread block.
// Only blockIdx.x is used for addressing, so every thread of a block targets
// the same element. Blocks whose index is not below N do nothing.
__global__
void add(int *a, int *b) {
    const int idx = blockIdx.x;
    if (idx >= N) {
        return;
    }
    b[idx] = a[idx] * 2;
}
// Host driver: fills ha with 0..N-1, runs the kernel to write 2*ha[i] into
// hb, then prints the largest result and the index where it was found.
// NOTE(review): return codes of the CUDA calls are not checked in this
// version, so a failed allocation, copy or launch goes unnoticed.
int main() {
    // max_value[0] is the running maximum, max_value[1] the index it came
    // from. The index starts at 0 so the printf below never reads an
    // indeterminate value when hb[0] is already the maximum.
    int max_value[2] = {0, 0};
    int ha[N], hb[N];
    int *da, *db;
    cudaMalloc((void **)&da, N*sizeof(int));
    cudaMalloc((void **)&db, N*sizeof(int));
    for (int i = 0; i<N; ++i) {
        ha[i] = i;
    }
    cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
    // N blocks of one thread each: the kernel indexes by blockIdx.x only.
    add<<<N, 1>>>(da, db);
    cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);
    max_value[0] = hb[0];
    for (int i = 0; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    cudaFree(da);
    cudaFree(db);
    printf("Max number %d, from value:%d \n", max_value[0], max_value[1]);
    // Waits for a key press before the process exits.
    getchar();
    return 0;
}

但是，当我把 N（数组中的元素个数）从 1000 改为大于 2¹⁶−1 的值时，程序就会崩溃。

我以为这是主机（host）端的溢出，所以把 ha 和 hb 的数组声明移到了 BSS 段，并把 N 改成了 100 万。

#include <stdio.h>
#define N 1000000 // <----

// Doubles one input element per thread block: the element index is the
// block's x-index, and blocks with an index of N or more are skipped.
__global__
void add(int *a, int *b) {
    const int block = blockIdx.x;
    const bool in_range = block < N;
    if (in_range) {
        b[block] = a[block] * 2;
    }
}
// Host input/output buffers. They live in static storage at file scope
// rather than as locals inside main.
static int ha[N];
static int hb[N];

// Host driver: fills ha with 0..N-1, runs the kernel to write 2*ha[i] into
// hb, then prints the largest result and the index where it was found.
// NOTE(review): return codes of the CUDA calls are not checked in this
// version, so a failed allocation, copy or launch goes unnoticed.
int main() {
    // max_value[0] is the running maximum, max_value[1] the index it came
    // from. The index starts at 0 so the printf below never reads an
    // indeterminate value when hb[0] is already the maximum (for example
    // when every element of hb is equal).
    int max_value[2] = {0, 0};
    int *da, *db;
    cudaMalloc((void **)&da, N*sizeof(int));
    cudaMalloc((void **)&db, N*sizeof(int));
    for (int i = 0; i<N; ++i) {
        ha[i] = i;
    }
    cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
    // N blocks of one thread each: the kernel indexes by blockIdx.x only.
    add<<<N, 1>>>(da, db);
    cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);
    max_value[0] = hb[0];
    for (int i = 0; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    cudaFree(da);
    cudaFree(db);
    printf("Max number %d, from value:%d \n", max_value[0], max_value[1]);
    // Waits for a key press before the process exits.
    getchar();
    return 0;
}

现在我没有收到错误，但 hb 数组为空。
我的代码有什么问题？
如何将大数组分配给设备并获得有效结果？

UPDATE: I've inserted the code for error checking,
the error I'm getting is -> "Invalid configuration argument".
The updated code is:

#include <stdio.h>
#include <time.h>
#include <math.h>
#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>
#include <sstream>
const int N = 70000;

// Checks the result of a CUDA runtime call. The argument is evaluated exactly
// once and stored, so a failing call is not executed a second time while the
// message is being built. On failure it prints the error string with the
// file and line of the call site and terminates the process with exit code 1.
// Wrapped in do { } while (0) so that `checkCudaErrors(x);` is one statement.
#define checkCudaErrors(error) do {\
    cudaError_t checkCudaErrors_status_ = (error);\
    if (checkCudaErrors_status_ != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(checkCudaErrors_status_));\
        exit(1);\
    }\
} while (0)

// Stores twice the input element into the output array, addressing both by
// the thread block's x-index. Indices of N or above are ignored.
__global__
void add(int *a, int *b) {
    const int elem = blockIdx.x;
    if (elem >= N) {
        return;
    }
    const int doubled = 2 * a[elem];
    b[elem] = doubled;
}
// Host input/output buffers, kept in static storage at file scope.
static int ha[N];
static int hb[N];

// Host driver: reports the device count, fills ha with 0..N-1, runs the
// kernel to write 2*ha[i] into hb and scans hb for its maximum. Every CUDA
// runtime call is checked, and the kernel launch is checked immediately
// after it is issued.
int main() {
    // max_value[0] is the running maximum, max_value[1] the index it came
    // from; the index starts at 0 so it is defined even when hb[0] wins.
    int max_value[2] = {0, 0};

    int deviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    cudaError_t err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    printf("Device count: %d \n", deviceCount);

    for (int i = 0; i<N; ++i) { ha[i] = i; }
    int *da, *db;
    checkCudaErrors(cudaMalloc((void **)&da, N*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&db, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice));
    add<<<N, 1>>>(da, db);  // <--- Invalid configuration error
    // A kernel launch returns no status of its own; a rejected launch
    // configuration is only visible through cudaGetLastError(). Poll it here,
    // before any further work, so the failure is reported at its source.
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        getchar();
        exit(-1);
    }
    checkCudaErrors(cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost));
    max_value[0] = hb[0];
    for (int i = 0; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    // Release the device buffers now that the results are on the host.
    checkCudaErrors(cudaFree(da));
    checkCudaErrors(cudaFree(db));
    // Waits for a key press before the process exits.
    getchar();
    return 0;
}

The device is a GeForce GTX 470 and I'm compiling using
nvcc -o foo new.cu

Answer 1

您的设备 (GTX 470) 是 cc2.0 设备（计算能力）。

出现“无效的配置参数”错误，是因为对于 cc2.0 设备，一维网格的线程块数量上限为 65535。此信息可以在 programming guide（"Maximum x-dimension of a grid of thread blocks"）中找到，也可以通过运行 deviceQuery CUDA 示例代码获得。所以您这里选择的 N 太大了：

add<<<N, 1>>>(da, db);
      ^

对于 cc2.0 设备，通常的解决方法是创建一个多维的线程块网格，这样可以容纳更多的线程块。内核启动参数实际上可以是 dim3 类型的变量，它允许指定多维网格（由线程块组成）或多维线程块（由线程组成）。

要正确执行此操作，您还需要更改内核代码以根据可用的多维变量创建适当的全局唯一线程 ID。

以下可运行的示例给出了一组可能的最小改动来演示这个概念，在我这里可以正确运行：

$ cat t363.cu
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>
#include <sstream>
const int N = 70000;

// Checks the result of a CUDA runtime call. The argument is evaluated exactly
// once and stored, so a failing call is not executed a second time while the
// message is being built. On failure it prints the error string with the
// file and line of the call site and terminates the process with exit code 1.
// Wrapped in do { } while (0) so that `checkCudaErrors(x);` is one statement.
#define checkCudaErrors(error) do {\
    cudaError_t checkCudaErrors_status_ = (error);\
    if (checkCudaErrors_status_ != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(checkCudaErrors_status_));\
        exit(1);\
    }\
} while (0)

// Doubles one element per thread block on a 2D grid. The block coordinates
// (blockIdx.x, blockIdx.y) are flattened row by row, with gridDim.x blocks
// per row, into a single element index. Indices of N or above are skipped.
__global__
void add(int *a, int *b) {
    const int row_start = blockIdx.y * gridDim.x;
    const int idx = blockIdx.x + row_start;
    if (idx >= N) {
        return;
    }
    b[idx] = a[idx] * 2;
}
// Host input/output buffers, kept in static storage at file scope.
static int ha[N];
static int hb[N];

// Host driver: reports the device count, fills ha with 0..N-1, launches the
// kernel on a 2D grid of single-thread blocks to write 2*ha[i] into hb, then
// prints the largest result and the index where it was found.
int main() {
    // max_value[0] is the running maximum, max_value[1] the index it came
    // from; the index starts at 0 so it is defined even when hb[0] wins.
    int max_value[2] = {0, 0};

    int deviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    cudaError_t err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    printf("Device count: %d \n", deviceCount);

    for (int i = 0; i<N; ++i) { ha[i] = i; }
    int *da, *db;
    checkCudaErrors(cudaMalloc((void **)&da, N*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&db, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice));

    // Spread the N single-thread blocks over gridRows rows. The column count
    // is rounded up so that no element is dropped when N is not a multiple
    // of gridRows; the surplus blocks are rejected by the kernel's i < N
    // guard.
    const int gridRows = 10;
    const int gridCols = (N + gridRows - 1) / gridRows;
    dim3 mygrid(gridCols, gridRows);
    add<<<mygrid, 1>>>(da, db);
    // A kernel launch returns no status of its own; a rejected launch
    // configuration is only visible through cudaGetLastError(). Poll it here,
    // before any further work, so the failure is reported at its source.
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        getchar();
        exit(-1);
    }
    checkCudaErrors(cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost));
    max_value[0] = hb[0];
    for (int i = 0; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    printf("max_value[0] = %d, max_value[1] = %d\n", max_value[0], max_value[1]);
    // Release the device buffers now that the results are on the host.
    checkCudaErrors(cudaFree(da));
    checkCudaErrors(cudaFree(db));
    return 0;
}
$ nvcc -arch=sm_20 -o t363 t363.cu
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ ./t363
Device count: 4
max_value[0] = 139998, max_value[1] = 69999
$

备注：

如果您在 cc3.0 或更高计算能力的设备上运行原始代码，应该不会出现该错误。较新的 CUDA 设备把一维网格的上限提高到了 2^31-1。但如果您想超过这个线程块数量（约 20 亿），就仍然需要使用多维网格。

cc2.0 设备在 CUDA 8 中已弃用，即将发布的 CUDA 9 版本将不再支持它们。

大于 16 位的线程块的无效配置参数

Invalid configuration argument for thread block greater than 16bit

c++

parallel-processing

cuda

nvcc