如果网格维度太大,Cuda 不会修改输入
Cuda does not modify input if grid dimension is too large
考虑以下代码:
#include<iostream>
#include<vector>
#include <cuda.h>
#include <cuda_runtime_api.h>
using namespace std;
// Kernel: the first thread of every block stores 1 into A[blockIdx.x].
// All other threads in the block do nothing. A must hold at least
// gridDim.x bytes, since the kernel itself performs no bounds check.
__global__ void reduce_or(char* A) {
    // Guard clause: only thread 0 of each block performs the write.
    if (threadIdx.x != 0) {
        return;
    }
    A[blockIdx.x] = 1;
}
// Wraps a CUDA runtime call and forwards its cudaError_t result, together
// with the call site's file and line, to gpuAssert().
// The do { } while (0) wrapper makes the macro expand to a single statement,
// so `if (c) gpuErrchk(x); else ...` compiles and behaves as expected.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA runtime call.
//   code  - status returned by the CUDA call being checked
//   file  - source file of the call site (printed verbatim)
//   line  - source line of the call site
//   abort - when true (default), terminate the process with `code` as the
//           exit status after printing; when false, only print and return
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
    {
        return;
    }
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
    {
        exit(code);
    }
}
// Host driver for the repro: zero-fills a device buffer of group_size bytes,
// launches reduce_or with one block per byte (32 threads each), copies the
// buffer back and prints "<index> <value>" for every non-zero byte.
int main(int argc, char** argv) {
    const uint64_t group_size = 1 << 16; //1 << 15 would work
    char *dr = 0;
    std::vector<char> result(group_size, 0);

    gpuErrchk(cudaMalloc((void **)&dr, group_size));
    gpuErrchk(cudaMemcpy(dr, result.data(), group_size, cudaMemcpyHostToDevice));

    reduce_or<<<group_size, 32>>>(dr);
    // A kernel launch returns no status itself. Launch failures such as an
    // invalid grid configuration are reported through cudaGetLastError(),
    // so it has to be checked right after the launch; otherwise the kernel
    // silently never runs and the buffer stays untouched.
    gpuErrchk(cudaGetLastError());
    // Errors raised while the kernel executes surface at the next sync.
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(result.data(), dr, group_size, cudaMemcpyDeviceToHost));

    // Index type matches group_size to avoid a signed/unsigned comparison.
    for (uint64_t kk = 0; kk < group_size; ++kk) {
        if (result[kk]) {
            cout << std::dec << kk << std::hex << " " << (unsigned long) result[kk] << endl;
        }
    }

    // Release the device buffer allocated above.
    gpuErrchk(cudaFree(dr));
    return 0;
}
如果块数大于或等于 65536,输入数组就不会被修改,尽管 CUDA 示例程序 deviceQuery 的输出
显示网格 x 维度的最大尺寸远大于 65536:
Device 0: "Tesla K20Xm"
CUDA Driver Version / Runtime Version 6.5 / 6.5
CUDA Capability Major/Minor version number: 3.5
Total amount of global memory: 5760 MBytes (6039339008 bytes)
[...]
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
[...]
我是不是做错了什么,或者硬件是否谎报了它的功能?这是一个已知的错误?不应该抛出错误吗?
请检查编译器选项:要使用大于 65535 的网格 x 维度,必须把目标计算能力至少设置为 3.0,即使用 compute_30、sm_30 选项(例如 nvcc -gencode arch=compute_30,code=sm_30)。
参见 http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities 中的 Table 13。可以看到,对于计算能力 2.x 的设备:
Maximum x-dimension of a grid of thread blocks: 65535
考虑以下代码:
#include<iostream>
#include<vector>
#include <cuda.h>
#include <cuda_runtime_api.h>
using namespace std;
// Kernel: the first thread of every block stores 1 into A[blockIdx.x].
// All other threads in the block do nothing. A must hold at least
// gridDim.x bytes, since the kernel itself performs no bounds check.
__global__ void reduce_or(char* A) {
    // Guard clause: only thread 0 of each block performs the write.
    if (threadIdx.x != 0) {
        return;
    }
    A[blockIdx.x] = 1;
}
// Wraps a CUDA runtime call and forwards its cudaError_t result, together
// with the call site's file and line, to gpuAssert().
// The do { } while (0) wrapper makes the macro expand to a single statement,
// so `if (c) gpuErrchk(x); else ...` compiles and behaves as expected.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA runtime call.
//   code  - status returned by the CUDA call being checked
//   file  - source file of the call site (printed verbatim)
//   line  - source line of the call site
//   abort - when true (default), terminate the process with `code` as the
//           exit status after printing; when false, only print and return
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
    {
        return;
    }
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
    {
        exit(code);
    }
}
// Host driver for the repro: zero-fills a device buffer of group_size bytes,
// launches reduce_or with one block per byte (32 threads each), copies the
// buffer back and prints "<index> <value>" for every non-zero byte.
int main(int argc, char** argv) {
    const uint64_t group_size = 1 << 16; //1 << 15 would work
    char *dr = 0;
    std::vector<char> result(group_size, 0);

    gpuErrchk(cudaMalloc((void **)&dr, group_size));
    gpuErrchk(cudaMemcpy(dr, result.data(), group_size, cudaMemcpyHostToDevice));

    reduce_or<<<group_size, 32>>>(dr);
    // A kernel launch returns no status itself. Launch failures such as an
    // invalid grid configuration are reported through cudaGetLastError(),
    // so it has to be checked right after the launch; otherwise the kernel
    // silently never runs and the buffer stays untouched.
    gpuErrchk(cudaGetLastError());
    // Errors raised while the kernel executes surface at the next sync.
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(result.data(), dr, group_size, cudaMemcpyDeviceToHost));

    // Index type matches group_size to avoid a signed/unsigned comparison.
    for (uint64_t kk = 0; kk < group_size; ++kk) {
        if (result[kk]) {
            cout << std::dec << kk << std::hex << " " << (unsigned long) result[kk] << endl;
        }
    }

    // Release the device buffer allocated above.
    gpuErrchk(cudaFree(dr));
    return 0;
}
如果块数大于或等于 65536,输入数组就不会被修改,尽管 CUDA 示例程序 deviceQuery 的输出
显示网格 x 维度的最大尺寸远大于 65536:
Device 0: "Tesla K20Xm"
CUDA Driver Version / Runtime Version 6.5 / 6.5
CUDA Capability Major/Minor version number: 3.5
Total amount of global memory: 5760 MBytes (6039339008 bytes)
[...]
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
[...]
我是不是做错了什么,或者硬件是否谎报了它的功能?这是一个已知的错误?不应该抛出错误吗?
请检查编译器选项:要使用大于 65535 的网格 x 维度,必须把目标计算能力至少设置为 3.0,即使用 compute_30、sm_30 选项(例如 nvcc -gencode arch=compute_30,code=sm_30)。
参见 http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities 中的 Table 13。可以看到,对于计算能力 2.x 的设备:
Maximum x-dimension of a grid of thread blocks: 65535