gpugems3 中的前缀扫描 CUDA 示例代码是否正确?
Is prefix scan CUDA sample code in gpugems3 correct?
我在GPU Gems 3, Chapter 39: Parallel Prefix Sum (Scan) with CUDA.
一书中写了一段调用内核的代码
但是我得到的结果是一堆负数而不是前缀扫描。
是我的内核调用错误还是 GPU Gems 3 书中的代码有问题?
这是我的代码:
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
__global__ void kernel(int *g_odata, int *g_idata, int n, int dim)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += g_idata[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
void Initialize(int *h_in,int num_items)
{
int j;
for(j=0;j<num_items;j++)
h_in[j]=j;
printf(" input: ");
printf("\n\n");
}
int main(int argc, char** argv)
{
int num_items = 512;
int* h_in = new int[num_items];
// Initialize problem
Initialize(h_in, num_items);
int *d_in = NULL;
cudaMalloc((void**)&d_in, sizeof(int) * num_items);
if(cudaSuccess != cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");
// Allocate device output array
int *d_out = NULL;
cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));
kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);
int* h_out= new int[num_items+1];
if(cudaSuccess != cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
int i;
printf(" \n");
for(i=0;i<num_items;i++)
printf(" ,%d ",h_out[i]);
// Cleanup
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
printf("\n\n");
return 0;
}
看来您在将 GPU Gems 3 chapter 中的代码转录到您的内核时至少犯了 1 个错误。此行不正确:
temp[bi] += g_idata[ai];
应该是:
temp[bi] += temp[ai];
当我对您现在发布的代码进行这一更改时,它似乎为我打印出正确的(独占扫描)前缀总和。还有一些其他的事情我想提一下:
即使没有那个改变,我也会得到一些接近正确的结果。因此,如果您得到的结果差异很大(例如负数),则您的机器设置或 CUDA 安装可能有问题。我建议使用比现在更严格的 cuda error checking(尽管机器设置问题应该在您的一项检查中指出。)
制作的例程会有一些限制。它只能在单个线程块中使用,它会在共享内存访问上发生内存冲突,并且数据集大小将被限制为单个线程块可以处理的大小(该例程每个线程产生两个输出元素,因此数据集大小预计等于线程数的两倍)。正如已经介绍过的,动态共享内存分配需要与数据集大小一样大(即线程大小的两倍,元素数量)。
这可能对学习有用,但如果你想要一个健壮、快速的前缀扫描,建议你使用来自 thrust or cub 的例程而不是你自己的代码,即使是派生的来自这篇(旧)文章。
以下代码与您的代码类似,但已解决上述问题,并且我已将内核模板化以用于各种数据类型:
#include <stdio.h>
#define DSIZE 512
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef int mytype;
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
extern __shared__ T temp[]; // allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
T t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
int main(){
mytype *h_i, *d_i, *h_o, *d_o;
int dszp = (DSIZE)*sizeof(mytype);
h_i = (mytype *)malloc(dszp);
h_o = (mytype *)malloc(dszp);
if ((h_i == NULL) || (h_o == NULL)) {printf("malloc fail\n"); return 1;}
cudaMalloc(&d_i, dszp);
cudaMalloc(&d_o, dszp);
cudaCheckErrors("cudaMalloc fail");
for (int i = 0 ; i < DSIZE; i++){
h_i[i] = i;
h_o[i] = 0;}
cudaMemset(d_o, 0, dszp);
cudaCheckErrors("cudaMemset fail");
cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy 1 fail");
prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2 fail");
mytype psum = 0;
for (int i =1; i < DSIZE; i++){
psum += h_i[i-1];
if (psum != h_o[i]) {printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum); return 1;}
}
return 0;
}
我在GPU Gems 3, Chapter 39: Parallel Prefix Sum (Scan) with CUDA.
一书中写了一段调用内核的代码但是我得到的结果是一堆负数而不是前缀扫描。
是我的内核调用错误还是 GPU Gems 3 书中的代码有问题?
这是我的代码:
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
__global__ void kernel(int *g_odata, int *g_idata, int n, int dim)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += g_idata[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
void Initialize(int *h_in,int num_items)
{
int j;
for(j=0;j<num_items;j++)
h_in[j]=j;
printf(" input: ");
printf("\n\n");
}
int main(int argc, char** argv)
{
int num_items = 512;
int* h_in = new int[num_items];
// Initialize problem
Initialize(h_in, num_items);
int *d_in = NULL;
cudaMalloc((void**)&d_in, sizeof(int) * num_items);
if(cudaSuccess != cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");
// Allocate device output array
int *d_out = NULL;
cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));
kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);
int* h_out= new int[num_items+1];
if(cudaSuccess != cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
int i;
printf(" \n");
for(i=0;i<num_items;i++)
printf(" ,%d ",h_out[i]);
// Cleanup
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
printf("\n\n");
return 0;
}
看来您在将 GPU Gems 3 chapter 中的代码转录到您的内核时至少犯了 1 个错误。此行不正确:
temp[bi] += g_idata[ai];
应该是:
temp[bi] += temp[ai];
当我对您现在发布的代码进行这一更改时,它似乎为我打印出正确的(独占扫描)前缀总和。还有一些其他的事情我想提一下:
即使没有那个改变,我也会得到一些接近正确的结果。因此,如果您得到的结果差异很大(例如负数),则您的机器设置或 CUDA 安装可能有问题。我建议使用比现在更严格的 cuda error checking(尽管机器设置问题应该在您的一项检查中指出。)
制作的例程会有一些限制。它只能在单个线程块中使用,它会在共享内存访问上发生内存冲突,并且数据集大小将被限制为单个线程块可以处理的大小(该例程每个线程产生两个输出元素,因此数据集大小预计等于线程数的两倍)。正如已经介绍过的,动态共享内存分配需要与数据集大小一样大(即线程大小的两倍,元素数量)。
这可能对学习有用,但如果你想要一个健壮、快速的前缀扫描,建议你使用来自 thrust or cub 的例程而不是你自己的代码,即使是派生的来自这篇(旧)文章。
以下代码与您的代码类似,但已解决上述问题,并且我已将内核模板化以用于各种数据类型:
#include <stdio.h>
#define DSIZE 512
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef int mytype;
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
extern __shared__ T temp[]; // allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
T t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
int main(){
mytype *h_i, *d_i, *h_o, *d_o;
int dszp = (DSIZE)*sizeof(mytype);
h_i = (mytype *)malloc(dszp);
h_o = (mytype *)malloc(dszp);
if ((h_i == NULL) || (h_o == NULL)) {printf("malloc fail\n"); return 1;}
cudaMalloc(&d_i, dszp);
cudaMalloc(&d_o, dszp);
cudaCheckErrors("cudaMalloc fail");
for (int i = 0 ; i < DSIZE; i++){
h_i[i] = i;
h_o[i] = 0;}
cudaMemset(d_o, 0, dszp);
cudaCheckErrors("cudaMemset fail");
cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy 1 fail");
prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2 fail");
mytype psum = 0;
for (int i =1; i < DSIZE; i++){
psum += h_i[i-1];
if (psum != h_o[i]) {printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum); return 1;}
}
return 0;
}