CUB reduction using 2D grid of blocks
I am trying to compute a sum using the CUB reduction method.

The big problem is: I am not sure how to return the value of each block to the host when using a 2D grid.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>

#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32

struct frame
{
    int natm;
    char title[100];
    float conf[nat][3];
};

using namespace std;
using namespace cub;

__global__
void add(frame* s, float L, float rc, float* blocksum)
{
    int i = blockDim.x*blockIdx.x + threadIdx.x;
    int j = blockDim.y*blockIdx.y + threadIdx.y;

    float E=0.0, rij, dx, dy, dz;

    // Your calculations first so that each thread holds its result
    dx = fabs(s->conf[j][0] - s->conf[i][0]);
    dy = fabs(s->conf[j][1] - s->conf[i][1]);
    dz = fabs(s->conf[j][2] - s->conf[i][2]);

    dx = dx - round(dx/L)*L;
    dy = dy - round(dy/L)*L;
    dz = dz - round(dz/L)*L;

    rij = sqrt(dx*dx + dy*dy + dz*dz);

    if ((rij <= rc) && (rij > 0.0))
        {E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}

    // E = 1.0;
    __syncthreads();

    // Block wise reduction so that one thread in each block holds sum of thread results
    typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float aggregate = BlockReduce(temp_storage).Sum(E);

    if (threadIdx.x == 0 && threadIdx.y == 0)
        blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}
int main(void)
{
    frame * state = (frame*)malloc(sizeof(frame));
    float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));

    state->natm = nat;   // initializing the number of atoms
    char name[] = "estado1";
    strcpy(state->title,name);

    for (int i = 0; i < nat; i++) {
        state->conf[i][0] = i;
        state->conf[i][1] = i;
        state->conf[i][2] = i;
    }

    frame * d_state;
    float *d_blocksum;

    cudaMalloc((void**)&d_state, sizeof(frame));
    cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));

    cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice);

    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 gridBlock(GRID_SIZE,GRID_SIZE);

    add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);

    cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);

    float Etotal = 0.0;
    for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
        Etotal += blocksum[k];
    }
    cout << endl << "energy: " << Etotal << endl;

    if (cudaSuccess != status)
    {
        cout << cudaGetErrorString(status) << endl;
    }

    // Free memory
    cudaFree(d_state);
    cudaFree(d_blocksum);

    return cudaThreadExit();
}
If the value of GRID_SIZE is the same as the value of BLOCK_SIZE, as written above, the calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong, which leads me to think the error is in this line of code:

blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;

The idea here is to return a 1D array containing the sum of each block.

I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I am looking at, and I intend to use values greater than 32 (always multiples of it).

I looked for some examples that use a 2D grid with CUB, but did not find any. I am new to CUDA programming, so maybe I am making a mistake.

edit: I have included the complete code. For comparison, when I compute these exact values with a serial program, it gives me energy: -297,121
The main problem is probably that your output indexing is incorrect. Here is a reduced version of your code that demonstrates correct results for an arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>

#define BLOCK_SIZE 32
#define GRID_SIZE 25

__global__
void add(float* blocksum)
{
    float E = 1.0;

    // Block wise reduction so that one thread in each block holds sum of thread results
    typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float aggregate = BlockReduce(temp_storage).Sum(E);
    __syncthreads();

    if (threadIdx.x == 0 && threadIdx.y == 0)
        blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}

int main(){

    float *d_result, *h_result;
    h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
    cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
    dim3 grid  = dim3(GRID_SIZE,GRID_SIZE);
    dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
    add<<<grid, block>>>(d_result);
    cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
    float result = 0;
    for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
    if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
    else printf("Success\n");
    return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code is in the output indexing:

blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;

We want a simulated 2D index into an array whose width and height are GRID_SIZE, holding one float quantity per point. The width of this array is therefore given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks, and that matches exactly how our result array is set up.
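To make that layout concrete, here is a tiny host-side sketch of how the same row-major mapping can be read back (the helper name sum_for_block is just for illustration, not part of the original code):

// Reads the per-block sum written by the kernel for block (bx, by).
// The device wrote blocksum[blockIdx.y*gridDim.x + blockIdx.x], and the grid
// is GRID_SIZE x GRID_SIZE blocks, so the row width of the array is GRID_SIZE.
float sum_for_block(const float* blocksum, int bx, int by)
{
    return blocksum[by * GRID_SIZE + bx];
}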
If GRID_SIZE and BLOCK_SIZE differ (for example, if GRID_SIZE is less than BLOCK_SIZE, then cuda-memcheck will show illegal accesses; if GRID_SIZE is larger than BLOCK_SIZE, then this indexing error causes blocks to overwrite each other's values in the output array) because of this mixing up of blockDim and gridDim.
Also note that float operations typically have only about 5 decimal digits of precision, so small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.
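As a rough sketch of what that double-precision check could look like, here is the simplified test program above with float swapped for double (the kernel name add_dbl is mine, and this variant is a sketch, not part of the original answer):

#include <cstdio>
#include <cstdlib>
#include <cub/cub.cuh>

#define BLOCK_SIZE 32
#define GRID_SIZE 25

// Same block-wise reduction as above, but accumulating in double precision.
__global__ void add_dbl(double* blocksum)
{
    double E = 1.0;
    typedef cub::BlockReduce<double, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    double aggregate = BlockReduce(temp_storage).Sum(E);
    if (threadIdx.x == 0 && threadIdx.y == 0)
        blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}

int main(){
    double *d_result, *h_result;
    h_result = (double *)malloc(GRID_SIZE*GRID_SIZE*sizeof(double));
    cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(double));
    add_dbl<<<dim3(GRID_SIZE,GRID_SIZE), dim3(BLOCK_SIZE,BLOCK_SIZE)>>>(d_result);
    cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(double), cudaMemcpyDeviceToHost);
    double total = 0.0;
    for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) total += h_result[i];  // host-side sum also in double
    printf("total: %f\n", total);
    free(h_result);
    cudaFree(d_result);
    return 0;
}

In your actual code the same swap would mean computing E, blocksum, and Etotal as double as well.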