CUDA min reduction and index of double array
I am trying to implement a min reduction algorithm taken from . I cannot use thrust or other libraries for this project, so I have to stick with plain CUDA. The goal is to scale the code up to very large arrays; in my experience, on my machine thrust is too slow for my purposes.
Below is the code. It generates a double array of 4096 elements in which each element equals its index (i.e. [0, 1, 2, 3, ..., 4095]), with a -1 value artificially inserted at a given index (4091 in this case). However, the code does not seem to work. I am compiling it on a Windows machine for my GPU (CC = 5.0) with CUDA 11 and Visual Studio 2017, using nvcc -w -arch=sm_50 input.cu -o output.exe
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
//#include <unistd.h>
//#include <sys/time.h>
#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif
#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif
#ifndef MIN_IDX
#define MIN_IDX(x,y, idx_x, idx_y) ((x < y) ? idx_x : idx_y)
#endif
#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y) __mul24(x,y)
#else
#define int_mult(x,y) x*y
#endif
#define inf 0x7f800000
bool isPow2(unsigned int x)
{
return ((x&(x-1))==0);
}
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory {
__device__ inline operator T *() {
extern __shared__ int __smem[];
return (T *) __smem;
}
__device__ inline operator const T *() const {
extern __shared__ int __smem[];
return (T *) __smem;
}
};
// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double> {
__device__ inline operator double *() {
extern __shared__ double __smem_d[];
return (double *) __smem_d;
}
__device__ inline operator const double *() const {
extern __shared__ double __smem_d[];
return (double *) __smem_d;
}
};
template<class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduceMin6(T *g_idata, int *g_idxs, T *g_odata,int *g_oIdxs, unsigned int n) {
T *sdata = SharedMemory<T>();
int *sdataIdx = ((int *)sdata) + blockSize;
//int *sdataIdx = SharedMemory<int>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
T myMin = 99999;
int myMinIdx = -1;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n) {
myMinIdx = MIN_IDX(g_idata[i], myMin, g_idxs[i], myMinIdx);
myMin = MIN(g_idata[i], myMin);
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n){
//myMin += g_idata[i + blockSize];
myMinIdx = MIN_IDX(g_idata[i + blockSize], myMin, g_idxs[i + blockSize], myMinIdx);
myMin = MIN(g_idata[i + blockSize], myMin);
}
i += gridSize;
}
// each thread puts its local minimum (and its index) into shared memory
sdata[tid] = myMin;
sdataIdx[tid] = myMinIdx;
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 2048) && (tid < 1024)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512], myMin, sdataIdx[tid + 512], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 512], myMin);
}
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 1024) && (tid < 512)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512], myMin, sdataIdx[tid + 512], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 512], myMin);
}
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256], myMin, sdataIdx[tid + 256], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 256], myMin);
}
__syncthreads();
if ((blockSize >= 256) && (tid < 128)) {
//sdata[tid] = myMin = myMin + sdata[tid + 128];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128], myMin, sdataIdx[tid + 128], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 128], myMin);
}
__syncthreads();
if ((blockSize >= 128) && (tid < 64)) {
//sdata[tid] = myMin = myMin + sdata[tid + 64];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64], myMin, sdataIdx[tid + 64], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 64], myMin);
}
__syncthreads();
#if (__CUDA_ARCH__ >= 300 )
if (tid < 32) {
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64){
//myMin += sdata[tid + 32];
myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
myMin = MIN(sdata[tid + 32], myMin);
}
// Reduce final warp using shuffle
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//myMin += __shfl_down(myMin, offset);
int tempMyMinIdx = __shfl_down(myMinIdx, offset);
double tempMyMin = __shfl_down(myMin, offset);
myMinIdx = MIN_IDX(tempMyMin, myMin, tempMyMinIdx , myMinIdx);
myMin = MIN(tempMyMin, myMin);
}
}
#else
// fully unroll reduction within a single warp
if ((blockSize >= 64) && (tid < 32))
{
//sdata[tid] = myMin = myMin + sdata[tid + 32];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 32], myMin);
}
__syncthreads();
if ((blockSize >= 32) && (tid < 16))
{
//sdata[tid] = myMin = myMin + sdata[tid + 16];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16], myMin, sdataIdx[tid + 16], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 16], myMin);
}
__syncthreads();
if ((blockSize >= 16) && (tid < 8))
{
//sdata[tid] = myMin = myMin + sdata[tid + 8];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8], myMin, sdataIdx[tid + 8], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 8], myMin);
}
__syncthreads();
if ((blockSize >= 8) && (tid < 4))
{
//sdata[tid] = myMin = myMin + sdata[tid + 4];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4], myMin, sdataIdx[tid + 4], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 4], myMin);
}
__syncthreads();
if ((blockSize >= 4) && (tid < 2))
{
//sdata[tid] = myMin = myMin + sdata[tid + 2];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2], myMin, sdataIdx[tid + 2], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 2], myMin);
}
__syncthreads();
if ((blockSize >= 2) && ( tid < 1))
{
//sdata[tid] = myMin = myMin + sdata[tid + 1];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1], myMin, sdataIdx[tid + 1], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 1], myMin);
}
__syncthreads();
#endif
__syncthreads();
// write result for this block to global mem
if (tid == 0){
g_odata[blockIdx.x] = myMin;
g_oIdxs[blockIdx.x] = myMinIdx;
}
}
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks,
int maxThreads, int &blocks, int &threads) {
//get device capability, to avoid block/grid size exceed the upper bound
cudaDeviceProp prop;
int device;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop, device);
if (whichKernel < 3) {
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
blocks = (n + threads - 1) / threads;
} else {
threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
blocks = (n + (threads * 2 - 1)) / (threads * 2);
}
if ((float) threads * blocks
> (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
printf("n is too large, please choose a smaller number!\n");
}
if (blocks > prop.maxGridSize[0]) {
printf(
"Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n",
blocks, prop.maxGridSize[0], threads * 2, threads);
blocks /= 2;
threads *= 2;
}
if (whichKernel == 6) {
blocks = MIN(maxBlocks, blocks);
}
}
template<class T>
void reduceMin(int size, int threads, int blocks, int whichKernel, T *d_idata,
T *d_odata, int *idxs, int *oIdxs) {
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); // shared memory for the values
smemSize += threads * sizeof(int); // shared memory for the indexes
if (isPow2(size)) {
switch (threads) {
case 2048:
reduceMin6<T, 2048, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 1024:
reduceMin6<T, 1024, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 512:
reduceMin6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 256:
reduceMin6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 128:
reduceMin6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 64:
reduceMin6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 32:
reduceMin6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 16:
reduceMin6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 8:
reduceMin6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 4:
reduceMin6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 2:
reduceMin6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 1:
reduceMin6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
}
} else {
switch (threads) {
case 2048:
reduceMin6<T, 2048, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break ;
case 1024:
reduceMin6<T, 1024, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break ;
case 512:
reduceMin6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 256:
reduceMin6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 128:
reduceMin6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 64:
reduceMin6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 32:
reduceMin6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 16:
reduceMin6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 8:
reduceMin6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 4:
reduceMin6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 2:
reduceMin6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 1:
reduceMin6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
}
}
}
template<class T>
void reduceMINCPU(T *data, int size, T *min, int *idx)
{
*min = data[0];
int min_idx = 0;
T c = (T)0.0;
for (int i = 1; i < size; i++)
{
T y = data[i];
T t = MIN(*min, y);
min_idx = MIN_IDX(*min, y, min_idx, i);
(*min) = t;
}
*idx = min_idx;
return;
}
// Instantiate the reduction function for 3 types
template void
reduceMin<int>(int size, int threads, int blocks, int whichKernel, int *d_idata,
int *d_odata, int *idxs, int *oIdxs);
template void
reduceMin<float>(int size, int threads, int blocks, int whichKernel, float *d_idata,
float *d_odata, int *idxs, int *oIdxs);
template void
reduceMin<double>(int size, int threads, int blocks, int whichKernel, double *d_idata,
double *d_odata, int *idxs, int *oIdxs);
int main(){
int num_els=8*8*8*8;
int maxThreads = 128; // number of threads per block
int whichKernel = 6;
int maxBlocks = 1000000;
double* d_in = NULL;
double* d_out = NULL;
int *d_idxs = NULL;
int *d_oIdxs = NULL;
printf("%d elements\n", num_els);
printf("%d threads (max)\n", maxThreads);
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks,
numThreads);
printf("%d numBlocks \n", numBlocks);
printf("%d numThreads \n", numThreads);
cudaMalloc((void **) &d_in, num_els * sizeof(double));
cudaMalloc((void **) &d_idxs, num_els * sizeof(int));
cudaMalloc((void **) &d_out, numBlocks * sizeof(double));
cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int));
printf("Finished cudaMalloc");
double* in = (double*) malloc(num_els * sizeof(double));
int *idxs = (int*) malloc(num_els * sizeof(int));
double* out = (double*) malloc(numBlocks * sizeof(double));
int* oIdxs = (int*) malloc(numBlocks * sizeof(int));
for (int i = 0; i < num_els; i++) {
in[i] = (double)i*1.0;
idxs[i] = i;
}
in[num_els-5]=(double)-1.0;
printf("Verify that the in array is correctly filled \n");
for (int i = num_els-10; i < num_els; i++) {
//
printf("\n [%d] = %.4f", idxs[i], in[i]);
}
// copy data directly to device memory
cudaMemcpy(d_in, in, num_els * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_out, out, numBlocks * sizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice);
printf(" \n Finished cudaMemcopy \n");
reduceMin<double>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs);
cudaMemcpy(out, d_out, numBlocks * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);
for(int i=0; i< numBlocks; i++){
//for(int i=numBlocks-20; i< numBlocks; i++)
printf("\n Reduce MIN GPU idx: %d value: %f", oIdxs[i], out[i]);
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_idxs);
free(in);
free(out);
return 0;
}
However, if I convert the code to find the minimum and its index for a float array, it works correctly. Here is the float version of main (the only part that differs; some variables and arrays are declared as float instead of double). It is compiled the same way as above.
int main(){
int num_els=8*8*8*8;
int maxThreads = 128; // number of threads per block
int whichKernel = 6;
int maxBlocks = 1000000;
float* d_in = NULL;
float* d_out = NULL;
int *d_idxs = NULL;
int *d_oIdxs = NULL;
printf("%d elements\n", num_els);
printf("%d threads (max)\n", maxThreads);
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks,
numThreads);
printf("%d numBlocks \n", numBlocks);
printf("%d numThreads \n", numThreads);
cudaMalloc((void **) &d_in, num_els * sizeof(float));
cudaMalloc((void **) &d_idxs, num_els * sizeof(int));
cudaMalloc((void **) &d_out, numBlocks * sizeof(float));
cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int));
printf("Finished cudaMalloc");
float* in = (float*) malloc(num_els * sizeof(float));
int *idxs = (int*) malloc(num_els * sizeof(int));
float* out = (float*) malloc(numBlocks * sizeof(float));
int* oIdxs = (int*) malloc(numBlocks * sizeof(int));
for (int i = 0; i < num_els; i++) {
in[i] = (float)i*1.0;
idxs[i] = i;
}
in[num_els-5]=(float)-1.0;
printf("Verify that the in array is correctly filled \n");
for (int i = num_els-10; i < num_els; i++) {
//
printf("\n [%d] = %.4f", idxs[i], in[i]);
}
// copy data directly to device memory
cudaMemcpy(d_in, in, num_els * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_out, out, numBlocks * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice);
printf(" \n Finished cudaMemcopy \n");
reduceMin<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs);
cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);
for(int i=0; i< numBlocks; i++){
//for(int i=numBlocks-20; i< numBlocks; i++)
printf("\n Reduce MIN GPU idx: %d value: %f", oIdxs[i], out[i]);
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_idxs);
free(in);
free(out);
return 0;
}
The problem is here:
int *sdataIdx = ((int *)sdata) + blockSize;
This line sets up a pointer so that the shared memory can be split into two logical sections: one holding the data to be reduced to find the minimum (the first section), and one holding the corresponding indices (the second section, whose start this pointer is meant to mark).
To handle different data types correctly, it should be:
int *sdataIdx = (int *)(((T *)sdata) + blockSize);
Since sdata is already a pointer of type T, this can be simplified to:
int *sdataIdx = (int *)(sdata + blockSize);
This reserves a chunk of shared memory large enough for the actual data type (double, in this case), followed by a chunk for the int indices. Note that the host-side launch code already sizes dynamic shared memory as threads * sizeof(T) + threads * sizeof(int), which matches this corrected layout. The float version works only because float and int have the same size, so the buggy and correct pointer computations happen to coincide.
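To make the arithmetic concrete, here is a small host-only sketch (mine, not from the original post) that uses the question's configuration of 128 threads per block and prints where the index region starts under each pointer expression:

#include <stdio.h>

int main(void)
{
    const unsigned int blockSize = 128;          // threads per block, as in the question
    // Byte offset of the start of the index region under each expression:
    size_t buggy   = blockSize * sizeof(int);    // ((int *)sdata) + blockSize  -> 512
    size_t correct = blockSize * sizeof(double); // (int *)(sdata + blockSize)  -> 1024
    printf("buggy offset: %zu bytes, correct offset: %zu bytes\n", buggy, correct);
    // With double, the buggy pointer lands halfway into the value region, so
    // every sdataIdx[tid] store clobbers a stored minimum candidate; with
    // float, sizeof(float) == sizeof(int) and the two offsets coincide.
    return 0;
}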
Also note that this works for float, double, and int reduction types. It will not necessarily work for other types (for a type whose size is not a multiple of the alignment of int, for instance, the index region could end up misaligned).
This code also emits many compile-time warnings, which should not be ignored (the deprecated non-_sync __shfl_down calls are one likely source; the -w flag used above suppresses them). But that is not the crux of the problem.
Also note that, if you wish, a minimum or maximum plus its index can be found with far fewer lines of code, for example .
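The link above is elided; as one illustration of a much shorter route (my own sketch, not necessarily what that link shows), the value and index can be packed into a single 64-bit word so that a plain atomicMin performs the whole reduction. The bit flip below is the standard trick that makes unsigned ordering match IEEE-754 float ordering, including negative values such as the -1 in the test array:

// Map a float to an unsigned int whose unsigned ordering matches float
// ordering (set the sign bit for non-negatives, flip all bits for negatives).
__device__ unsigned int floatToOrderedUint(float f)
{
    unsigned int u = __float_as_uint(f);
    return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
}

__global__ void minIdxAtomic(const float *data, unsigned int n,
                             unsigned long long *result)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // High 32 bits: order-preserving value bits; low 32 bits: index.
        unsigned long long packed =
            ((unsigned long long)floatToOrderedUint(data[i]) << 32) | i;
        atomicMin(result, packed); // 64-bit atomicMin requires CC >= 3.5
    }
}

Initialize result to 0xFFFFFFFFFFFFFFFF before the launch; afterwards the low 32 bits hold the index of the minimum and the high 32 bits can be flipped back to recover the value. A double version would need a different packing (or an atomicCAS loop), since 64 value bits leave no room for the index.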