双数组的 CUDA 最小缩减和索引

CUDA min reduction and index of double array

我正在尝试实施取自 的最小缩减算法。我不能为这个项目使用 thrust 或其他库,所以我必须坚持使用纯 CUDA。目标是将代码扩展到非常大的数组,根据我的经验,在我的机器上,推力对于我的目的来说太慢了。

下面是代码,它生成一个包含 4096 个元素的 double 数组,其中每个元素都等于其索引(即 [0,1,2,3,....,4096-1] ) 并且在给定索引处人为地添加了 -1 值(在本例中为 4091)。但是,代码似乎不起作用。我正在使用 cuda 11 和 Visual Studio 2017 使用 nvcc -w -arch=sm_50 input.cu -o output.exe

在 Windows 机器上为我的 GPU (CC=5.0) 编译它
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
//#include <unistd.h>
//#include <sys/time.h>

#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif

#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif

#ifndef MIN_IDX
#define MIN_IDX(x,y, idx_x, idx_y) ((x < y) ? idx_x : idx_y)
#endif

#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y)   __mul24(x,y)
#else
#define int_mult(x,y)   x*y
#endif
#define inf 0x7f800000

bool isPow2(unsigned int x)
{
    return ((x&(x-1))==0);
}

unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory {
    __device__ inline operator T *() {
        extern __shared__ int __smem[];
        return (T *) __smem;
    }

    __device__ inline operator const T *() const {
        extern __shared__ int __smem[];
        return (T *) __smem;
    }
};

// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double> {
    __device__ inline operator double *() {
        extern __shared__ double __smem_d[];
        return (double *) __smem_d;
    }

    __device__ inline operator const double *() const {
        extern __shared__ double __smem_d[];
        return (double *) __smem_d;
    }
};

template<class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduceMin6(T *g_idata, int *g_idxs, T *g_odata,int *g_oIdxs, unsigned int n) {
    T *sdata = SharedMemory<T>();
    int *sdataIdx = ((int *)sdata) + blockSize;

    //int *sdataIdx = SharedMemory<int>();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    T myMin = 99999;
    int myMinIdx = -1;
    // we reduce multiple elements per thread.  The number is determined by the
    // number of active thread blocks (via gridDim).  More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n) {
        myMinIdx  = MIN_IDX(g_idata[i], myMin, g_idxs[i], myMinIdx);
        myMin = MIN(g_idata[i], myMin);

        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n){
            //myMin += g_idata[i + blockSize];
            myMinIdx  = MIN_IDX(g_idata[i + blockSize], myMin, g_idxs[i + blockSize], myMinIdx);
            myMin = MIN(g_idata[i + blockSize], myMin);
        }

        i += gridSize;
    }

    // each thread puts its local sum into shared memory
    sdata[tid] = myMin;
    sdataIdx[tid] = myMinIdx;
    __syncthreads();
    // do reduction in shared mem
    if ((blockSize >= 2048) && (tid < 1024)) {
            //sdata[tid] = mySum = mySum + sdata[tid + 256];
        
            sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512], myMin, sdataIdx[tid + 512], myMinIdx);
            sdata[tid] = myMin = MIN(sdata[tid + 512], myMin);
        }
        
        __syncthreads();


        // do reduction in shared mem
    if ((blockSize >= 1024) && (tid < 512)) {
            //sdata[tid] = mySum = mySum + sdata[tid + 256];
    
            sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512], myMin, sdataIdx[tid + 512], myMinIdx);
            sdata[tid] = myMin = MIN(sdata[tid + 512], myMin);
        }
    
        __syncthreads();

    // do reduction in shared mem
    if ((blockSize >= 512) && (tid < 256)) {
        //sdata[tid] = mySum = mySum + sdata[tid + 256];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256], myMin, sdataIdx[tid + 256], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 256], myMin);
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        //sdata[tid] = myMin = myMin + sdata[tid + 128];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128], myMin, sdataIdx[tid + 128], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 128], myMin);
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        //sdata[tid] = myMin = myMin + sdata[tid + 64];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64], myMin, sdataIdx[tid + 64], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 64], myMin);
    }

    __syncthreads();

#if (__CUDA_ARCH__ >= 300 )
    if (tid < 32) {
        // Fetch final intermediate sum from 2nd warp
        if (blockSize >= 64){
        //myMin += sdata[tid + 32];
            myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
            myMin = MIN(sdata[tid + 32], myMin);
        }
        // Reduce final warp using shuffle
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            //myMin += __shfl_down(myMin, offset);
            int tempMyMinIdx = __shfl_down(myMinIdx, offset);
            double tempMyMin = __shfl_down(myMin, offset);

            myMinIdx = MIN_IDX(tempMyMin, myMin, tempMyMinIdx , myMinIdx);
            myMin = MIN(tempMyMin, myMin);
        }

    }
#else
    // fully unroll reduction within a single warp
    if ((blockSize >= 64) && (tid < 32))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 32];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 32], myMin);
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 16];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16], myMin, sdataIdx[tid + 16], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 16], myMin);
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 8];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8], myMin, sdataIdx[tid + 8], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 8], myMin);
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 4];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4], myMin, sdataIdx[tid + 4], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 4], myMin);
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 2];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2], myMin, sdataIdx[tid + 2], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 2], myMin);
    }

    __syncthreads();

    if ((blockSize >= 2) && ( tid < 1))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 1];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1], myMin, sdataIdx[tid + 1], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 1], myMin);
    }

    __syncthreads();
#endif

    __syncthreads();
    // write result for this block to global mem
    if (tid == 0){
        g_odata[blockIdx.x] = myMin;
        g_oIdxs[blockIdx.x] = myMinIdx;
    }
}

void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks,
        int maxThreads, int &blocks, int &threads) {

    //get device capability, to avoid block/grid size exceed the upper bound
    cudaDeviceProp prop;
    int device;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);

    if (whichKernel < 3) {
        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
        blocks = (n + threads - 1) / threads;
    } else {
        threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }

    if ((float) threads * blocks
            > (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
        printf("n is too large, please choose a smaller number!\n");
    }

    if (blocks > prop.maxGridSize[0]) {
        printf(
                "Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n",
                blocks, prop.maxGridSize[0], threads * 2, threads);

        blocks /= 2;
        threads *= 2;
    }

    if (whichKernel == 6) {
        blocks = MIN(maxBlocks, blocks);
    }
}

template<class T>
void reduceMin(int size, int threads, int blocks, int whichKernel, T *d_idata,
        T *d_odata,  int *idxs,  int *oIdxs) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize =(threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); // shared mem. for the amount of double i have
    smemSize += threads*sizeof(int); // shared memory for the indexes
    if (isPow2(size)) {
        switch (threads) {
        case 2048:
            reduceMin6<T, 2048, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;
        case 1024:
            reduceMin6<T, 1024, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;   
        case 512:
            reduceMin6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 256:
            reduceMin6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 128:
            reduceMin6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 64:
            reduceMin6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 32:
            reduceMin6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 16:
            reduceMin6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 8:
            reduceMin6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 4:
            reduceMin6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 2:
            reduceMin6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 1:
            reduceMin6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;
        }
    } else {
        switch (threads) {
        case 2048:
            reduceMin6<T, 2048, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break ;     
        case 1024:
            reduceMin6<T, 1024, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break ;   
        case 512:
            reduceMin6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 256:
            reduceMin6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 128:
            reduceMin6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 64:
            reduceMin6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 32:
            reduceMin6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 16:
            reduceMin6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 8:
            reduceMin6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 4:
            reduceMin6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 2:
            reduceMin6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 1:
            reduceMin6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;
        }
    }

}

template<class T>
void reduceMINCPU(T *data, int size, T *min, int *idx)
{
    *min = data[0];
    int min_idx = 0;
    T c = (T)0.0;

    for (int i = 1; i < size; i++)
    {
        T y = data[i];
        T t = MIN(*min, y);
        min_idx = MIN_IDX(*min, y, min_idx, i);
        (*min) = t;
    }

    *idx = min_idx;

    return;
}


// Instantiate the reduction function for 3 types
template void
reduceMin<int>(int size, int threads, int blocks, int whichKernel, int *d_idata,
        int *d_odata, int *idxs,  int *oIdxs);

template void
reduceMin<float>(int size, int threads, int blocks, int whichKernel, float *d_idata,
        float *d_odata, int *idxs,  int *oIdxs);

template void
reduceMin<double>(int size, int threads, int blocks, int whichKernel, double *d_idata,
        double *d_odata,  int *idxs,  int *oIdxs);

int main(){
    int num_els=8*8*8*8;

    int maxThreads = 128;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 1000000;

    double* d_in = NULL;
    double* d_out = NULL;
    int *d_idxs = NULL;
    int *d_oIdxs = NULL;

    printf("%d elements\n", num_els);
    printf("%d threads (max)\n", maxThreads);

    int numBlocks = 0;
    int numThreads = 0;
    getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks,
            numThreads);
    printf("%d numBlocks \n", numBlocks);
    printf("%d numThreads \n", numThreads);

    cudaMalloc((void **) &d_in, num_els * sizeof(double));
    cudaMalloc((void **) &d_idxs, num_els * sizeof(int));
    cudaMalloc((void **) &d_out, numBlocks * sizeof(double));
    cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int));
    printf("Finished cudaMalloc");

    double* in = (double*) malloc(num_els * sizeof(double));
    int *idxs = (int*) malloc(num_els * sizeof(int));
    double* out = (double*) malloc(numBlocks * sizeof(double));
    int* oIdxs = (int*) malloc(numBlocks * sizeof(int));

    for (int i = 0; i < num_els; i++) {
        in[i] = (double)i*1.0;
        idxs[i] = i;
    }
    in[num_els-5]=(double)-1.0;
    printf("Verify that the in array is correctly filled \n");
    for (int i = num_els-10; i < num_els; i++) {
   //
       printf("\n [%d] = %.4f", idxs[i], in[i]);
    }

    // copy data directly to device memory
    cudaMemcpy(d_in, in, num_els * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, numBlocks * sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice);
    printf(" \n Finished cudaMemcopy \n");

    reduceMin<double>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs);

    cudaMemcpy(out, d_out, numBlocks * sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);

    for(int i=0; i< numBlocks; i++){
    //for(int i=numBlocks-20; i< numBlocks; i++)
       printf("\n Reduce MIN GPU idx: %d  value: %f", oIdxs[i], out[i]);
    }
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_idxs);

    free(in);
    free(out);
    return 0;

}

但是,如果我转换用于最小化和查找 float 数组索引的代码,它可以正常工作。这是 float 版本的 main(这是唯一不同的部分,一些变量和数组被指定为“float”而不是“double”)。这个和前者一样编译。

int main(){
    int num_els=8*8*8*8;

    int maxThreads = 128;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 1000000;



    float* d_in = NULL;
    float* d_out = NULL;
    int *d_idxs = NULL;
    int *d_oIdxs = NULL;

    printf("%d elements\n", num_els);
    printf("%d threads (max)\n", maxThreads);

    int numBlocks = 0;
    int numThreads = 0;
    getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks,
            numThreads);
    printf("%d numBlocks \n", numBlocks);
    printf("%d numThreads \n", numThreads);

    cudaMalloc((void **) &d_in, num_els * sizeof(float));
    cudaMalloc((void **) &d_idxs, num_els * sizeof(int));
    cudaMalloc((void **) &d_out, numBlocks * sizeof(float));
    cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int));
    printf("Finished cudaMalloc");

    float* in = (float*) malloc(num_els * sizeof(float));
    int *idxs = (int*) malloc(num_els * sizeof(int));
    float* out = (float*) malloc(numBlocks * sizeof(float));
    int* oIdxs = (int*) malloc(numBlocks * sizeof(int));

    for (int i = 0; i < num_els; i++) {
        in[i] = (float)i*1.0;
        idxs[i] = i;
    }
    in[num_els-5]=(float)-1.0;
    printf("Verify that the in array is correctly filled \n");
    for (int i = num_els-10; i < num_els; i++) {
   //
       printf("\n [%d] = %.4f", idxs[i], in[i]);
    }

    // copy data directly to device memory
    cudaMemcpy(d_in, in, num_els * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, numBlocks * sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice);
    printf(" \n Finished cudaMemcopy \n");

    reduceMin<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs);

    cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);

    for(int i=0; i< numBlocks; i++){
    //for(int i=numBlocks-20; i< numBlocks; i++)
       printf("\n Reduce MIN GPU idx: %d  value: %f", oIdxs[i], out[i]);
    }
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_idxs);

    free(in);
    free(out);
    return 0;

}

问题在这里:

int *sdataIdx = ((int *)sdata) + blockSize;

这行代码正在设置一个指针,以便共享内存可以分为 2 个逻辑部分,一个用于保存要减少的数据以找到最小值(第一个),一个用于保存相应的索引(第二部分——这个指针实际指向的开始。

要正确处理不同的数据类型,应该是:

int *sdataIdx = (int *)(((T *)sdata) + blockSize);

由于sdata已经是T类型的指针,我们可以简化:

int *sdataIdx = (int *)(sdata + blockSize);

这会保留一块大小足以容纳正确数据类型(在本例中为双倍)的共享内存,然后是 int 数据类型的块。 float 起作用的原因是 floatint 具有相同的大小要求。

另请注意,这适用于 floatdoubleint 缩减类型。它不一定适用于其他类型。

此代码还会发出许多编译时警告。这些不应被忽视。但这不是问题的关键。

另请注意,如果您愿意,可以使用更少的代码行找到最小或最大加索引,例如