CUDA

Question

我是 CUDA 编程的新手，我正在尝试编写一个 CUDA 内核，用于仅在 3 维张量的 1 个维度上进行并行缩减，该 3 维张量是一个馈入内核的行主展平浮点数组。

换句话说，我正在尝试用 axis=0、axis=1 和 axis=2 的有限轴组合重写 numpy.sum。

我已经成功实施了 "reduce over axis 0" 和 "reduce over axis 1" 但是 "reduce over axis2" 的性能问题让我 post 在这里提出问题寻求建议。

内核以一维网格和一维块配置启动，并将每个线程映射到缩减输出张量的每个元素。所以，它应该是这样的： Link to image

这是我的内核：

__global__ void kernel_reduce_sum_3d_try02(
        float* g_idata,
        float* g_odata,
        int dim0,
        int dim1,
        int dim2,
        int overaxis0,
        int overaxis1,
        int overaxis2)
{

    if (overaxis0 == 0 && overaxis1 == 0 && overaxis2 == 1) { 
        // static shared memory
        __shared__ float smem_store[BLOCK_SIZE];

        // set thread ID
        //unsigned int tid = threadIdx.x;
        unsigned int tid =  threadIdx.x;

        // global index
        unsigned int idx = (blockIdx.x * blockDim.x + threadIdx.x);

        // unrolling
        float tmpSum0 = 0;
        unsigned int i = 0;
        unsigned int src_index ;
        unsigned int _limit = (unsigned int)(dim0 * dim1 * dim2);

        //Indices over output 
        int thrd_d0 = (idx) / (dim1*1);
        int thrd_d1 = (idx - thrd_d0*dim1);

        //Only for debugging 
        printf("tid: %03d \tidx: %03d\td0: %02d\td1: %02d\n",tid,idx,thrd_d0,thrd_d1);



        for (i = 0; i < dim2; i++) {
            src_index = thrd_d0*dim1*dim2 + thrd_d1 * dim2 + i;
            if(idx<15) printf("idx: %d : src_index: %d\n",idx,src_index);
            if(src_index < _limit)
                tmpSum0 += g_idata[src_index];
        }


        if (src_index + 0 < _limit) smem_store[tid + 0] = tmpSum0;
        __syncthreads();

        unsigned int oindx = (unsigned int)( thrd_d0*dim1 + thrd_d1 );
        if (src_index + 0 <= _limit) g_odata[oindx + 0] = smem_store[tid + 0];
    }
}

问题1.有没有更好的分配线程和块来计算输出张量的方法？

问题2.在我的内核中如何增加占用率？（大约 50%）

问题3.全局内存读操作应该如何使用共享内存？（在大'dim2'的情况下，每个块都应该分配巨大的共享内存缓冲区，这不利于并发执行和占用）

Answer 1

任何 CUDA 程序员的前两个优化 objective 是：

暴露足够的并行性（大致：能够使用大量线程）
有效利用内存子系统

对于全局内存，objective2表示我们在读写全局内存时应该争取合并访问。合并访问的一个示例是 warp 中的相邻线程正在读取内存中的相邻位置。

你的内核有这个功能吗？它不是。这是一个简单的测试用例：

$ cat t1.cu
#include <stdio.h>
#define BLOCK_SIZE 256

__global__ void kernel_reduce_sum_3d_try02(
        float* g_idata,
        float* g_odata,
        int dim0,
        int dim1,
        int dim2,
        int overaxis0,
        int overaxis1,
        int overaxis2)
{

    if (overaxis0 == 0 && overaxis1 == 0 && overaxis2 == 1) {
        // static shared memory
        __shared__ float smem_store[BLOCK_SIZE];

        // set thread ID
        //unsigned int tid = threadIdx.x;
        unsigned int tid =  threadIdx.x;

        // global index
        unsigned int idx = (blockIdx.x * blockDim.x + threadIdx.x);

        // unrolling
        float tmpSum0 = 0;
        unsigned int i = 0;
        unsigned int src_index ;
        unsigned int _limit = (unsigned int)(dim0 * dim1 * dim2);

        //Indices over output
        int thrd_d0 = (idx) / (dim1*1);
        int thrd_d1 = (idx - thrd_d0*dim1);

        //Only for debugging
        //printf("tid: %03d \tidx: %03d\td0: %02d\td1: %02d\n",tid,idx,thrd_d0,thrd_d1);



        for (i = 0; i < dim2; i++) {
            src_index = thrd_d0*dim1*dim2 + thrd_d1 * dim2 + i;
            //if(idx<15) printf("idx: %d : src_index: %d\n",idx,src_index);
            if(src_index < _limit){
                tmpSum0 += g_idata[src_index];
                if ((blockIdx.x == 0) && (i == 0) && (threadIdx.x < 32)) printf("thread: %d, src_index: %u\n", threadIdx.x, src_index);
        }


        if (src_index + 0 < _limit) smem_store[tid + 0] = tmpSum0;
        __syncthreads();

        unsigned int oindx = (unsigned int)( thrd_d0*dim1 + thrd_d1 );
        if (src_index + 0 <= _limit) g_odata[oindx + 0] = smem_store[tid + 0];
    }
  }
}


int main(){



  size_t dim = 256;
  size_t sz = dim*dim*dim*sizeof(float);
  float *g_idata, *g_odata;
  cudaMalloc(&g_idata, sz);
  cudaMalloc(&g_odata, sz);
  kernel_reduce_sum_3d_try02<<<dim/BLOCK_SIZE, BLOCK_SIZE>>>(
        g_idata,
        g_odata,
        dim,
        dim,
        dim,
        0,
        0,
        1);
  cudaDeviceSynchronize();
}
$ nvcc -o t1 t1.cu
$ cuda-memcheck ./t1
========= CUDA-MEMCHECK
thread: 0, src_index: 0
thread: 1, src_index: 256
thread: 2, src_index: 512
thread: 3, src_index: 768
thread: 4, src_index: 1024
thread: 5, src_index: 1280
thread: 6, src_index: 1536
thread: 7, src_index: 1792
thread: 8, src_index: 2048
thread: 9, src_index: 2304
thread: 10, src_index: 2560
thread: 11, src_index: 2816
thread: 12, src_index: 3072
thread: 13, src_index: 3328
thread: 14, src_index: 3584
thread: 15, src_index: 3840
thread: 16, src_index: 4096
thread: 17, src_index: 4352
thread: 18, src_index: 4608
thread: 19, src_index: 4864
thread: 20, src_index: 5120
thread: 21, src_index: 5376
thread: 22, src_index: 5632
thread: 23, src_index: 5888
thread: 24, src_index: 6144
thread: 25, src_index: 6400
thread: 26, src_index: 6656
thread: 27, src_index: 6912
thread: 28, src_index: 7168
thread: 29, src_index: 7424
thread: 30, src_index: 7680
thread: 31, src_index: 7936
========= ERROR SUMMARY: 0 errors
$

我们看到在一个特定的访问中，warp 中的线程正在从索引距离为 256 的位置读取（理想情况下我们希望这个 "distance" 在我们从线程开始时为 1线程化，以便 "coalesced" 访问全局内存）。

这可能吗？对于 3 个总和方向中的每一个都是可能的，尽管在每种情况下需要应用一些不同的 methodologies/realizations。我们会回到这个。

我们还想暴露足够的并行度，大致可以转化为"be able to launch kernels with ~10,000 or more threads"。此处的行为会因 GPU 架构和其他因素而异，但这是一般理解的合理起点。一个总共有 256 个线程（例如）的内核不足以使任何当前的 CUDA GPU 饱和。

鉴于此求和函数适用于 3 维数据，让我们从每维约 256 个元素的 3 维数组开始。如果我们选择只创建 256 个线程，并按它们的方式处理数组，那么对于 objective 1 来说太小了。所以让我们考虑一下可以处理 256x256（或更多）线程的实现。

情况一：X方向求和

我将采用C风格存储，因此X方向将是内存中线性存储的方向。因此，当我们在一行中从一个元素前进到另一个元素时，我们将内存存储索引增加 1。因此，沿着这些 "rows" 求和，将需要相邻线程读取属于特定总和的相邻元素。因此，为了让相邻的线程这样做，然后协同工作以形成总和，我们将使用 classical parallel reduction method。我们将选择每个块 256 个线程的网格（每个块合作形成一个总和）和每个总和一个块（在这种情况下总共有 65536 个块，所以总共有 65536*256 个线程，满足我们的第一个 objective）和每个256 个线程块将负责计算一行总和。为了促进这一点，我们将选择等于数组的 Y 维（在我们的例子中为 256）乘以数组中的 Z 维（在我们的示例中为 256）的块数，并且 256 个线程是块。每个块将负责计算一个总和。这将是唯一使用或需要共享内存的情况。

情况二：Y方向求和

我们可以将其称为 "summing columns in the Y direction"。为了满足我们的第二个 objective，我们要求一个 warp 中的每个线程读取一个相邻的元素，但是内存中的相邻元素现在属于单独的 Y 列总和。在这种情况下，如果我们能让足够多的线程保持忙碌，一个有效的实现是在每个线程中计算一个单独的总和。每个线程向下遍历一个 Y 列，并在一个循环中保留一个运行和。对于我们的示例情况，单个 Z-sheet（X-Y 平面中的一个切片）总共只需要 256 个线程来计算，但我们可以通过处理所有 256 个 sheet 来增加线程数（在我们的例子中）同时。所以我们可以使用 256x256 = 65536 个线程，满足我们的第一个 objective.

情况3：Z方向求和（你问题中的例子）

这种情况只是情况 2 的一个小变体。再次，为了满足 objective 2，我们希望 warp 中的相邻线程读取内存中的相邻位置。同样，这些属于单独的金额。但是，现在我们希望线程在 Z 方向遍历列，而不是在 Y 方向遍历列。因此索引会略有不同，但整体实现看起来与案例 2 非常相似。我们还将使用 256x256 线程的网格，其中每个线程保持单独的运行和。然而，每个运行总和的起点将是第一个 "sheet"，然后在 Z 方向遍历下一个 sheet 和下一个 sheet，求和 "Z-columns", 每个线程一个。

下面是一个演示几个案例的例子：

$ cat t2.cu
#include <stdio.h>
#include <assert.h>

// modifiable
typedef float mytype;  // consider possibility of overflow e.g. for char types
const size_t ddimX = 32768;
const size_t ddimY = 100;
const size_t ddimZ = 100;

//don't modify
const int nTPB=256;

template <typename T>
__inline__ __device__
T warpReduceSum(T val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xFFFFFFFF, val, offset); // requires CUDA 9+
  return val;
}


template <typename T>
__inline__ __device__
T blockReduceSum(T val) {

  static __shared__ T shared[32]; // Shared mem for 32 partial sums
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;
  val = warpReduceSum(val);     // Each warp performs partial reduction
  if (lane==0) shared[wid]=val; // Write reduced value to shared memory
  __syncthreads();              // Wait for all partial reductions

                      //read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

  if (wid==0) val = warpReduceSum(val); //Final reduce within first warp
  return val;
}

template <typename T>
__global__ void kernel_reduce_sum_3d(
        const T * __restrict__  g_idata,
        T * __restrict__  g_odata,
        const size_t dim0,
        const size_t dim1,
        const size_t dim2,
        const bool overaxis0,
        const bool overaxis1,
        const bool overaxis2)
{
  if (overaxis0){
    size_t bidx = blockIdx.x;
    size_t tidx = threadIdx.x;
    size_t limit;
    size_t base;
    T res = 0;
    if (!overaxis1 && !overaxis2){
    // Case 1 - sums in X-direction
    // each threadblock is responsible for a separate row sum
      limit = dim0;
      base = bidx*dim0;
      while (tidx < limit){
        res += g_idata[base+tidx];
        tidx += blockDim.x;}} // block-stride loop
    else if (overaxis1 && !overaxis2){
    // Case 4 - sums in X-Y plane
    // each threadblock will be responsible for an X-Y plane
      limit = dim0*dim1;
      base = bidx*dim0*dim1;
      while (tidx < limit){
        res += g_idata[base+tidx];
        tidx += blockDim.x;}} // block-stride loop
    else if (!overaxis1 && overaxis2){
    // Case 5 - sums in X-Z plane
    // each threadblock will be responsible for an X-Z plane
      for (int i = 0; i < dim2; i++){
      tidx = threadIdx.x;
      limit = dim0;
      base = (bidx*dim0)+(i*dim0*dim1);
      while (tidx < limit){
        res += g_idata[base+tidx];
        tidx += blockDim.x;}}} // block-stride loop
    else assert(0); // not implemented! - the remaining case here is all 3 axes selected
#ifndef USE_WS
    __shared__ T sm[nTPB];
    sm[threadIdx.x] = res;
    __syncthreads();
    // parallel reduction
    for (int i = blockDim.x>>1; i > warpSize; i>>=1){
      if (threadIdx.x < i) sm[threadIdx.x] += sm[threadIdx.x + i];
      __syncthreads();}
    for (int i = (blockDim.x == warpSize)?warpSize>>1:warpSize; i > 0; i>>=1){
      if (threadIdx.x < i) sm[threadIdx.x] += sm[threadIdx.x + i];
      if (threadIdx.x < warpSize) __syncwarp();}
    if (!threadIdx.x) g_odata[bidx] = sm[0];
#else
    res = blockReduceSum(res);
    if (!threadIdx.x) g_odata[bidx] = res;
#endif
    }
  else if (!overaxis0 && overaxis1 && !overaxis2){
    // Case 2 - sums in Y-direction
    // each thread is responsible for a separate Y-column sum
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (dim0*dim2)){
      size_t tidx = idx%dim0 + (idx/dim0)*(dim0*dim1);
      T tsum = 0;
      for (size_t i = 0; i < dim1; i++){
        tsum += g_idata[tidx];
        tidx += dim0;}
      g_odata[idx] = tsum;
      }
    }
  else if (!overaxis0 && overaxis1 && overaxis2){
    // Case 6 - sums in Y-Z plane
    // each thread is responsible for a separate Y-Z plane sum (probably not optimal)
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (dim0)){
      size_t tidx = idx;
      T tsum = 0;
      for (size_t i = 0; i < dim1*dim2; i++){
        tsum += g_idata[tidx];
        tidx += dim0;}
      g_odata[idx] = tsum;
      }
    }
  else if (!overaxis0 && !overaxis1 && overaxis2){
    // Case 3 - sums in Z-direction
    // each thread is responsible for a separate Z-column sum
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (dim0*dim1)){
      size_t tidx = idx;
      T tsum = 0;
      for (size_t i = 0; i < dim2; i++){
        tsum += g_idata[tidx];
        tidx += dim0*dim1;}
      g_odata[idx] = tsum;
      }
    }
  else assert(0); // not implemented! - the remaining case here is no axes selected
}

template <typename T>
bool validate(T *data, size_t dim, T val){
  for (size_t i = 0; i < dim; i++)
    if (data[i] != val) {printf("mismatch at %lu, was: %f, should be: %f\n", i, (float)(data[i]), (float)val); return false;}
  return true;
}

size_t choose_block_size(size_t val){
  if (val >= nTPB) return nTPB;
  if (val <= 32) return 32;
  val = (val >> 1) | val;
  val = (val >> 2) | val;
  val = (val >> 4) | val;
  val = (val >> 8) | val;
  val = (val >> 16) | val;
  val++;
  return val;
}

int main(int argc, char *argv[]){
  size_t dimX = ddimX;
  size_t dimY = ddimY;
  size_t dimZ = ddimZ;
  if (argc > 1) {
    size_t nx = atoi(argv[1]);
    dimX = nx;
    dimY = nx-1;
    dimZ = nx-2;}
  // setup input array of all 1
  const size_t sz = dimX*dimY*dimZ*sizeof(mytype);
  mytype *d_in, *d_out, *h_in, *h_out;
  size_t rsz;
  h_in = new mytype[dimX*dimY*dimZ];
  assert(h_in != NULL);
  cudaError_t err = cudaMalloc(&d_in, sz);
  for (size_t i = 0; i < dimX*dimY*dimZ; i++) h_in[i] = 1;
  if (err != cudaSuccess) {printf("cudaMalloc1 error: %s\n", cudaGetErrorString(err)); return -1;}
  for (size_t i = 0; i < dimX*dimY*dimZ; i++) h_in[i] = 1;
  err = cudaMemcpy(d_in, h_in, sz, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {printf("cudaMemcpy1 error: %s\n", cudaGetErrorString(err)); return -1;}
  // Test X-direction sums
  printf("testing X-direction sums\n");
  rsz = dimY*dimZ*sizeof(mytype);
  h_out=new mytype[dimY*dimZ];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc2 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset1 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<dimY*dimZ, choose_block_size(dimX)>>>(d_in, d_out, dimX, dimY, dimZ, true, false, false);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result1 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimY*dimZ, (mytype)dimX))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test Y-direction sums
  printf("testing Y-direction sums\n");
  rsz = dimX*dimZ*sizeof(mytype);
  h_out=new mytype[dimX*dimZ];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc3 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset2 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<((dimX*dimZ)+(nTPB-1))/nTPB, nTPB>>>(d_in, d_out, dimX, dimY, dimZ, false, true, false);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result2 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimX*dimZ, (mytype)dimY))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test Z-direction sums
  printf("testing Z-direction sums\n");
  rsz = dimX*dimY*sizeof(mytype);
  h_out=new mytype[dimX*dimY];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc4 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset3 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<((dimX*dimY)+(nTPB-1))/nTPB, nTPB>>>(d_in, d_out, dimX, dimY, dimZ, false, false, true);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result3 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimX*dimY, (mytype)dimZ))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test X-Y plane sums
  printf("testing X-Y plane sums\n");
  rsz = dimZ*sizeof(mytype);
  h_out=new mytype[dimZ];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc5 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset4 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<dimZ, nTPB>>>(d_in, d_out, dimX, dimY, dimZ, true, true, false);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result4 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimZ, (mytype)(dimX*dimY)))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test X-Z plane sums
  printf("testing X-Z plane sums\n");
  rsz = dimY*sizeof(mytype);
  h_out=new mytype[dimY];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc6 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset5 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<dimY, choose_block_size(dimX)>>>(d_in, d_out, dimX, dimY, dimZ, true, false, true);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result5 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimY, (mytype)(dimX*dimZ)))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test Y-Z plane sums
  printf("testing Y-Z plane sums\n");
  rsz = dimX*sizeof(mytype);
  h_out=new mytype[dimX];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc7 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset6 error: %s\n", cudaGetErrorString(err)); return -1;}
  size_t tpb = choose_block_size(dimX);
  kernel_reduce_sum_3d<<<(dimX+tpb-1)/tpb, tpb>>>(d_in, d_out, dimX, dimY, dimZ, false, true, true);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result6 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimX, (mytype)(dimY*dimZ)))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  cudaFree(d_in);
  err=cudaGetLastError();
  if (err != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(err)); return -1;}
  printf("success!\n");
  delete[] h_in;
  return 0;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
testing X-direction sums
testing Y-direction sums
testing Z-direction sums
testing X-Y plane sums
testing X-Z plane sums
testing Y-Z plane sums
success!
========= ERROR SUMMARY: 0 errors
$ nvprof --print-gpu-trace ./t2
==11084== NVPROF is profiling process 11084, command: ./t2
testing X-direction sums
testing Y-direction sums
testing Z-direction sums
testing X-Y plane sums
testing X-Z plane sums
testing Y-Z plane sums
success!
==11084== Profiling application: ./t2
==11084== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
2.32676s  478.32ms                    -               -         -         -         -  1.2207GB  2.5521GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
2.80537s  5.2480us                    -               -         -         -         -  39.063KB  7.0985GB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.80596s  39.427ms          (10000 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [109]
2.84539s  7.2320us                    -               -         -         -         -  39.063KB  5.1511GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
2.84586s  1.2160us                    -               -         -         -         -  12.500MB   1e+04GB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.84619s  22.838ms          (12800 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [114]
2.86904s  5.8913ms                    -               -         -         -         -  12.500MB  2.0721GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
2.88707s  1.1840us                    -               -         -         -         -  12.500MB   1e+04GB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.88740s  23.046ms          (12800 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [119]
2.91046s  5.5715ms                    -               -         -         -         -  12.500MB  2.1910GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
2.92758s  2.6240us                    -               -         -         -         -      400B  145.38MB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.92762s  40.990ms            (100 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [124]
2.96861s  1.5360us                    -               -         -         -         -      400B  248.35MB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
2.96898s  2.6240us                    -               -         -         -         -      400B  145.38MB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.96900s  43.392ms            (100 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [129]
3.01239s  1.5680us                    -               -         -         -         -      400B  243.28MB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
3.01277s  1.2160us                    -               -         -         -         -  128.00KB  100.39GB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
3.01279s  23.059ms            (128 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [134]
3.03585s  20.928us                    -               -         -         -         -  128.00KB  5.8329GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy

简要回答您的问题：

Question 1. Is there a better way to assign threads and blocks to compute output tensor?

如前所述，您选择的索引不是最佳选择，因为它不会导致合并访问全局内存。我提供的替代实现将导致从全局内存中合并读取。

Question 2. In my kernel how can I increase occupancy? (which is around 50%)

这个内存绑定内核的优点不是占用率，而是内核运行时间是否约等于读取和写入数据所花费的时间。上面的内核应该满足这一点。如果您满足内存绑定内核的此条件，则无论占用情况如何，都无法进一步改进。

Question 3. How should I use shared memory for global memory read operations? (In case of a large 'dim2', each block should allocate huge shared memory buffer which is not good for concurrent warp execution and occupancy)

对于情况 2 和情况 3（您的情况），我所展示的最佳实现根本不需要共享内存，而在情况 1 中，我们可以制作一个只需要少量共享内存的内核每块；不足以成为占用问题或引起任何其他问题。

CUDA - 单轴平行缩减

CUDA - Parallel Reduction over one axis

shared-memory

reduction