Cuda - Multiple sums in each vector element

The product of two Chebyshev series with coefficients a and b can be expressed by the formula (written out here to match the kernel below):

c_k = (1/2) * ( Σ_{j=0}^{k} a_j·b_{k-j} + Σ_{j=1}^{n-k-1} ( a_j·b_{j+k} + a_{j+k}·b_j ) ),   k = 0, …, n-1

The problem is to parallelize this as much as possible.

I have managed to parallelize the formula above with CUDA by simply assigning one thread to each vector element, so a single thread performs all the sums/multiplications for its element.

#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>

__global__ void chebyprod(int n, float *a, float *b, float *c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   float sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
   /*
   if (i < n)
      c[i] = a[i] + b[i];
   */  
}

int main(void){
  clock_t tStart = clock();
  int N = 10000;
  float *a, *b, *c, *d_a, *d_b, *d_c;
  a = (float*)malloc(N*sizeof(float));
  b = (float*)malloc(N*sizeof(float));
  c = (float*)malloc(N*sizeof(float));

  cudaMalloc(&d_a, N*sizeof(float)); 
  cudaMalloc(&d_b, N*sizeof(float));
  cudaMalloc(&d_c, N*sizeof(float));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);

  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  printf("Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

  cudaMemcpy(c, d_c, N*sizeof(float), cudaMemcpyDeviceToHost);

  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
}

In another code I managed to sum all the elements of a vector using sum reduction (I copied someone else's code from an NVIDIA presentation). The question now is: how do I combine the two approaches? I want a bunch of threads to compute all of the sums/multiplications in each element of c. Any tips? Or a similar problem I could learn from?

Row-wise reduction of a matrix is probably similar to this problem, but here I have multiple sums of different lengths, plus the multiplications.

Here is the reduction code provided by NVIDIA staff (I believe):

template <unsigned int blockSize>
__global__ void parreduc(float *array_in, float *reduct, size_t array_len)
    {
    extern volatile __shared__ float sdata[];
    size_t  tid        = threadIdx.x,
            gridSize   = blockSize * gridDim.x,
            i          = blockIdx.x * blockSize + tid;
    sdata[tid] = 0;
    while (i < array_len)
        { sdata[tid] += array_in[i];
        i += gridSize; }
    __syncthreads();
    if (blockSize >= 512)
        { if (tid < 256) sdata[tid] += sdata[tid + 256]; __syncthreads();         }
    if (blockSize >= 256)
        { if (tid < 128) sdata[tid] += sdata[tid + 128]; __syncthreads(); }
    if (blockSize >= 128)
        { if (tid <  64) sdata[tid] += sdata[tid + 64]; __syncthreads(); }
    if (tid < 32)
        { if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
          if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
          if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
          if (blockSize >= 8)  sdata[tid] += sdata[tid + 4];
          if (blockSize >= 4)  sdata[tid] += sdata[tid + 2];
          if (blockSize >= 2)  sdata[tid] += sdata[tid + 1]; }
    if (tid == 0) reduct[blockIdx.x] = sdata[0];
    }

The code provided in the question is a sensible first step toward a realization. The thread strategy is the most common/typical one: assign one thread to each output point (N output points here). Each thread must then perform all the calculations needed to compute its particular output point. Motivations to improve performance of CUDA code should always address at least two CUDA optimization priorities:

  1. Expose enough parallelism (roughly: create enough threads)
  2. Make efficient use of memory (roughly: for global memory access, strive for coalescing)

Regarding item 1, the effectiveness of the code provided in the question will depend on the GPU. As a rough rule of thumb, we seek to launch at least 2048 threads per SM (1024 on Turing) on the GPU we are running on, in order to have a chance to "saturate" the GPU. For N = 10000, we could saturate a GPU with about 5 SMs. For a Tesla V100, with 80 SMs, there is no hope of saturating that GPU with 10,000 threads.
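As an aside, a quick way to see what this rule of thumb implies for a particular device is to query the SM count at runtime. The sketch below is illustrative only (the 2048-threads-per-SM figure is just the heuristic mentioned above, not a hard limit):

#include <stdio.h>
#include <cuda_runtime.h>

int main(){
  int numSMs = 0;
  // query the number of SMs on device 0
  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
  // rough heuristic: ~2048 resident threads per SM for a chance at saturation
  long long target = 2048LL * numSMs;
  printf("SMs: %d, rough saturation target: %lld threads\n", numSMs, target);
  printf("one thread per output point with N = 10000 supplies only 10000 threads\n");
  return 0;
}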

Regarding item 2, the provided code also falls short to some degree; it has issues with coalescing: in many cases adjacent threads are not reading adjacent values in memory. To pick just one example, the first global load I see is a[j]. That loads the same value/location for every thread, rather than adjacent values in adjacent threads.
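To make the distinction concrete, here is a minimal sketch (not part of either kernel in this question; the kernel name and parameters are hypothetical) contrasting a broadcast-style load like a[j] with a coalesced load:

__global__ void access_demo(const float * __restrict__ data, float *out, int j, int n){
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n){
    float broadcast = data[j];   // same location for every thread in the warp (like a[j] above)
    float coalesced = data[i];   // adjacent threads read adjacent locations (coalesced)
    out[i] = broadcast + coalesced;
  }
}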

Can we come up with an alternative realization that might improve on both of these? We will consider the following change in thread strategy: assign one threadblock to each output point, rather than one thread per output point. The calculations needed for each output point can be visualized as one "row" of a matrix. The threadblock will "stride" along the row, performing the needed calculations, and finally do a threadblock-level reduction to produce a single result for that row. This lets us address both items: adjacent threads in a warp can read adjacent values from a and b, and we also immediately increase our total thread count by a factor of up to 1024 (so instead of 10 thousand threads, we could spin up roughly 10 million threads; 10 million threads is enough to saturate any current CUDA GPU). This thread strategy also has a nice property: the "rows" of calculations mentioned above have varying length. The first and last rows are the longest, with roughly N calculation elements, whereas the rows in the middle have closer to N/2 calculation elements. By choosing a block-stride loop (conceptually similar to a grid-stride loop) we can handle the varying row lengths efficiently. Each threadblock "strides" along its row only as far as needed, accumulating results.

Here is a worked example implementing these ideas:

$ cat t1497.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   mt sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}
// assume one threadblock per c_k coefficient
// assume a power-of-2 threadblock size
const int tpb_p2 = 8;
const int nTPB = 1<<tpb_p2;
const unsigned row_mask = ~((0xFFFFFFFFU>>tpb_p2)<<tpb_p2);

__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
#ifndef NO_WS
  __shared__ mt sd[32];
  if (threadIdx.x < 32) sd[threadIdx.x] = 0;
  __syncthreads();
#else
  __shared__ mt sd[nTPB];
#endif
  int k = blockIdx.x;
  mt sum = 0.0f;
  int row_width = (((k)>(n-k))?(k):(n-k))+1;
  int strides = (row_width>>tpb_p2)+ ((row_width&row_mask)?1:0);
  int j = threadIdx.x;
  mt tmp_a;
  for (int s=0; s < strides; s++){ // block-stride loop
    if (j < n) tmp_a = a[j];
    if (j <= k) sum += tmp_a*b[k-j];
    if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
    j += nTPB;
    }
#ifndef NO_WS
  // 1st warp-shuffle reduction
  int lane = threadIdx.x & (warpSize-1);
  int warpID = threadIdx.x >> 5; // assumes warpSize == 32
  unsigned mask = 0xFFFFFFFFU;
  for (int offset = warpSize>>1; offset > 0; offset >>= 1)
    sum += __shfl_down_sync(mask, sum, offset);
  if (lane == 0) sd[warpID] = sum;
  __syncthreads(); // put warp results in shared mem
  // hereafter, just warp 0
  if (warpID == 0){
  // reload val from shared mem if warp existed
    sum = sd[lane];
  // final warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(mask, sum, offset);
  }
#else
  sd[threadIdx.x] = sum;
  for (int s = nTPB>>1; s > 0; s>>=1){ // sweep reduction
    __syncthreads();
    if (threadIdx.x < s) sd[threadIdx.x] += sd[threadIdx.x+s];}
  if (!threadIdx.x) sum = sd[0];
#endif
  if (!threadIdx.x) c[k] = sum*0.5f;
}

int main(int argc, char *argv[]){
  int N = 10000;
  if (argc>1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(N*sizeof(mt));
  b  = (mt*)malloc(N*sizeof(mt));
  c  = (mt*)malloc(N*sizeof(mt));
  ic = (mt*)malloc(N*sizeof(mt));

  cudaMalloc(&d_a, N*sizeof(mt));
  cudaMalloc(&d_b, N*sizeof(mt));
  cudaMalloc(&d_c, N*sizeof(mt));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  unsigned long long  dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);

  cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";
  dt = dtime_usec(0);
  chebyprod_imp<<< N, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << ic[k] << " ";
  std::cout <<"]\n";
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
}
$ nvcc -arch=sm_52 -o t1497 t1497.cu
$ ./t1497
blockSize: 1024
gridSize: 10
Time taken: 0.001687s
no error
Vector c: [ 199.996 199.986 199.976 199.966 199.956 199.946 199.936 199.926 199.916 199.906 ]
Time taken: 0.000350s
no error
Vector c: [ 199.99 199.98 199.97 199.96 199.95 199.94 199.93 199.92 199.91 199.9 ]
Max error = 0.0137787
$

(change the -arch switch to match your GPU)

The example above shows that the modified algorithm runs about 5x faster (on a Tesla V100). There are numerical differences, but these are due to floating-point order-of-operations effects. To convince yourself that the algorithm gives logically the correct result, switch the typedef from float to double. You will then see essentially no remaining numerical difference in the results (suggesting the two algorithms are logically identical), and you will also see that the improved-algorithm version at float resolution gives answers for the first 10 elements that are closer to the "more accurate" result produced with double arithmetic.
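Assuming the same t1497.cu source, that check amounts to changing the single typedef near the top and rebuilding with the same command line; no other changes are needed:

typedef double mt;   // was: typedef float mt;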

As discussed in the comments, this algorithm transformation may not be beneficial in every case. The principal benefit comes from exploiting a GPU with a larger thread-carrying capacity (i.e. larger than N threads). A relatively smaller GPU (perhaps 8 SMs or fewer, for N = 10000) may not benefit from this, and in fact the code may run slower than the original algorithm.

Although I mention coalescing, for N = 10000 the input data here is quite small (~80K bytes) and fits in the L2 cache of most GPUs. Once the data is in the L2 cache, inefficient access patterns are much less of an issue. So the principal benefit of this algorithm in this case is probably attributable to item 1. If item 1 cannot be exploited, the algorithm shows little or no benefit.
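For reference, the working-set-versus-L2 comparison can be made explicit with a runtime query; this is only an illustrative sketch, not part of the timed code:

#include <stdio.h>
#include <cuda_runtime.h>

int main(){
  int l2bytes = 0;
  // query the L2 cache size of device 0
  cudaDeviceGetAttribute(&l2bytes, cudaDevAttrL2CacheSize, 0);
  const int N = 10000;
  size_t inputs = 2ULL * N * sizeof(float);   // a[] and b[], ~80K bytes for N = 10000
  printf("L2 cache: %d bytes, input working set: %zu bytes\n", l2bytes, inputs);
  return 0;
}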

For test purposes, I created another version that uses a warp-stride loop (one warp per output point). However, it does not appear to be significantly faster on small GPUs, and it is actually slower on V100:

#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   mt sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}
// assume one warp per c_k coefficient
// assume a multiple-of-32 threadblock size
const int nTPB = 32*8;
const int warpSize_p2 = 5; // assumes warpSize == 32
const int nWarps = nTPB>>warpSize_p2;
const unsigned row_mask = ~((0xFFFFFFFFU>>warpSize_p2)<<warpSize_p2);
__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
  int warpID = threadIdx.x >> warpSize_p2;
  int k = blockIdx.x*(nWarps)+warpID;
  if (k < n){
    mt sum = 0.0f;
    int lane = threadIdx.x & (warpSize-1);
    int row_width = (((k)>(n-k))?(k):(n-k))+1;
    int strides = (row_width>>warpSize_p2)+ ((row_width&row_mask)?1:0);
    int j = lane;
    mt tmp_a;
    for (int s=0; s < strides; s++){ // warp-stride loop
      if (j < n) tmp_a = a[j];
      if (j <= k) sum += tmp_a*b[k-j];
      if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
      j += warpSize;
      }
  // warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
    if (lane==0) c[k] = sum*0.5f;}
}

int main(int argc, char *argv[]){
  int N = 10000;
  if (argc>1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(N*sizeof(mt));
  b  = (mt*)malloc(N*sizeof(mt));
  c  = (mt*)malloc(N*sizeof(mt));
  ic = (mt*)malloc(N*sizeof(mt));

  cudaMalloc(&d_a, N*sizeof(mt));
  cudaMalloc(&d_b, N*sizeof(mt));
  cudaMalloc(&d_c, N*sizeof(mt));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  unsigned long long  dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);

  cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";
  dt = dtime_usec(0);
  chebyprod_imp<<< (N/nWarps)+1, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << ic[k] << " ";
  std::cout <<"]\n";
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
}
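This version builds and runs the same way as the previous one (the source filename below is just a placeholder; adjust the -arch switch to match your GPU):

$ nvcc -arch=sm_52 -o t1497_warp t1497_warp.cu
$ ./t1497_warp

It prints the same comparison against the original kernel (the "Max error" line) at the end of the run.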