Cuda - Multiple sums in each vector element
The product of two Chebyshev series with coefficients a and b can be expressed by the formula (reconstructed here from the kernel code below)

$$c_k = \frac{1}{2}\left(\sum_{j=0}^{k} a_j b_{k-j} \;+\; \sum_{j=1}^{n-k-1} \left(a_j b_{j+k} + a_{j+k} b_j\right)\right), \qquad k = 0,\dots,n-1$$

The problem is to parallelize this as much as possible.
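For concreteness, a straightforward sequential version of this formula (my own reference sketch, not part of the CUDA code below) would be:

// Sequential reference (sketch): computes each c[k] per the formula above.
void chebyprod_cpu(int n, const float *a, const float *b, float *c){
    for (int k = 0; k < n; k++){
        float sum = 0.f;
        for (int j = 0; j <= k; j++)      // first sum: a_j * b_{k-j}
            sum += a[j]*b[k-j];
        for (int j = 1; j < n-k; j++)     // second sum: a_j*b_{j+k} + a_{j+k}*b_j
            sum += a[j]*b[j+k] + a[j+k]*b[j];
        c[k] = 0.5f*sum;
    }
}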
I have managed to parallelize the formula above using CUDA by simply assigning one thread to each vector element, so that a single thread performs all of the sums/multiplications for that element.
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>

__global__ void chebyprod(int n, float *a, float *b, float *c){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float sum;
    if (i < n) {
        sum = 0.f;
        for (int j = 0; j <= i; j++){
            sum += a[j]*b[i-j];   // b[i-j] (not b[j-i]) keeps the index non-negative
        }
        for (int j = 1; j < n-i; j++){
            sum += a[j]*b[j+i] + a[j+i]*b[j];
        }
        c[i] = 0.5f*sum;
    }
}
int main(void){
    clock_t tStart = clock();
    int N = 10000;
    float *a, *b, *c, *d_a, *d_b, *d_c;
    a = (float*)malloc(N*sizeof(float));
    b = (float*)malloc(N*sizeof(float));
    c = (float*)malloc(N*sizeof(float));

    cudaMalloc(&d_a, N*sizeof(float));
    cudaMalloc(&d_b, N*sizeof(float));
    cudaMalloc(&d_c, N*sizeof(float));

    for (int i = 0; i < N; i++) {
        a[i] = 0.1f;
        b[i] = 0.2f;
    }

    cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);

    int blockSize, gridSize;
    // Number of threads in each thread block
    blockSize = 1024;
    // Number of thread blocks in grid
    gridSize = (int)ceil((float)N/blockSize);
    std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

    // Perform chebyprod on N elements
    chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
    printf("Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

    cudaMemcpy(c, d_c, N*sizeof(float), cudaMemcpyDeviceToHost);

    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << c[k] << " ";
    std::cout << "]\n";

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(a);
    free(b);
    free(c);
}
In another code I have managed to sum all the elements of a vector using sum reduction (I copied somebody else's code from an NVIDIA presentation). The question now is: how do I combine these two approaches? I want a bunch of threads to compute all of the sums/multiplications in each element of c. Any tips? Or perhaps a similar problem I can learn from?

Row reduction in a matrix looks similar to this problem, but here I have multiple sums of varying lengths, with multiplications.

Here is the reduction code provided by an NVIDIA employee (I think):
template <unsigned int blockSize>
__global__ void parreduc(float *array_in, float *reduct, size_t array_len)
{
    extern volatile __shared__ float sdata[];
    size_t tid = threadIdx.x,
           gridSize = blockSize * gridDim.x,
           i = blockIdx.x * blockSize + tid;
    sdata[tid] = 0;
    while (i < array_len){
        sdata[tid] += array_in[i];
        i += gridSize;
    }
    __syncthreads();
    if (blockSize >= 512){
        if (tid < 256) sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256){
        if (tid < 128) sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128){
        if (tid < 64) sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32){
        if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
        if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
        if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
        if (blockSize >= 8)  sdata[tid] += sdata[tid + 4];
        if (blockSize >= 4)  sdata[tid] += sdata[tid + 2];
        if (blockSize >= 2)  sdata[tid] += sdata[tid + 1];
    }
    if (tid == 0) reduct[blockIdx.x] = sdata[0];
}
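Since the kernel declares its shared memory with extern, the launch must pass the dynamic shared-memory size as the third launch parameter; a hypothetical invocation (my guess at the intended usage; d_in, d_partial, and array_len are placeholder names) would be:

// Hypothetical launch (sketch): blockSize appears both as the template
// argument and as the block dimension, and the dynamic shared memory must
// hold one float per thread.
const unsigned int threads = 512;
int blocks = (int)((array_len + threads - 1) / threads);
parreduc<threads><<<blocks, threads, threads*sizeof(float)>>>(d_in, d_partial, array_len);
// d_partial now holds one partial sum per block; reduce it again (or on the host).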
The code provided in the question is a sensible first step of realization. Its thread strategy is the most common/typical one: assign one thread per output point (N output points here). Each thread must then perform all of the calculations needed to compute its particular output point. Any effort to improve the performance of CUDA code should address at least two CUDA optimization priorities:

- Expose enough parallelism (roughly: create enough threads)
- Make efficient use of memory (roughly: for global memory access, strive for coalescing)
With respect to item 1, the effectiveness of the code in the question will depend on the GPU. As a rough rule of thumb, we seek to launch at least 2048 threads per SM (1024 on Turing) on the GPU we are running on, in order to have a chance to "saturate" it. For N = 10000 we can saturate a GPU that has about 5 SMs (5 SMs × 2048 threads/SM ≈ 10240 threads). For a Tesla V100 with 80 SMs, there is no hope of saturating that GPU with 10,000 threads.

With respect to item 2, the provided code also falls somewhat short; it has problems with coalescing: in many cases adjacent threads are not reading adjacent values in memory. To pick just one example, the first global load I see is a[j]. That loads the same value/location for every thread in the warp, rather than adjacent values in adjacent threads.
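To make the distinction concrete, here is a small standalone kernel (illustrative only, not part of either solution) contrasting the two access patterns:

__global__ void access_patterns(int n, const float * __restrict__ a, float * __restrict__ out){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n){
        // Broadcast load: every thread in the warp reads the same address
        // (this is what a[j] does in the kernel above on any given j iteration).
        float same = a[0];
        // Coalesced load: adjacent threads read adjacent addresses, so the
        // warp's 32 reads combine into a minimal number of memory transactions.
        float adjacent = a[i];
        out[i] = same + adjacent;
    }
}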
Can we come up with an alternate realization that may improve on both of these? Consider the following change in thread strategy: assign one threadblock per output point, rather than one thread per output point. The calculations needed for each output point can be visualized as one "row" of a matrix. The threadblock "strides" along that row, performing the required calculations, and finally does a threadblock-level reduction to produce a single result for the row. This addresses both items: adjacent threads in a warp can now read adjacent values from a and b, and we immediately increase the total thread count by a factor of up to 1024 (so instead of 10 thousand threads we can spin up ~10 million, which is enough to saturate any current CUDA GPU). This thread strategy also has a nice feature: the "rows" of calculations mentioned above have varying lengths. The first and last rows are the longest, with approximately N calculation elements each, whereas the rows in the middle have closer to N/2 calculation elements. By choosing a block-stride loop (conceptually similar to a grid-stride loop; see the sketch below) we can handle the varying row lengths efficiently: each threadblock strides along its row only as far as needed, accumulating results as it goes.
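For reference, a canonical grid-stride loop looks like this (a generic sketch, not part of the solution; the block-stride loop in chebyprod_imp below applies the same idea within a single block, stepping by the block size instead of by the whole grid):

__global__ void grid_stride_copy(int n, const float *in, float *out){
    // each thread handles elements i, i + gridDim.x*blockDim.x, i + 2*that, etc.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x)
        out[i] = in[i];
}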
Here is a worked example of that realization:
$ cat t1497.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
    if (use_sync == sync) cudaDeviceSynchronize();
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    mt sum;
    if (i < n) {
        sum = 0.f;
        for (int j = 0; j <= i; j++){
            sum += a[j]*b[i-j];
        }
        for (int j = 1; j < n-i; j++){
            sum += a[j]*b[j+i] + a[j+i]*b[j];
        }
        c[i] = 0.5f*sum;
    }
}
// assume one threadblock per c_k coefficient
// assume a power-of-2 threadblock size
const int tpb_p2 = 8;
const int nTPB = 1<<tpb_p2;                                  // threads per block (256)
const unsigned row_mask = ~((0xFFFFFFFFU>>tpb_p2)<<tpb_p2);  // low tpb_p2 bits set, i.e. nTPB-1

__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
#ifndef NO_WS
    __shared__ mt sd[32];
    if (threadIdx.x < 32) sd[threadIdx.x] = 0;
    __syncthreads();
#else
    __shared__ mt sd[nTPB];
#endif
    int k = blockIdx.x;                         // this block's output coefficient
    mt sum = 0.0f;
    // the "row" for c_k has max(k, n-k)+1 calculation elements
    int row_width = (((k)>(n-k))?(k):(n-k))+1;
    // number of block-sized strides needed to cover the row (rounded up)
    int strides = (row_width>>tpb_p2) + ((row_width&row_mask)?1:0);
    int j = threadIdx.x;
    mt tmp_a;
    for (int s = 0; s < strides; s++){ // block-stride loop
        if (j < n) tmp_a = a[j];       // load a[j] once, reuse in both sums
        if (j <= k) sum += tmp_a*b[k-j];
        if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
        j += nTPB;
    }
#ifndef NO_WS
    // 1st warp-shuffle reduction
    int lane = threadIdx.x & (warpSize-1);
    int warpID = threadIdx.x >> 5; // assumes warpSize == 32
    unsigned mask = 0xFFFFFFFFU;
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
        sum += __shfl_down_sync(mask, sum, offset);
    if (lane == 0) sd[warpID] = sum;
    __syncthreads(); // put warp results in shared mem
    // hereafter, just warp 0
    if (warpID == 0){
        // reload value from shared mem (sd[] was zero-initialized above, so
        // lanes beyond the number of warps read 0)
        sum = sd[lane];
        // final warp-shuffle reduction
        for (int offset = warpSize>>1; offset > 0; offset >>= 1)
            sum += __shfl_down_sync(mask, sum, offset);
    }
#else
    sd[threadIdx.x] = sum;
    for (int s = nTPB>>1; s > 0; s >>= 1){ // sweep reduction
        __syncthreads();
        if (threadIdx.x < s) sd[threadIdx.x] += sd[threadIdx.x+s];
    }
    if (!threadIdx.x) sum = sd[0];
#endif
    if (!threadIdx.x) c[k] = sum*0.5f;
}
int main(int argc, char *argv[]){
    int N = 10000;
    if (argc > 1) N = atoi(argv[1]);
    std::cout << "N = " << N << std::endl;
    mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
    a = (mt*)malloc(N*sizeof(mt));
    b = (mt*)malloc(N*sizeof(mt));
    c = (mt*)malloc(N*sizeof(mt));
    ic = (mt*)malloc(N*sizeof(mt));

    cudaMalloc(&d_a, N*sizeof(mt));
    cudaMalloc(&d_b, N*sizeof(mt));
    cudaMalloc(&d_c, N*sizeof(mt));

    for (int i = 0; i < N; i++) {
        a[i] = 0.1f;
        b[i] = 0.2f;
    }

    cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);

    int blockSize, gridSize;
    // Number of threads in each thread block
    blockSize = 1024;
    // Number of thread blocks in grid
    gridSize = (int)ceil((float)N/blockSize);
    std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

    // Perform chebyprod on N elements
    unsigned long long dt = dtime_usec(0);
    chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
    dt = dtime_usec(dt, sync);
    cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
    printf("Time taken: %fs\n", dt/(float)USECPSEC);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << c[k] << " ";
    std::cout << "]\n";

    dt = dtime_usec(0);
    chebyprod_imp<<< N, nTPB >>>(N, d_a, d_b, d_c);
    dt = dtime_usec(dt, sync);
    cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
    printf("Time taken: %fs\n", dt/(float)USECPSEC);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << ic[k] << " ";
    std::cout << "]\n";

    mt max_error = 0;
    for (int k = 0; k < N; k++)
        max_error = fmax(max_error, fabs(c[k] - ic[k]));
    std::cout << "Max error = " << max_error << std::endl;

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(a);
    free(b);
    free(c);
    free(ic);
}
$ nvcc -arch=sm_52 -o t1497 t1497.cu
$ ./t1497
blockSize: 1024
gridSize: 10
Time taken: 0.001687s
no error
Vector c: [ 199.996 199.986 199.976 199.966 199.956 199.946 199.936 199.926 199.916 199.906 ]
Time taken: 0.000350s
no error
Vector c: [ 199.99 199.98 199.97 199.96 199.95 199.94 199.93 199.92 199.91 199.9 ]
Max error = 0.0137787
$
(change the -arch switch to match your GPU)

The example above shows that the modified algorithm runs about 5x faster (on a Tesla V100). There are numerical differences, but these are due to floating-point issues. To convince yourself that the algorithm gives the correct result, switch the typedef from float to double. You will see that there is then essentially no numerical difference left in the results (suggesting the two algorithms are logically identical), and also that the improved-algorithm version computed at float resolution gives answers for the first 10 elements that are closer to the "more accurate" result produced with double arithmetic.
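That is, change this line near the top of the listing:

typedef double mt;  // was: typedef float mt;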
As discussed in the comments, this algorithm transformation may not be beneficial in every case. The principal benefit comes from exploiting GPUs with a larger thread-carrying capacity (larger than N threads). Relatively smaller GPUs (e.g. perhaps 8 SMs or fewer, for N = 10000) may not benefit, and the code may in fact run slower than the original algorithm.

Although I mention coalescing, for N = 10000 the input data here is quite small (~80K bytes) and will fit in the L2 cache of most GPUs. Once the data is in the L2 cache, inefficient access patterns are much less of an issue, so the primary benefit of this algorithm in this case is probably attributable to item 1. If item 1 cannot be exploited, the algorithm shows little or no benefit.
For test purposes I also created another version that uses a warp-stride loop. It does not appear to be significantly faster on a small GPU, however, and it is in fact slower on a V100:
#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
    if (use_sync == sync) cudaDeviceSynchronize();
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    mt sum;
    if (i < n) {
        sum = 0.f;
        for (int j = 0; j <= i; j++){
            sum += a[j]*b[i-j];
        }
        for (int j = 1; j < n-i; j++){
            sum += a[j]*b[j+i] + a[j+i]*b[j];
        }
        c[i] = 0.5f*sum;
    }
}

// assume one warp per c_k coefficient
// assume a multiple-of-32 threadblock size
const int nTPB = 32*8;
const int warpSize_p2 = 5; // assumes warpSize == 32
const int nWarps = nTPB>>warpSize_p2;
const unsigned row_mask = ~((0xFFFFFFFFU>>warpSize_p2)<<warpSize_p2);

__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
    int warpID = threadIdx.x >> warpSize_p2;
    int k = blockIdx.x*(nWarps) + warpID;
    if (k < n){
        mt sum = 0.0f;
        int lane = threadIdx.x & (warpSize-1);
        int row_width = (((k)>(n-k))?(k):(n-k))+1;
        int strides = (row_width>>warpSize_p2) + ((row_width&row_mask)?1:0);
        int j = lane;
        mt tmp_a;
        for (int s = 0; s < strides; s++){ // warp-stride loop
            if (j < n) tmp_a = a[j];
            if (j <= k) sum += tmp_a*b[k-j];
            if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
            j += warpSize;
        }
        // warp-shuffle reduction
        for (int offset = warpSize>>1; offset > 0; offset >>= 1)
            sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
        if (lane == 0) c[k] = sum*0.5f;
    }
}
int main(int argc, char *argv[]){
    int N = 10000;
    if (argc > 1) N = atoi(argv[1]);
    std::cout << "N = " << N << std::endl;
    mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
    a = (mt*)malloc(N*sizeof(mt));
    b = (mt*)malloc(N*sizeof(mt));
    c = (mt*)malloc(N*sizeof(mt));
    ic = (mt*)malloc(N*sizeof(mt));

    cudaMalloc(&d_a, N*sizeof(mt));
    cudaMalloc(&d_b, N*sizeof(mt));
    cudaMalloc(&d_c, N*sizeof(mt));

    for (int i = 0; i < N; i++) {
        a[i] = 0.1f;
        b[i] = 0.2f;
    }

    cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);

    int blockSize, gridSize;
    // Number of threads in each thread block
    blockSize = 1024;
    // Number of thread blocks in grid
    gridSize = (int)ceil((float)N/blockSize);
    std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

    // Perform chebyprod on N elements
    unsigned long long dt = dtime_usec(0);
    chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
    dt = dtime_usec(dt, sync);
    cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
    printf("Time taken: %fs\n", dt/(float)USECPSEC);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << c[k] << " ";
    std::cout << "]\n";

    dt = dtime_usec(0);
    chebyprod_imp<<< (N/nWarps)+1, nTPB >>>(N, d_a, d_b, d_c);
    dt = dtime_usec(dt, sync);
    cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
    printf("Time taken: %fs\n", dt/(float)USECPSEC);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    std::cout << "Vector c: [ ";
    for (int k = 0; k < 10; ++k)
        std::cout << ic[k] << " ";
    std::cout << "]\n";

    mt max_error = 0;
    for (int k = 0; k < N; k++)
        max_error = fmax(max_error, fabs(c[k] - ic[k]));
    std::cout << "Max error = " << max_error << std::endl;

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(a);
    free(b);
    free(c);
    free(ic);
}