Faster Parallel Reductions on Kepler
I am a CUDA beginner trying to use Faster Parallel Reductions on Kepler in my program, but I am not getting the result. Below is what I am doing; the output is 0. I would appreciate knowing what my mistake is.
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <stdio.h>
#include <math.h>
__inline__ __device__
float warpReduceSum(float val) {
for (int offset = warpSize/2; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
}
__inline__ __device__
float blockReduceSum(float val) {
static __shared__ int shared[32]; // Shared mem for 32 partial sums
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = warpReduceSum(val); // Each warp performs partial reduction
if (lane==0) shared[wid]=val; // Write reduced value to shared memory
__syncthreads(); // Wait for all partial reductions
//read from shared memory only if that warp existed
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
if (wid==0) val = warpReduceSum(val); //Final reduce within first warp
return val;
}
__global__ void deviceReduceKernel(float *in, float* out, size_t N)
{
float sum = 0;
//reduce multiple elements per thread
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
{
sum += in[i];
}
sum = blockReduceSum(sum);
if (threadIdx.x==0)
out[blockIdx.x]=sum;
}
int main()
{
int n = 1000000;
float *b = new float[1]();
float *d = new float[1]();
float *a ;
int blocks = (n/512)+1;
float *d_intermediate;
cudaMalloc((void**)&d_intermediate, n*sizeof(float));
cudaMalloc((void**)&a, n*sizeof(float));
cudaMemset(a, 1, n*sizeof(float));
deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_intermediate);
std::cout << d[0];
return 0;
}
There are various problems with your code:
Any time you are having trouble with a CUDA code, you should use proper cuda error checking and run your code with cuda-memcheck before asking others for help. Even if you don't understand the error output, it will be useful to others trying to help you. If you had done that with this code, you would have been advised of various errors/problems.
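For reference, here is a minimal sketch of the kind of error-checking wrapper meant above (the CUDA_CHECK name is just an illustration, not a library macro):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// Wrap every CUDA runtime call; print the error string and abort on failure.
#define CUDA_CHECK(call)                                            \
  do {                                                              \
    cudaError_t err_ = (call);                                      \
    if (err_ != cudaSuccess) {                                      \
      fprintf(stderr, "CUDA error: %s at %s:%d\n",                  \
              cudaGetErrorString(err_), __FILE__, __LINE__);        \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)
// Typical usage around an allocation and a kernel launch:
//   CUDA_CHECK(cudaMalloc((void**)&a, n*sizeof(float)));
//   deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
//   CUDA_CHECK(cudaGetLastError());        // catches launch-time errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches errors during kernel execution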
Any pointer passed to a CUDA kernel should be a valid CUDA device pointer. Your b pointer is a host pointer:
float *b = new float[1]();
so you cannot use it here:
deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
^
Since you evidently intend to use it to store a single float quantity on the device, we can just re-use the a pointer.
For similar reasons, this is not sensible:
cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
In this case both b and d are host pointers. That will not copy data from the device to the host.
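To illustrate the point (a sketch only; the d_out name is hypothetical, and the fixed code below simply re-uses a), the final result needs a device-side destination, and the host copy then reads from that device pointer:
float *d_out;                                        // device buffer for the final result
cudaMalloc((void**)&d_out, sizeof(float));           // one float on the device
deviceReduceKernel<<<1, 1024>>>(d_intermediate, d_out, blocks);
float result;                                        // host variable to receive the sum
cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_out);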
This is probably not doing what you think:
cudaMemset(a, 1, n*sizeof(float));
I imagine you think this will fill a float array with the quantity 1, but it won't. cudaMemset, like memset, fills bytes and takes a byte quantity. If you use it to fill a float array, you are actually creating an array filled with the byte pattern 0x01010101. I don't know what value that bit pattern translates to when converted to a float quantity, but it will not give you a float value of 1. We will fix this by filling an ordinary host array with a loop, and then transferring that data to the device to be reduced.
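As a quick host-side illustration of the same pitfall (plain memset behaves the same way, byte by byte):
#include <cstdio>
#include <cstring>
int main() {
  float x;
  memset(&x, 1, sizeof(float));   // sets every byte of x to 0x01, giving the pattern 0x01010101
  printf("%g\n", x);              // prints a tiny value near the bottom of the float range, not 1
  return 0;
}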
Here is a modified code with the above issues addressed; it runs correctly for me:
$ cat t1290.cu
#include <iostream>
#include <stdio.h>
#include <math.h>
__inline__ __device__
float warpReduceSum(float val) {
for (int offset = warpSize/2; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
}
__inline__ __device__
float blockReduceSum(float val) {
static __shared__ int shared[32]; // Shared mem for 32 partial sums
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = warpReduceSum(val); // Each warp performs partial reduction
if (lane==0) shared[wid]=val; // Write reduced value to shared memory
__syncthreads(); // Wait for all partial reductions
//read from shared memory only if that warp existed
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
if (wid==0) val = warpReduceSum(val); //Final reduce within first warp
return val;
}
__global__ void deviceReduceKernel(float *in, float* out, size_t N)
{
float sum = 0;
//reduce multiple elements per thread
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
{
sum += in[i];
}
sum = blockReduceSum(sum);
if (threadIdx.x==0)
out[blockIdx.x]=sum;
}
int main()
{
int n = 1000000;
float b;
float *a, *a_host;
a_host = new float[n];
int blocks = (n/512)+1;
float *d_intermediate;
cudaMalloc((void**)&d_intermediate, blocks*sizeof(float));
cudaMalloc((void**)&a, n*sizeof(float));
for (int i = 0; i < n; i++) a_host[i] = 1;
cudaMemcpy(a, a_host, n*sizeof(float), cudaMemcpyHostToDevice);
deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
deviceReduceKernel<<<1, 1024>>>(d_intermediate, a, blocks);
cudaMemcpy(&b, a, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_intermediate);
std::cout << b << std::endl;
return 0;
}
$ nvcc -arch=sm_35 -o t1290 t1290.cu
$ cuda-memcheck ./t1290
========= CUDA-MEMCHECK
1e+06
========= ERROR SUMMARY: 0 errors
$
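One note in passing: __shfl_down is the pre-CUDA-9 form of the warp shuffle. On CUDA 9 and later it is deprecated in favor of the _sync variant, so warpReduceSum would look roughly like this (a sketch, assuming all 32 lanes of the warp participate):
__inline__ __device__
float warpReduceSum(float val) {
  // 0xffffffff is the full-warp mask: every lane takes part in the shuffle
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val;
}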