将 cuda 数组传递给 thrust::inclusive_scan
Pass cuda array to thrust::inclusive_scan
我可以对 CPU 上的数组使用 inclusive_scan,但能否对 GPU 上的数组使用它?(代码中被注释掉的那一行是我知道可行、但并不是我想要的方式。)或者,有没有其他简单的方法可以对设备内存中的数组执行包含扫描(inclusive scan)?
代码:
#include <stdio.h>
#include <stdlib.h> /* for rand() */
#include <unistd.h> /* for getpid() */
#include <time.h> /* for time() */
#include <math.h>
#include <assert.h>
#include <iostream>
#include <ctime>
#include <thrust/scan.h>
#include <cuda.h>
#ifdef DOUBLE
#define REAL double
#define MAXT 256
#else
#define REAL float
#define MAXT 512
#endif
#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif
using namespace std;
// Reports the most recent CUDA runtime error, if there is one.
//
// s: label printed inside the brackets of the message (defaults to "n/a").
// Returns true when an error was pending (after printing it to stdout),
// false when the runtime reported cudaSuccess.
// cudaGetLastError() also clears the error it returns.
bool errorAsk(const char *s="n/a")
{
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        printf("CUDA error [%s]: %s\n", s, cudaGetErrorString(status));
        return true;
    }
    return false;
}
// Fills the first N entries of c_idata with constant * d, where d is
// floor(drand48() * 10) drawn afresh for every entry.
//
// c_idata:  destination buffer holding at least N doubles.
// N:        number of entries to write; nothing is written when N <= 0.
// constant: factor applied to every generated value.
// Returns c_idata so the call can be used in an assignment.
double *fillArray(double *c_idata,int N,double constant) {
    int idx = 0;
    while (idx < N) {
        const double digit = floor(drand48()*10);
        c_idata[idx] = constant*digit;
        ++idx;
    }
    return c_idata;
}
// Demo driver: fills a host array with random digits, copies it to the GPU,
// attempts an in-place inclusive scan on the device buffer, copies the result
// back to the host and prints one value per line.
int main(int argc,char *argv[])
{
int N,blocks,threads;
N = 100;
threads=MAXT;
// Ceil-division of N by threads; neither blocks nor threads is used again in main.
blocks=N/threads+(N%threads==0?0:1);
// c_data is the host buffer (new[]), g_data the device buffer (cudaMalloc below).
double *c_data,*g_data;
c_data = new double[N];
c_data = fillArray(c_data,N,1);
// NOTE(review): the return codes of the CUDA runtime calls in main are not checked.
cudaMalloc(&g_data,N*sizeof(double));
cudaMemcpy(g_data,c_data,N*sizeof(double),cudaMemcpyHostToDevice);
// NOTE(review): g_data is a raw pointer to device memory. Given raw pointers, Thrust
// dispatches the host path of the algorithm, so this call does not run the scan on
// the GPU. Wrap the pointer in thrust::device_ptr or pass the thrust::device
// execution policy to scan device memory.
thrust::inclusive_scan(g_data, g_data + N, g_data); // in-place scan
cudaMemcpy(c_data,g_data,N*sizeof(double),cudaMemcpyDeviceToHost);
// Host-side variant, left disabled: scans c_data directly on the CPU.
// thrust::inclusive_scan(c_data, c_data + N, c_data); // in-place scan
for(int i = 0; i < N; i++) {
cout<<c_data[i]<<endl;
}
// NOTE(review): c_data and g_data are never released (no delete[] / cudaFree).
}
如果您阅读 thrust quick start guide,您会发现其中针对 "raw" 设备数据给出了一个建议:使用 thrust::device_ptr:
You may wonder what happens when a "raw" pointer is used as an argument to a Thrust function. Like the STL, Thrust permits this usage and it will dispatch the host path of the algorithm. If the pointer in question is in fact a pointer to device memory then you'll need to wrap it with thrust::device_ptr before calling the function.
要修复您的代码,您需要
#include <thrust/device_ptr.h>
并用以下两行替换现有的对 thrust::inclusive_scan 的调用:
thrust::device_ptr<double> g_ptr = thrust::device_pointer_cast(g_data);
thrust::inclusive_scan(g_ptr, g_ptr + N, g_ptr); // in-place scan
另一种方法是使用 thrust execution policies(需要 #include <thrust/execution_policy.h>),并像这样修改您的调用:
thrust::inclusive_scan(thrust::device, g_data, g_data + N, g_data);
还有其他各种可能性。
我可以对 CPU 上的数组使用 inclusive_scan,但能否对 GPU 上的数组使用它?(代码中被注释掉的那一行是我知道可行、但并不是我想要的方式。)或者,有没有其他简单的方法可以对设备内存中的数组执行包含扫描(inclusive scan)?
代码:
#include <stdio.h>
#include <stdlib.h> /* for rand() */
#include <unistd.h> /* for getpid() */
#include <time.h> /* for time() */
#include <math.h>
#include <assert.h>
#include <iostream>
#include <ctime>
#include <thrust/scan.h>
#include <cuda.h>
#ifdef DOUBLE
#define REAL double
#define MAXT 256
#else
#define REAL float
#define MAXT 512
#endif
#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif
using namespace std;
// Reports the most recent CUDA runtime error, if there is one.
//
// s: label printed inside the brackets of the message (defaults to "n/a").
// Returns true when an error was pending (after printing it to stdout),
// false when the runtime reported cudaSuccess.
// cudaGetLastError() also clears the error it returns.
bool errorAsk(const char *s="n/a")
{
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        printf("CUDA error [%s]: %s\n", s, cudaGetErrorString(status));
        return true;
    }
    return false;
}
// Fills the first N entries of c_idata with constant * d, where d is
// floor(drand48() * 10) drawn afresh for every entry.
//
// c_idata:  destination buffer holding at least N doubles.
// N:        number of entries to write; nothing is written when N <= 0.
// constant: factor applied to every generated value.
// Returns c_idata so the call can be used in an assignment.
double *fillArray(double *c_idata,int N,double constant) {
    int idx = 0;
    while (idx < N) {
        const double digit = floor(drand48()*10);
        c_idata[idx] = constant*digit;
        ++idx;
    }
    return c_idata;
}
// Demo driver: fills a host array with random digits, copies it to the GPU,
// attempts an in-place inclusive scan on the device buffer, copies the result
// back to the host and prints one value per line.
int main(int argc,char *argv[])
{
int N,blocks,threads;
N = 100;
threads=MAXT;
// Ceil-division of N by threads; neither blocks nor threads is used again in main.
blocks=N/threads+(N%threads==0?0:1);
// c_data is the host buffer (new[]), g_data the device buffer (cudaMalloc below).
double *c_data,*g_data;
c_data = new double[N];
c_data = fillArray(c_data,N,1);
// NOTE(review): the return codes of the CUDA runtime calls in main are not checked.
cudaMalloc(&g_data,N*sizeof(double));
cudaMemcpy(g_data,c_data,N*sizeof(double),cudaMemcpyHostToDevice);
// NOTE(review): g_data is a raw pointer to device memory. Given raw pointers, Thrust
// dispatches the host path of the algorithm, so this call does not run the scan on
// the GPU. Wrap the pointer in thrust::device_ptr or pass the thrust::device
// execution policy to scan device memory.
thrust::inclusive_scan(g_data, g_data + N, g_data); // in-place scan
cudaMemcpy(c_data,g_data,N*sizeof(double),cudaMemcpyDeviceToHost);
// Host-side variant, left disabled: scans c_data directly on the CPU.
// thrust::inclusive_scan(c_data, c_data + N, c_data); // in-place scan
for(int i = 0; i < N; i++) {
cout<<c_data[i]<<endl;
}
// NOTE(review): c_data and g_data are never released (no delete[] / cudaFree).
}
如果您阅读 thrust quick start guide,您会发现其中针对 "raw" 设备数据给出了一个建议:使用 thrust::device_ptr:
You may wonder what happens when a "raw" pointer is used as an argument to a Thrust function. Like the STL, Thrust permits this usage and it will dispatch the host path of the algorithm. If the pointer in question is in fact a pointer to device memory then you'll need to wrap it with thrust::device_ptr before calling the function.
要修复您的代码,您需要
#include <thrust/device_ptr.h>
并用以下两行替换现有的对 thrust::inclusive_scan 的调用:
thrust::device_ptr<double> g_ptr = thrust::device_pointer_cast(g_data);
thrust::inclusive_scan(g_ptr, g_ptr + N, g_ptr); // in-place scan
另一种方法是使用 thrust execution policies(需要 #include <thrust/execution_policy.h>),并像这样修改您的调用:
thrust::inclusive_scan(thrust::device, g_data, g_data + N, g_data);
还有其他各种可能性。