memset cuArray 用于表面内存
memset cuArray for surface memory
假设您有一个 cuArray
用于绑定表面对象。
形式的东西:
// These are inputs to a function really.
cudaArray* d_cuArrSurf
cudaSurfaceObject_t * surfImage;
const cudaExtent extent = make_cudaExtent(width, height, depth);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_cuArrSurf, &channelDesc, extent);
// Bind to Surface
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = d_cuArrSurf;
cudaCreateSurfaceObject(surfImage, &surfRes);
现在,我想将此 cuArray
初始化为零。显然 cuArray
类型的对象没有 memset
。最好的方法是什么?也许有多种选择,有些可能有更好或更差的特性。这些选项有哪些?
我能想到
分配和归零主机内存并使用 cudaMemcpy3D()
复制它。
创建初始化内核并用surf3Dwrite()
写入
Would it be possible for you to show an example of those lines?
这是一个粗略的例子,粗略地扩展了粗略的例子:
$ cat t1648.cu
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
__device__ float my_common(float *d, int width, unsigned int x, unsigned int y){
// 200 lines of common code...
return d[y *width +x];
}
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void WriteKernel(float *gIData, int width, int height,
cudaSurfaceObject_t outputSurface)
{
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;
// read from global memory and write to cuarray (via surface reference)
surf3Dwrite(my_common(gIData, width, x, y),
outputSurface, x*4, y, z, cudaBoundaryModeTrap);
}
__global__ void WriteKernel(float *gIData, int width, int height,
float *out)
{
// calculate coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
// read from global memory and write to global memory
out[y*width+x] = my_common(gIData, width, x, y);
}
__global__ void ReadKernel(float tval, cudaSurfaceObject_t outputSurface)
{
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;;
// read from global memory and write to cuarray (via surface reference)
float val;
surf3Dread(&val,
outputSurface, x*4, y, z, cudaBoundaryModeTrap);
if (val != tval) printf("oops\n");
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("starting...\n");
unsigned width = 256;
unsigned height = 256;
unsigned depth = 256;
unsigned int size = depth*width * height * sizeof(float);
// Allocate device memory for result
float *dData = NULL;
cudaMalloc((void **) &dData, size);
// Allocate array and copy image data
float *out, *h_out;
h_out = new float[height*width*depth];
float tval = 1.0f;
for (int i = 0; i < height*width*depth; i++) h_out[i] = tval;
cudaArray* d_cuArrSurf;
cudaSurfaceObject_t surfImage;
const cudaExtent extent = make_cudaExtent(width, height, depth);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_cuArrSurf, &channelDesc, extent);
// Bind to Surface
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = d_cuArrSurf;
cudaCreateSurfaceObject(&surfImage, &surfRes);
cudaMalloc(&out, size);
cudaMemcpy(out, h_out, size, cudaMemcpyHostToDevice);
dim3 dimBlock(8, 8, 8);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
// initialize array
cudaMemcpy3DParms p = {0};
p.srcPtr = make_cudaPitchedPtr(out, width*sizeof(out[0]), width, height);
p.srcPos = make_cudaPos(0,0,0);
p.dstArray = d_cuArrSurf;
p.dstPos = make_cudaPos(0,0,0);
p.extent = make_cudaExtent(width, height, 1);
p.kind = cudaMemcpyDefault;
for (int i = 0; i < depth; i++){
cudaMemcpy3D(&p);
p.dstPos = make_cudaPos(0,0, i+1);}
ReadKernel<<<dimGrid, dimBlock>>>(tval, surfImage);
WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, surfImage);
WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, out);
cudaDeviceSynchronize();
}
$ nvcc -o t1648 t1648.cu
$ cuda-memcheck ./t1648
========= CUDA-MEMCHECK
starting...
========= ERROR SUMMARY: 0 errors
$
上面的(总)范围是 256x256x256。所以我选择在 cudaMemcpy3D
的 256 次迭代中进行 256x256 传输(每次传输范围)(基本上每个 z 切片)。好像通过了sniff测试
我在这里使用 1 作为设备内存的初始化值 "just because"。如果你想让它更快并初始化为零,跳过主机->设备复制,只使用 cudaMemset 将线性内存(3D 传输的源)初始化为零。
假设您有一个 cuArray
用于绑定表面对象。
形式的东西:
// These are inputs to a function really.
cudaArray* d_cuArrSurf
cudaSurfaceObject_t * surfImage;
const cudaExtent extent = make_cudaExtent(width, height, depth);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_cuArrSurf, &channelDesc, extent);
// Bind to Surface
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = d_cuArrSurf;
cudaCreateSurfaceObject(surfImage, &surfRes);
现在,我想将此 cuArray
初始化为零。显然 cuArray
类型的对象没有 memset
。最好的方法是什么?也许有多种选择,有些可能有更好或更差的特性。这些选项有哪些?
我能想到
分配和归零主机内存并使用
cudaMemcpy3D()
复制它。创建初始化内核并用
surf3Dwrite()
写入
Would it be possible for you to show an example of those lines?
这是一个粗略的例子,粗略地扩展了
$ cat t1648.cu
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
__device__ float my_common(float *d, int width, unsigned int x, unsigned int y){
// 200 lines of common code...
return d[y *width +x];
}
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void WriteKernel(float *gIData, int width, int height,
cudaSurfaceObject_t outputSurface)
{
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;
// read from global memory and write to cuarray (via surface reference)
surf3Dwrite(my_common(gIData, width, x, y),
outputSurface, x*4, y, z, cudaBoundaryModeTrap);
}
__global__ void WriteKernel(float *gIData, int width, int height,
float *out)
{
// calculate coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
// read from global memory and write to global memory
out[y*width+x] = my_common(gIData, width, x, y);
}
__global__ void ReadKernel(float tval, cudaSurfaceObject_t outputSurface)
{
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z*blockDim.z + threadIdx.z;;
// read from global memory and write to cuarray (via surface reference)
float val;
surf3Dread(&val,
outputSurface, x*4, y, z, cudaBoundaryModeTrap);
if (val != tval) printf("oops\n");
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("starting...\n");
unsigned width = 256;
unsigned height = 256;
unsigned depth = 256;
unsigned int size = depth*width * height * sizeof(float);
// Allocate device memory for result
float *dData = NULL;
cudaMalloc((void **) &dData, size);
// Allocate array and copy image data
float *out, *h_out;
h_out = new float[height*width*depth];
float tval = 1.0f;
for (int i = 0; i < height*width*depth; i++) h_out[i] = tval;
cudaArray* d_cuArrSurf;
cudaSurfaceObject_t surfImage;
const cudaExtent extent = make_cudaExtent(width, height, depth);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_cuArrSurf, &channelDesc, extent);
// Bind to Surface
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = d_cuArrSurf;
cudaCreateSurfaceObject(&surfImage, &surfRes);
cudaMalloc(&out, size);
cudaMemcpy(out, h_out, size, cudaMemcpyHostToDevice);
dim3 dimBlock(8, 8, 8);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
// initialize array
cudaMemcpy3DParms p = {0};
p.srcPtr = make_cudaPitchedPtr(out, width*sizeof(out[0]), width, height);
p.srcPos = make_cudaPos(0,0,0);
p.dstArray = d_cuArrSurf;
p.dstPos = make_cudaPos(0,0,0);
p.extent = make_cudaExtent(width, height, 1);
p.kind = cudaMemcpyDefault;
for (int i = 0; i < depth; i++){
cudaMemcpy3D(&p);
p.dstPos = make_cudaPos(0,0, i+1);}
ReadKernel<<<dimGrid, dimBlock>>>(tval, surfImage);
WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, surfImage);
WriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, out);
cudaDeviceSynchronize();
}
$ nvcc -o t1648 t1648.cu
$ cuda-memcheck ./t1648
========= CUDA-MEMCHECK
starting...
========= ERROR SUMMARY: 0 errors
$
上面的(总)范围是 256x256x256。所以我选择在 cudaMemcpy3D
的 256 次迭代中进行 256x256 传输(每次传输范围)(基本上每个 z 切片)。好像通过了sniff测试
我在这里使用 1 作为设备内存的初始化值 "just because"。如果你想让它更快并初始化为零,跳过主机->设备复制,只使用 cudaMemset 将线性内存(3D 传输的源)初始化为零。