CUDA: Using device functors in kernels
I was trying to make a device functor that essentially performs (unoptimized) matrix-vector multiplication, like so:
namespace cusolve
{

template <class value_type,
          class matrix_type = value_type*,
          class vector_type = value_type*>
struct linear_operator
{
    const matrix_type matrix;
    const size_t width;

    __device__
    linear_operator(const matrix_type matrix, size_t width)
        : matrix(matrix), width(width) { }

    __device__
    void operator()(const vector_type x, vector_type x_out)
    {
        auto col = blockIdx.x * blockDim.x + threadIdx.x;
        auto row = blockIdx.y * blockDim.y + threadIdx.y;
        x_out[row] = 0;
        if (row < width)
        {
            for (size_t i = 0; i < width; i++)
            {
                x_out[row] += matrix[row * width + i] * x[i];
            }
        }
        return;
    }
};

} // namespace cusolve
So, this assumes that matrix, x, and x_out are device pointers. To test it, I tried calling it from a simple kernel:
__global__ void
operateKernel(double *d_matrix,
              double *d_vector, double *d_vector_out,
              size_t width)
{
    cusolve::linear_operator<double> matmul(d_matrix, width);
    matmul(d_vector, d_vector_out);
}
void
operate(double *matrix, double *vector, double *vector_out, size_t width)
{
    const dim3 blockConfig(16, 16);
    const size_t gridWidth = (size_t) ((double) width) / 16.0l;
    const dim3 gridConfig(gridWidth, gridWidth);

    double *d_matrix, *d_vector, *d_vector_out;
    auto mem_vector = width * sizeof(double);
    auto mem_matrix = mem_vector * width;

    cudaMalloc((void **) &d_matrix, mem_matrix);
    cudaMalloc((void **) &d_vector, mem_vector);
    cudaMalloc((void **) &d_vector_out, mem_vector);

    cudaMemcpy(d_matrix, matrix, mem_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector, vector, mem_vector, cudaMemcpyHostToDevice);

    operateKernel<<<gridConfig, blockConfig>>>(d_matrix, d_vector, d_vector_out, width);

    cudaMemcpy(vector_out, d_vector_out, mem_vector, cudaMemcpyDeviceToHost);

    cudaFree(d_vector);
    cudaFree(d_matrix);
    cudaFree(d_vector_out);
}
However, when I call operate() from main() with a vector and matrix that are allocated and initialized to non-zero values, the output comes back all zeros. I've been banging my head against this for quite a while and haven't been able to figure out what I'm doing wrong.
P.S.: I'm deliberately trying to do this without Thrust, as a learning exercise.
Forgot to use ceil when computing the grid dimensions. With the truncating conversion, any width below 16 yields gridWidth == 0, which is an invalid launch configuration, so the kernel never runs and the output stays zero; a width that is not a multiple of 16 would instead leave the last rows uncomputed. The fix:
const size_t gridWidth = ceil( ((double) width) / 16.0l );
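
As a side note, the same round-up can be done in pure integer arithmetic, which avoids the cast through double entirely, and checking the launch result makes this class of failure visible instead of silent. A minimal sketch, assuming the same names as in operate() above:

#include <cstdio>

// Round-up integer division: equivalent to ceil(width / 16.0) for width > 0.
const size_t gridWidth = (width + 16 - 1) / 16;
const dim3 gridConfig(gridWidth, gridWidth);

operateKernel<<<gridConfig, blockConfig>>>(d_matrix, d_vector, d_vector_out, width);

// With the truncated division, width < 16 gives a zero-sized grid; the launch
// then fails (invalid configuration) without running, and d_vector_out is
// never written. Querying the error makes that failure visible.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

Unrelated to the launch geometry: x_out[row] = 0; in the functor executes before the row < width guard, so threads in the last block row can store past the end of x_out; moving that assignment inside the if avoids the out-of-bounds write.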