CUDA:使用 CUSPARSE csrmv() 例程的映射错误

CUDA: Mapping Error using CUSPARSE csrmv() routine

我目前正在尝试使用 CUSPARSE 库来加速 HPCG 的实施。但是,我似乎在设备数据分配过程中犯了某种错误。

这是导致 CUSPARSE_STATUS_MAPPING_ERROR 的代码段:
/*
 * Sparse matrix-vector product through the cuSPARSE csrmv routine: hands the
 * CSR arrays stored in A_crs_d together with the vectors x_d and y_d to
 * cusparseDcsrmv (when DOUBLE is defined) or cusparseScsrmv (otherwise),
 * non-transposed, with alpha = 1 and beta = 0.
 *
 * NOTE(review): the snippet stops at the #endif; the return statement and the
 * closing brace of the function are not shown.
 * NOTE(review): cuspHandle is not declared here -- presumably a file-scope
 * cusparseHandle_t created elsewhere; confirm.
 */
int HPC_sparsemv( CRS_Matrix *A_crs_d, 
      FP * x_d, FP * y_d)
{
/* scaling factors; passed to cuSPARSE by address */
FP alpha = 1.0f;
FP beta = 0.0f;

/* local aliases for the three CSR arrays held by the matrix struct */
FP* vals = A_crs_d->vals;
int* inds = A_crs_d->col_ind;
int* row_ptr = A_crs_d->row_ptr;

/*generate Matrix descriptor for SparseMV computation*/
/* NOTE(review): the descriptor is created on every call and no matching
   destroy call is visible in this snippet */
cusparseMatDescr_t matDescr;
cusparseCreateMatDescr(&matDescr);

cusparseStatus_t status;

/*hand off control to CUSPARSE routine*/

#ifdef DOUBLE

status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
    A_crs_d->ncols,A_crs_d->nnz, &alpha, matDescr, vals, row_ptr, 
    inds, x_d, &beta, y_d); 


#else

/* NOTE(review): this branch passes `col_ind`, but the only column-index local
   declared above is `inds`; unless a `col_ind` exists at file scope this
   branch does not compile */
status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows, 
            A_crs_d->ncols,A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
            col_ind, x_d, &beta, y_d); 

#endif

注意:FP 是由条件编译保护包装的 typedef,这意味着它在编译时被评估为 float 或 double 别名。

这里是处理数据分配的函数:

/*
 * Allocates device buffers for the work vectors and for the CRS matrix
 * arrays, copies b, x and the arrays of the matrix pointed to by CRS to the
 * device, and fills a pinned host-side CRS_Matrix with the resulting device
 * pointers and the matrix dimensions.
 *
 * Returns the bitwise OR of every checkCuda result cast to int.
 *
 * NOTE(review): r_d, p_d, Ap_d, b_d, x_d and A_crs_d are received by value,
 * so the addresses written by cudaMalloc / cudaMallocHost end up in this
 * function's local copies and are not visible to the caller.
 * NOTE(review): CRS is not declared in this snippet -- presumably a
 * file-scope pointer to the host matrix; confirm.
 */
int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP* b_d, const FP* const b, FP * x_d, FP * const x, 
    struct CRS_Matrix* A_crs_d, int nrows, int ncols, int nnz){

std::cout << "Beginning device allocation..." << std::endl; 

/* byte sizes of a row-length vector, a column-length vector and the value array */
int size_r = nrows * sizeof(FP);
int size_c = ncols * sizeof(FP);
int size_nnz = nnz * sizeof(FP);

/* accumulates the status of every CUDA call below */
int allocStatus = 0;

/*device alloc r_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );



/*device alloc p_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );


/*device alloc Ap_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );


/*device alloc b_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &b_d, size_r ) );
allocStatus |= (int) checkCuda( cudaMemcpy(b_d, b, size_r, cudaMemcpyHostToDevice));

/*device alloc x_d*/
/* NOTE(review): x_d is sized with size_r (rows); if x is meant to hold one
   entry per matrix column this should be size_c -- confirm */
allocStatus |= (int) checkCuda( cudaMalloc((void **) &x_d, size_r ) );
allocStatus |= (int) checkCuda( cudaMemcpy(x_d, x, size_r, cudaMemcpyHostToDevice));


/*device alloc A_crs_d*/
/* nonzero values: nnz entries of FP */
FP * valtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );


/* column indices: nnz ints */
int * indtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, nnz* sizeof(int)) );
allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind, 
nnz * sizeof(int) , cudaMemcpyHostToDevice) );


/* row pointers: nrows + 1 ints */
int * rowtmp; 
allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp,  (nrows + 1) * sizeof(int)) );
allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr, 
(nrows + 1) * sizeof(int), cudaMemcpyHostToDevice) );


/* the struct itself lives in pinned host memory; only its array members
   point to device memory */
allocStatus |= (int) checkCuda( cudaMallocHost( &A_crs_d, sizeof(CRS_Matrix)) );

A_crs_d->vals = valtmp;
A_crs_d->col_ind = indtmp;
A_crs_d->row_ptr = rowtmp;

/* dimensions are taken from CRS, not from the nrows/ncols/nnz parameters */
A_crs_d->nrows = CRS->nrows;
A_crs_d->ncols = CRS->ncols;
A_crs_d->nnz = CRS->nnz;

std::cout << "Device allocation done." << std::endl;

return  allocStatus;
}

在我第一次查阅 StackOverflow 时,我发现了其他人发布的这个已解决的问题:Cusparse status mapping error while using cuda constant memory

但是,由于我没有在传递给 csrmv() 的参数上使用常量内存,因此无法解决我的问题。我还检查了数据完整性,设备上的 CRS_Matrix 与主机内存中的原始数据完全匹配。

我对这个问题一头雾水,在 CUDA 工具包文档中找不到任何表明问题的信息,因此非常感谢您的帮助。

提前致谢。

您显示的代码中存在一些错误。

  1. 不能将指针参数按值传递给例程,在例程内对该指针执行 cudaMalloc 操作,然后期望结果出现在调用环境中。您对传递给 cudaAlloc 的 x_d、b_d 和 A_crs_d(后者使用 cudaMallocHost)参数正是这样做的。一种可能的修复方法是在例程中将这些参数作为双指针 (**) 参数处理,并将指针的地址传递给例程。这样修改后的指针值才能出现在调用环境中。这其实是一个 C 语言的正确编码问题,并不特定于 CUDA。

  2. 至少从 cudaAlloc 来看,您似乎打算实现 Ax=b。在这种情况下,x 向量的长度是 A 的列数,b 向量的长度是 A 的行数。在您的 cudaAlloc 例程中,您将这两者都按 A 的行数分配,这是不正确的。这也会影响后续 cudaMemcpy 操作的大小。

您显示的代码似乎仅针对 double 情况进行了测试,因为您在两次调用中传递的列索引(column index)参数有所不同(大概分别对应 float 与 double)。无论如何,我已经围绕您所展示的内容(针对 double 情况)构建了一个完整的代码,加上上述更改,它运行时没有错误,并为我产生了正确的结果:

$ cat t1216.cu
#include <cusparse.h>
#include <iostream>

// Stand-in for an error-checking helper: evaluates to its argument unchanged.
// The argument is parenthesized so the expansion stays a single operand no
// matter what expression is passed in or what surrounds the macro call.
#define checkCuda(x) (x)

// FP is the floating-point type used throughout the file: float when
// USE_FLOAT is defined, double otherwise. DOUBLE is the macro HPC_sparsemv
// tests to choose between the D and S cuSPARSE routines, so it is kept in
// step with FP: removed for float builds, defined (once) for double builds.
#ifdef USE_FLOAT
#undef DOUBLE
typedef float FP;
#else
#ifndef DOUBLE
#define DOUBLE
#endif
typedef double FP;
#endif

// Sparse matrix in compressed row storage: three parallel arrays plus the
// matrix dimensions.
struct CRS_Matrix {
  FP  *vals;     // nonzero values
  int *col_ind;  // column index of each nonzero
  int *row_ptr;  // row offsets into vals / col_ind
  int  ncols;    // number of columns
  int  nnz;      // number of nonzeros
  int  nrows;    // number of rows
};

// File-scope pointer to the host-side matrix that cudaAlloc copies from.
CRS_Matrix *CRS;

// cuSPARSE library handle shared by the whole file: created in main() with
// cusparseCreate and passed to the csrmv call in HPC_sparsemv.
cusparseHandle_t cuspHandle;

/*
 * Sparse matrix-vector product y_d = A * x_d through the cuSPARSE csrmv
 * routine (non-transposed, alpha = 1, beta = 0).
 *
 * A_crs_d : matrix in CSR form; its vals / col_ind / row_ptr arrays are
 *           handed to cuSPARSE unchanged
 * x_d     : input vector
 * y_d     : output vector; overwritten, since beta = 0
 *
 * Returns the cusparseStatus_t of the first cuSPARSE call that failed, cast
 * to int (0 is CUSPARSE_STATUS_SUCCESS).
 *
 * Uses the file-scope handle cuspHandle, which must have been created with
 * cusparseCreate before this function is called.
 */
int HPC_sparsemv( CRS_Matrix *A_crs_d,
      FP * x_d, FP * y_d)
{
  /* scaling factors; cuSPARSE takes them by address */
  const FP alpha = 1.0f;
  const FP beta  = 0.0f;

  FP*  vals    = A_crs_d->vals;
  int* inds    = A_crs_d->col_ind;
  int* row_ptr = A_crs_d->row_ptr;

  /* generate matrix descriptor for the SparseMV computation; the defaults
     set by cusparseCreateMatDescr are used as-is */
  cusparseMatDescr_t matDescr;
  cusparseStatus_t status = cusparseCreateMatDescr(&matDescr);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    return (int)status;
  }

  /* hand off control to the cuSPARSE routine matching FP; both branches pass
     the same column-index array */
#ifdef DOUBLE
  status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
      A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
      inds, x_d, &beta, y_d);
#else
  status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
      A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
      inds, x_d, &beta, y_d);
#endif

  /* the descriptor is created per call, so release it per call; a csrmv
     failure takes precedence over a failure to destroy the descriptor */
  cusparseStatus_t destroyStatus = cusparseDestroyMatDescr(matDescr);
  if (status == CUSPARSE_STATUS_SUCCESS) {
    status = destroyStatus;
  }
  return (int)status;
}



/*
 * Allocates device buffers for the vectors and the CRS matrix arrays, copies
 * the host data to the device, and builds a pinned host-side CRS_Matrix whose
 * array members point to the device copies.
 *
 * r_d, p_d, Ap_d : NOTE(review): received by value, so the buffers allocated
 *                  for them here never reach the caller and are leaked.
 *                  Fixing that needs FP** parameters (as done for b_d and
 *                  x_d), which would change the interface; left as-is.
 * b_d            : out; receives a device buffer of nrows FP filled from b
 * b              : host vector, nrows entries
 * x_d            : out; receives a device buffer of ncols FP filled from x
 * x              : host vector, ncols entries
 * A_crs_d        : out; receives the pinned host struct, or NULL if that
 *                  allocation failed
 * nrows, ncols, nnz : dimensions used to size the buffers
 *
 * The matrix arrays and the dimensions stored in *A_crs_d are read from the
 * file-scope host matrix CRS.
 *
 * Returns the bitwise OR of the status of every CUDA call, cast to int;
 * 0 means every call succeeded.
 */
int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP** b_d, const FP* const b, FP ** x_d, FP * const x,
    struct CRS_Matrix** A_crs_d, int nrows, int ncols, int nnz){

  std::cout << "Beginning device allocation..." << std::endl;

  /* byte counts are computed in size_t: the int products used previously
     narrow the result and can overflow for large matrices */
  const size_t size_r   = (size_t)nrows * sizeof(FP);
  const size_t size_c   = (size_t)ncols * sizeof(FP);
  const size_t size_nnz = (size_t)nnz * sizeof(FP);
  const size_t size_ind = (size_t)nnz * sizeof(int);
  const size_t size_ptr = ((size_t)nrows + 1) * sizeof(int);

  int allocStatus = 0;

  /* give the outputs a defined value even if an allocation below fails */
  *b_d = NULL;
  *x_d = NULL;
  *A_crs_d = NULL;

  /*device alloc r_d*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );

  /*device alloc p_d*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );

  /*device alloc Ap_d*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );

  /*device alloc b_d: one entry per matrix row*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) b_d, size_r ) );
  allocStatus |= (int) checkCuda( cudaMemcpy(*b_d, b, size_r, cudaMemcpyHostToDevice));

  /*device alloc x_d: one entry per matrix column*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) x_d, size_c ) );
  allocStatus |= (int) checkCuda( cudaMemcpy(*x_d, x, size_c, cudaMemcpyHostToDevice));

  /*device alloc A_crs_d*/
  /* temporaries start as NULL so that a failed cudaMalloc leaves a pointer
     that the following copy rejects instead of an indeterminate address */
  FP * valtmp = NULL;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
  allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );

  int * indtmp = NULL;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, size_ind) );
  allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind, size_ind, cudaMemcpyHostToDevice) );

  int * rowtmp = NULL;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp, size_ptr) );
  allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr, size_ptr, cudaMemcpyHostToDevice) );

  /* the struct itself lives in pinned host memory; only its array members
     point to device memory */
  const int hostStatus = (int) checkCuda( cudaMallocHost( A_crs_d, sizeof(CRS_Matrix)) );
  allocStatus |= hostStatus;
  if (hostStatus != 0) {
    /* no struct to hand the arrays over to: release them (best effort) and
       report the failure rather than dereferencing an invalid pointer */
    cudaFree(valtmp);
    cudaFree(indtmp);
    cudaFree(rowtmp);
    *A_crs_d = NULL;
    return allocStatus;
  }

  (*A_crs_d)->vals = valtmp;
  (*A_crs_d)->col_ind = indtmp;
  (*A_crs_d)->row_ptr = rowtmp;

  (*A_crs_d)->nrows = CRS->nrows;
  (*A_crs_d)->ncols = CRS->ncols;
  (*A_crs_d)->nnz = CRS->nnz;

  std::cout << "Device allocation done." << std::endl;

  return  allocStatus;

}

// Test driver: builds a 5x5 tridiagonal matrix in CSR form, uploads it with
// cudaAlloc, multiplies it by a vector of ones through HPC_sparsemv and
// prints the product. Returns 0 on success and 1 if any step failed.
int main(){

  CRS = (struct CRS_Matrix *)malloc(sizeof(struct CRS_Matrix));
  if (CRS == NULL) {
    std::cerr << "host allocation of CRS failed" << std::endl;
    return 1;
  }
  if (cusparseCreate(&cuspHandle) != CUSPARSE_STATUS_SUCCESS) {
    std::cerr << "cusparseCreate failed" << std::endl;
    free(CRS);
    return 1;
  }

  // simple test matrix
  #define M0_M 5
  #define M0_N 5
  FP m0_csr_vals[] = {2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f};
  int   m0_col_idxs[] = {   0,    1,    0,    1,    2,    1,    2,    3,    2,    3,    4,    3,    4};
  int   m0_row_ptrs[] = {   0, 2, 5, 8, 11, 13};
  FP m0_d[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  int m0_nnz = 13;

  // every pointer starts as NULL: r_d, p_d and Ap_d are passed by value and
  // must not be read while indeterminate, and the cleanup below relies on
  // NULL for anything that was never allocated
  FP *r_d = NULL, *p_d = NULL, *Ap_d = NULL, *b_d = NULL, *x_d = NULL;
  // b has one entry per matrix row; value-initialized so that defined data
  // is copied to the device
  FP *b = new FP[M0_M]();
  CRS_Matrix *A_crs_d = NULL;
  CRS->vals = m0_csr_vals;
  CRS->col_ind = m0_col_idxs;
  CRS->row_ptr = m0_row_ptrs;
  CRS->nrows = M0_M;
  CRS->ncols = M0_N;
  CRS->nnz = m0_nnz;

  int exitCode = 0;

  // Ax = b
  // r_d, p_d, Ap_d ??
  int stat = cudaAlloc(r_d, p_d, Ap_d, &b_d, b, &x_d, m0_d, &A_crs_d, M0_M, M0_N, m0_nnz);
  std::cout << "cudaAlloc status: " << stat << std::endl;
  if (stat != 0) {
    exitCode = 1;
  } else {
    stat = HPC_sparsemv( A_crs_d, x_d, b_d);
    std::cout << "HPC_sparsemv status: " << stat << std::endl;
    if (stat != 0) {
      exitCode = 1;
    } else {
      FP *results = new FP[M0_M];
      // blocking copy, so the product is complete once it returns
      cudaError_t copyErr = cudaMemcpy(results, b_d, M0_M*sizeof(FP), cudaMemcpyDeviceToHost);
      if (copyErr != cudaSuccess) {
        std::cerr << "result copy failed: " << cudaGetErrorString(copyErr) << std::endl;
        exitCode = 1;
      } else {
        std::cout << "Results:" << std::endl;
        for (int i = 0; i < M0_M; i++) std::cout << results[i] << std::endl;
      }
      delete[] results;
    }
  }

  // release everything this program owns; cudaFree(NULL) is a no-op
  if (A_crs_d != NULL) {
    cudaFree(A_crs_d->vals);
    cudaFree(A_crs_d->col_ind);
    cudaFree(A_crs_d->row_ptr);
    cudaFreeHost(A_crs_d);
  }
  cudaFree(b_d);
  cudaFree(x_d);
  cusparseDestroy(cuspHandle);
  delete[] b;
  free(CRS);
  return exitCode;
}

$ nvcc -o t1216 t1216.cu -lcusparse
t1216.cu(153): warning: variable "r_d" is used before its value is set

t1216.cu(153): warning: variable "p_d" is used before its value is set

t1216.cu(153): warning: variable "Ap_d" is used before its value is set

t1216.cu(153): warning: variable "r_d" is used before its value is set

t1216.cu(153): warning: variable "p_d" is used before its value is set

t1216.cu(153): warning: variable "Ap_d" is used before its value is set

$ cuda-memcheck ./t1216
========= CUDA-MEMCHECK
Beginning device allocation...
Device allocation done.
cudaAlloc status: 0
HPC_sparsemv status: 0
Results:
3
4
4
4
3
========= ERROR SUMMARY: 0 errors
$

备注:

  1. 不清楚您在 cudaAlloc 例程中对 r_d、p_d 和 Ap_d 的意图。我让它们保持原样。但是,如果您打算将它们用于某些用途,它们可能会受到我在上面第 1 点中描述的问题的影响。

  2. 如前所述,在 HPC_sparsemv 中传递给 cusparse 例程的参数上,float 和 double 两个版本的代码似乎不一致。特别是,列索引参数不匹配,double 版本在我看来是合理的,所以我使用了它。如果您使用 float,您可能需要修改该参数。

  3. 以后我建议您提供完整的代码,正如我所展示的那样,以演示失败。它并没有比你已经展示的代码多多少,而且它会让其他人更容易帮助你。