CUDA: cuSolver raises an exception
I am trying to use the cuSolver library to solve a set of linear equations, but a very strange exception is raised. The code uses only one function from the library; the rest is memory allocation and memory copying. The function is:
cusolverSpScsrlsvcholHost(
cusolverSpHandle_t handle, int m, int nnz,
const cusparseMatDescr_t descrA, const float *csrVal,
const int *csrRowPtr, const int *csrColInd, const float *b,
float tol, int reorder, float *x, int *singularity);
I think my problem might be in the tol, reorder, and singularity parameters, since the rest are matrix parameters. Here is the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <cusparse.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <cusolverSp.h>
int main()
{
//initialize our test cases
const int size = 3;
int nnz = 6 ;
int sing = -1 ;
//float values[] = {0,0,0,0} ;
float values[] = {1,2,3,4,5,6} ;
int colIdx[] = {0,0,1,0,1,2};
int rowPtr[] = {0, 1,3,7};
float x[] = {4,-6,7};
float y[3]= {0,0,0} ;
float *dev_values = 0 ;
int *dev_rowPtr = 0 ;
int *dev_colIdx = 0 ;
float *dev_x = 0 ;
float *dev_y = 0 ;
cusolverSpHandle_t solver_handle ;
cusolverSpCreate(&solver_handle) ;
cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
// Choose which GPU to run on, change this on a multi-GPU system.
cudaSetDevice(0);
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// Allocate GPU buffers for three vectors (two input, one output) .
cudaMalloc((void**)&dev_x, size * sizeof(float));
cudaMalloc((void**)&dev_y, size * sizeof(float));
cudaMalloc((void**)&dev_values, nnz * sizeof(float));
cudaMalloc((void**)&dev_rowPtr, (size + 1) * sizeof(int));
cudaMalloc((void**)&dev_colIdx, nnz * sizeof(int));
//Memcpy
cudaMemcpyAsync(dev_x, x, size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyAsync(dev_values, values, nnz * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyAsync(dev_rowPtr, rowPtr, (size + 1) * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(dev_colIdx, colIdx, nnz * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(dev_y, y, size * sizeof(float), cudaMemcpyHostToDevice);
cusolverSpScsrlsvluHost(solver_handle, size, nnz, descr, dev_values, dev_rowPtr, dev_colIdx, dev_y, 0,0, dev_x, &sing);
cudaMemcpyAsync(y, dev_y, size*sizeof(float), cudaMemcpyDeviceToHost );
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf ("Time for the kernel: %f ms\n", time);
printf("%f\n",y[0]);
printf("%f\n",y[1]);
printf("%f\n",y[2]);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaDeviceReset();
cudaFree(dev_x);
cudaFree(dev_y);
cudaFree(dev_values);
cudaFree(dev_rowPtr);
cudaFree(dev_colIdx);
return 1;
}
There are at least 3 problems in your code:
You are using the Host variant of the function: cusolverSpScsrlsvluHost(). If you review the documentation for cusolverSpScsrlsvluHost, you'll find that for the Host MemSpace the function expects all arguments and pointer arguments to be host-based. But you are passing device pointers to the function. Doing so will lead to a segfault. For all of the arguments such as dev_values, you will need to replace those with the equivalent host data pointers (e.g. values in place of dev_values).
Your CSR matrix format is not correct. This line:
int rowPtr[] = {0, 1,3,7};
should be this:
int rowPtr[] = {0, 1,3,6};
The proper row pointer index that points one past the last element is 6, not 7, since the 6 actual elements are numbered 0..5. This problem can also lead to a segfault.
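To make the layout concrete (my own illustration of the data already in your code): row i occupies entries rowPtr[i] through rowPtr[i+1]-1 of values/colIdx, so with the corrected rowPtr the three arrays describe this dense 3x3 matrix:
    | 1 0 0 |
A = | 2 3 0 |
    | 4 5 6 |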
You have passed y and x incorrectly (i.e. reversed) to cusolverSpScsrlsvluHost(). Since you have placed nonzero values in x, presumably you intend that to be your RHS vector. That vector is named b in the documentation, and it is the first vector to be passed. Your y vector is presumably the solution vector, and it is the last vector passed in the parameter order (it is named x in the documentation).
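Putting the pointer fix and the argument-order fix together, the call itself should look something like this (a sketch using the variable names from your code; the comments map them to the parameter names in the documentation):
cusolverSpScsrlsvluHost(solver_handle, size, nnz, descr,
                        values, rowPtr, colIdx, // host CSR arrays, not dev_*
                        x,                      // "b" in the docs: the RHS
                        0.0f, 0,                // tol, reorder
                        y,                      // "x" in the docs: the solution
                        &sing);                 // singularity flag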
I also recommend using proper error checking.
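For example, a status-checking macro along these lines (a minimal sketch; the name CHECK_CUSOLVER is my own, not part of the library):
#include <stdio.h>
#include <stdlib.h>
#include <cusolverSp.h>
// abort with file/line context if a cuSolver call does not return success
#define CHECK_CUSOLVER(call) do { \
    cusolverStatus_t s_ = (call); \
    if (s_ != CUSOLVER_STATUS_SUCCESS) { \
        fprintf(stderr, "cuSolver error %d at %s:%d\n", (int)s_, __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } } while (0)
// usage: CHECK_CUSOLVER(cusolverSpCreate(&solver_handle));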
The following code has these problems addressed, and produces a sensible result:
$ cat t979.cu
#include <cusparse.h>
#include <stdio.h>
#include <cusolverSp.h>
#include <assert.h>
int main()
{
//initialize our test cases
const int size = 3;
const int nnz = 6 ;
int sing = 0;
//float values[] = {0,0,0,0} ;
float values[nnz] = {1,2,3,4,5,6} ;
int colIdx[nnz] = {0,0,1,0,1,2};
int rowPtr[size+1] = {0, 1,3,6};
float x[size] = {4,-6,7};
float y[size]= {0,0,0} ;
cusolverStatus_t cso;
cusolverSpHandle_t solver_handle ;
cso = cusolverSpCreate(&solver_handle) ;
assert(cso == CUSOLVER_STATUS_SUCCESS);
cusparseStatus_t csp;
cusparseMatDescr_t descr = 0;
csp = cusparseCreateMatDescr(&descr);
assert(csp == CUSPARSE_STATUS_SUCCESS);
csp = cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
assert(csp == CUSPARSE_STATUS_SUCCESS);
csp = cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
assert(csp == CUSPARSE_STATUS_SUCCESS);
cso = cusolverSpScsrlsvluHost(solver_handle, size, nnz, descr, values, rowPtr, colIdx, x, 0.0,0, y, &sing);
assert(cso == CUSOLVER_STATUS_SUCCESS);
printf("%f\n",y[0]);
printf("%f\n",y[1]);
printf("%f\n",y[2]);
return 0;
}
$ nvcc -o t979 t979.cu -lcusolver -lcusparse
$ ./t979
4.000000
-4.666667
2.388889
$
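As a quick sanity check (my own addition, not part of the original answer), multiplying the CSR matrix by the printed solution on the host reproduces the original RHS {4, -6, 7}:
#include <stdio.h>
int main()
{
    // CSR data and solution values copied from the program above
    float values[] = {1,2,3,4,5,6};
    int colIdx[]   = {0,0,1,0,1,2};
    int rowPtr[]   = {0,1,3,6};
    float sol[]    = {4.000000f, -4.666667f, 2.388889f};
    for (int row = 0; row < 3; row++){
        float acc = 0.0f;
        // accumulate this row's nonzeros times the matching solution entries
        for (int j = rowPtr[row]; j < rowPtr[row+1]; j++)
            acc += values[j] * sol[colIdx[j]];
        printf("row %d: A*x = %f\n", row, acc); // expect 4, -6, 7
    }
    return 0;
}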
Also note that there is a fully worked CUDA sample code demonstrating proper usage of this function.