cuSOLVER - Device version of cusolverSpScsrlsvqr is much slower than host version
I have a sparse tridiagonal NxN matrix A, built by some rule, and I want to solve the system Ax=b. To do that I use cusolverSpScsrlsvqr() from the cuSolverSp module. Is it normal for the device version to be many times slower than cusolverSpScsrlsvqrHost() for large N? E.g., for N=2^14 the device time is 174.1 ms while the host time is 3.5 ms. I'm on an RTX 2060.

Code:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
using namespace std;
void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
    char const *str = "UNKNOWN STATUS";
    switch (status) {
        case CUSOLVER_STATUS_SUCCESS:
            str = "CUSOLVER_STATUS_SUCCESS";
            break;
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            str = "CUSOLVER_STATUS_NOT_INITIALIZED";
            break;
        case CUSOLVER_STATUS_ALLOC_FAILED:
            str = "CUSOLVER_STATUS_ALLOC_FAILED";
            break;
        case CUSOLVER_STATUS_INVALID_VALUE:
            str = "CUSOLVER_STATUS_INVALID_VALUE";
            break;
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            str = "CUSOLVER_STATUS_ARCH_MISMATCH";
            break;
        case CUSOLVER_STATUS_MAPPING_ERROR:
            str = "CUSOLVER_STATUS_MAPPING_ERROR";
            break;
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            str = "CUSOLVER_STATUS_EXECUTION_FAILED";
            break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            str = "CUSOLVER_STATUS_INTERNAL_ERROR";
            break;
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
            break;
        case CUSOLVER_STATUS_ZERO_PIVOT:
            str = "CUSOLVER_STATUS_ZERO_PIVOT";
            break;
    }
    cout << left << setw(30) << operation << " " << str << endl;
}
__global__ void fillAB(float *aValues, int *aRowPtrs, int *aColIdxs, float *b, int const n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    if (i == 0) {
        // thread 0 additionally writes the single remaining entry of the last row (row n)
        float xn = 10 * (n + 1);
        aValues[n * 3] = xn;
        aRowPtrs[0] = 0;
        aRowPtrs[n + 1] = n * 3 + 1;
        aColIdxs[n * 3] = n;
        b[n] = xn * 2;
    }
    // thread i writes the diagonal and super-diagonal of row i,
    // plus the sub-diagonal entry (column i) of row i+1
    float xi = 10 * (i + 1);
    aValues[i * 3 + 0] = xi;
    aValues[i * 3 + 1] = xi + 5;
    aValues[i * 3 + 2] = xi - 5;
    aColIdxs[i * 3 + 0] = i;
    aColIdxs[i * 3 + 1] = i + 1;
    aColIdxs[i * 3 + 2] = i;
    aRowPtrs[i + 1] = 2 + (i * 3);
    b[i] = xi * 2;
}
int main() {
    int const n = (int)pow(2, 14); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
    int const valCount = n * 3 - 2;
    float *const aValues = new float[valCount];
    int *const aRowPtrs = new int[n + 1];
    int *const aColIdxs = new int[valCount];
    float *const b = new float[n];
    float *const x = new float[n];
    float *dev_aValues;
    int *dev_aRowPtrs;
    int *dev_aColIdxs;
    float *dev_b;
    float *dev_x;
    int aValuesSize = sizeof(float) * valCount;
    int aRowPtrsSize = sizeof(int) * (n + 1);
    int aColIdxsSize = sizeof(int) * valCount;
    int bSize = sizeof(float) * n;
    int xSize = sizeof(float) * n;
    cudaMalloc((void**)&dev_aValues, aValuesSize);
    cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
    cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
    cudaMalloc((void**)&dev_b, bSize);
    cudaMalloc((void**)&dev_x, xSize);
    fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
    cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);
    cusolverSpHandle_t handle;
    checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
    cusparseMatDescr_t aDescr;
    cusparseCreateMatDescr(&aDescr);
    cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    int singularity;
    cusolverStatus_t status;
    cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
    cudaDeviceSynchronize();
    auto t0 = chrono::high_resolution_clock::now();
    for (int i = 0; i < 10; ++i)
        ////////////////////// CUSOLVER HERE //////////////////////
        status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
        //status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
        ///////////////////////////////////////////////////////////
    cudaDeviceSynchronize();
    auto t1 = chrono::high_resolution_clock::now();
    checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
    checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
    cout << "System solved: " << setw(20) << fixed << right << setprecision(3) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;
    cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
    /*for (int i = 0; i < n; ++i) {
        cout << " " << x[i];
    }*/
    cudaFree(dev_aValues);
    cudaFree(dev_aRowPtrs);
    cudaFree(dev_aColIdxs);
    cudaFree(dev_b);
    cudaFree(dev_x);
    delete[] aValues;
    delete[] aRowPtrs;
    delete[] aColIdxs;
    delete[] b;
    delete[] x;
    cudaDeviceReset();
    return 0;
}
My guess is that the problem here is that it's a tridiagonal matrix. I suspect that may eliminate some of the parallelism the GPU cusolver routine benefits from. I don't really have any support for that statement other than statements like this one that I read in the cusparse docs:

For example, a tridiagonal matrix has no parallelism.

I can't say exactly what that means, except that it suggests to me that a different approach may be needed for tridiagonal matrices. And cusparse provides solvers specifically for the tridiagonal case.
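The core call pattern for those solvers is short: query a workspace size, allocate it, then solve in place (the right-hand side is overwritten with the solution). Here is a minimal sketch of just that sequence, assuming the three diagonals and the right-hand side already sit in separate device arrays of length n; the helper name solveTridiagonal and that layout are mine, not part of the library:

#include <cuda_runtime.h>
#include <cusparse_v2.h>

// Solve a tridiagonal system in place: dev_b holds b on entry, x on exit.
// dev_dl/dev_d/dev_du are the lower/main/upper diagonals, each of length n.
cusparseStatus_t solveTridiagonal(float *dev_dl, float *dev_d, float *dev_du,
                                  float *dev_b, int n) {
    cusparseHandle_t h;
    cusparseCreate(&h);
    size_t bufferSize = 0;
    // step 1: ask the library how much workspace the solve needs
    cusparseSgtsv2_nopivot_bufferSizeExt(h, n, 1, dev_dl, dev_d, dev_du, dev_b, n, &bufferSize);
    void *dev_buffer;
    cudaMalloc(&dev_buffer, bufferSize);
    // step 2: solve; with a single right-hand side, ldb is just n
    cusparseStatus_t s = cusparseSgtsv2_nopivot(h, n, 1, dev_dl, dev_d, dev_du, dev_b, n, dev_buffer);
    cudaFree(dev_buffer);
    cusparseDestroy(h);
    return s;
}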
If we use one of those solvers, we can get results on your test case faster than your particular host cusolver example. Here is a full example:
$ cat t48.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <cassert>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
#ifdef USE_DOUBLE
#define START 3
#define TOL 0.000001
#define THR 0.00001
typedef double mt;
#else
#define START 0
#define TOL 0.01
#define THR 0.1
typedef float mt;
#endif
using namespace std;
void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
    char const *str = "UNKNOWN STATUS";
    switch (status) {
        case CUSOLVER_STATUS_SUCCESS:
            str = "CUSOLVER_STATUS_SUCCESS";
            break;
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            str = "CUSOLVER_STATUS_NOT_INITIALIZED";
            break;
        case CUSOLVER_STATUS_ALLOC_FAILED:
            str = "CUSOLVER_STATUS_ALLOC_FAILED";
            break;
        case CUSOLVER_STATUS_INVALID_VALUE:
            str = "CUSOLVER_STATUS_INVALID_VALUE";
            break;
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            str = "CUSOLVER_STATUS_ARCH_MISMATCH";
            break;
        case CUSOLVER_STATUS_MAPPING_ERROR:
            str = "CUSOLVER_STATUS_MAPPING_ERROR";
            break;
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            str = "CUSOLVER_STATUS_EXECUTION_FAILED";
            break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            str = "CUSOLVER_STATUS_INTERNAL_ERROR";
            break;
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
            break;
        case CUSOLVER_STATUS_ZERO_PIVOT:
            str = "CUSOLVER_STATUS_ZERO_PIVOT";
            break;
    }
    cout << left << setw(30) << operation << " " << str << endl;
}
__global__ void fillAB(mt *aValues, int *aRowPtrs, int *aColIdxs, mt *b, int const n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    if (i == 0) {
        mt xn = 10 * (n + 1);
        aValues[n * 3] = xn;
        aRowPtrs[0] = 0;
        aRowPtrs[n + 1] = n * 3 + 1;
        aColIdxs[n * 3] = n;
        b[n] = xn * 2;
    }
    mt xi = 10 * (i + 1);
    aValues[i * 3 + 0] = xi;
    aValues[i * 3 + 1] = xi + 5;
    aValues[i * 3 + 2] = xi - 5;
    aColIdxs[i * 3 + 0] = i;
    aColIdxs[i * 3 + 1] = i + 1;
    aColIdxs[i * 3 + 2] = i;
    aRowPtrs[i + 1] = 2 + (i * 3);
    b[i] = xi * 2;
}
__global__ void filld3(mt *d, mt *du, mt *dl, mt *aValues, mt *b, mt *b2, const int n){
    // unpack the CSR value array into the three diagonal arrays gtsv2 expects
    int i = blockDim.x*blockIdx.x+threadIdx.x;
    if ((i > 0) && (i < n-1)){
        dl[i] = aValues[i*3 - 1];
        d[i]  = aValues[i*3];
        du[i] = aValues[i*3+1];
    }
    if (i == 0){
        dl[0] = 0;
        d[0]  = aValues[0];
        du[0] = aValues[1];}
    if (i == (n-1)){
        dl[i] = aValues[i*3-1];
        d[i]  = aValues[i*3];
        du[i] = 0;}
    if (i < n) b2[i] = b[i]; // keep a copy of b; gtsv2 overwrites it with x
}
int main() {
    int const n = (int)pow(2, 14); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
    int const valCount = n * 3 - 2;
    mt *const aValues = new mt[valCount];
    int *const aRowPtrs = new int[n + 1];
    int *const aColIdxs = new int[valCount];
    mt *const b = new mt[n];
    mt *const x = new mt[n];
    mt *const x2 = new mt[n];
    mt *dev_aValues;
    int *dev_aRowPtrs;
    int *dev_aColIdxs;
    mt *dev_b;
    mt *dev_x;
    mt *dev_b2, *dev_d, *dev_dl, *dev_du;
    int aValuesSize = sizeof(mt) * valCount;
    int aRowPtrsSize = sizeof(int) * (n + 1);
    int aColIdxsSize = sizeof(int) * valCount;
    int bSize = sizeof(mt) * n;
    int xSize = sizeof(mt) * n;
    cudaMalloc((void**)&dev_aValues, aValuesSize);
    cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
    cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
    cudaMalloc((void**)&dev_b, bSize);
    cudaMalloc((void**)&dev_x, xSize);
    cudaMalloc((void**)&dev_b2, bSize);
    cudaMalloc(&dev_d, n*sizeof(mt));
    cudaMalloc(&dev_dl, n*sizeof(mt));
    cudaMalloc(&dev_du, n*sizeof(mt));
    fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
    filld3<<<(n+1023)/1024,1024>>>(dev_d, dev_du, dev_dl, dev_aValues, dev_b, dev_b2, n);
    cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);
    cusolverSpHandle_t handle;
    checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
    cusparseMatDescr_t aDescr;
    cusparseCreateMatDescr(&aDescr);
    cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
    int singularity;
    cusolverStatus_t status;
    unsigned long long dt = dtime_usec(0);
#ifdef USE_DOUBLE
    cusolverSpDcsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
#else
    cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
#endif
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "time: " << dt/(float)USECPSEC << "s" << std::endl;
    auto t0 = chrono::high_resolution_clock::now();
    for (int i = 0; i < 10; ++i)
        ////////////////////// CUSOLVER HERE //////////////////////
#ifdef USE_DEVICE
#ifdef USE_DOUBLE
        status = cusolverSpDcsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
#else
        status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
#endif
#else
#ifdef USE_DOUBLE
        status = cusolverSpDcsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
#else
        status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
#endif
#endif
        ///////////////////////////////////////////////////////////
    cudaDeviceSynchronize();
    auto t1 = chrono::high_resolution_clock::now();
    checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
    checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
    cout << "System solved: " << setw(20) << fixed << right << setprecision(6) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;
    cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
    /*for (int i = 0; i < n; ++i) {
        cout << " " << x[i];
    }*/
    cusparseHandle_t csphandle;
    cusparseStatus_t cstat = cusparseCreate(&csphandle);
    assert(cstat == CUSPARSE_STATUS_SUCCESS);
    size_t bufferSize;
#ifdef USE_DOUBLE
    cstat = cusparseDgtsv2_nopivot_bufferSizeExt(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, &bufferSize);
#else
    cstat = cusparseSgtsv2_nopivot_bufferSizeExt(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, &bufferSize);
#endif
    assert(cstat == CUSPARSE_STATUS_SUCCESS);
    unsigned char *dev_buffer;
    cudaMalloc(&dev_buffer, bufferSize);
    dt = dtime_usec(0);
#ifdef USE_DOUBLE
    cstat = cusparseDgtsv2_nopivot(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, (void *)dev_buffer);
#else
    cstat = cusparseSgtsv2_nopivot(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, (void *)dev_buffer);
#endif
    if (cstat != CUSPARSE_STATUS_SUCCESS) {std::cout << "cusparse solve error: " << (int)cstat << std::endl;}
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "cusparse time: " << (dt*1000.f)/(float)USECPSEC << "ms" << std::endl;
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    cudaMemcpy(x2, dev_b2, xSize, cudaMemcpyDeviceToHost);
    for (int i = START; i < n; i++)
        if ((x[i] > THR) && (fabs((x[i] - x2[i])/x[i]) > TOL)) {
            std::cout << "mismatch at: " << i << " was: " << x2[i] << " should be: " << x[i] << std::endl;
            return 0;
        }
    for (int i = 0; i < 40; i++)
        std::cout << x2[i] << " " << x[i] << std::endl;
    cudaFree(dev_aValues);
    cudaFree(dev_aRowPtrs);
    cudaFree(dev_aColIdxs);
    cudaFree(dev_b);
    cudaFree(dev_x);
    delete[] aValues;
    delete[] aRowPtrs;
    delete[] aColIdxs;
    delete[] b;
    delete[] x;
    cudaDeviceReset();
    return 0;
}
$ nvcc -o t48 t48.cu -lcusparse -lcusolver
$ ./t48
cusolverSpCreate CUSOLVER_STATUS_SUCCESS
time: 0.202933s
cusolverSpScsrlsvqr CUSOLVER_STATUS_SUCCESS
cusolverSpDestroy CUSOLVER_STATUS_SUCCESS
System solved: 6.653404 ms
cusparse time: 0.089000ms
no error
-11243.155273 -11242.705078
7496.770508 7496.473145
-3747.185303 -3747.039551
0.685791 0.689445
2083.059570 2082.854004
-1892.308716 -1892.124756
306.474457 306.447662
1103.516846 1103.407104
-1271.085938 -1270.961060
334.883911 334.852417
711.941956 711.870911
-955.718140 -955.624390
321.378174 321.348175
506.580902 506.530060
-764.231689 -764.156799
300.298950 300.270935
382.335785 382.299347
-635.448120 -635.388000
279.217651 279.191864
300.164459 300.135559
-542.869019 -542.817688
259.955475 259.931702
242.390839 242.367310
-473.107758 -473.063080
242.806229 242.785751
199.916733 199.895645
-418.663696 -418.624115
227.637909 227.618652
167.604431 167.586487
-375.002411 -374.966827
214.208069 214.189835
142.353058 142.337738
-339.221130 -339.187653
202.273911 202.255341
122.184746 122.171494
-309.370209 -309.339600
191.615189 191.597580
105.783485 105.771858
-284.096802 -284.068604
182.047958 182.031158
$
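The transcript above builds the default configuration: float precision, with the host QR solver in the timed loop (the one-shot "time:" line is the device QR warm-up call). The USE_DEVICE and USE_DOUBLE macros in the listing select the other variants at compile time; these invocations are just illustrative:

$ nvcc -DUSE_DEVICE -o t48 t48.cu -lcusparse -lcusolver   # time the device QR path instead
$ nvcc -DUSE_DOUBLE -o t48 t48.cu -lcusparse -lcusolver   # double precision throughout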
Notes:
- No claims of correctness or suitability are made here. This is mostly your code, modified a bit for investigation.
- The results of the two methods don't match exactly, but in the float case they appear to be within 1% of each other. I assume part of that is float resolution, but there may be other factors. Without further study, I have no basis to claim one is "more correct" than the other.
- I used the nopivot variant of gtsv2 because the documentation seems to indicate it is faster for power-of-2 sizes, which is your case. It was faster in my testing. (The pivoting variant's call shape is sketched after these notes.)
- When I ran the nopivot variant at a size of 2^12 instead of 2^14, it did run faster on my GPU (GTX 960). YMMV.
- I left various other cruft in the code while investigating things, so it's a bit messy.
- Again, I can't really explain the cusolver behavior. The speculation about the tridiagonal nature of the problem is just that: speculation. Still, it seems to me that if the cusparse developers found a good reason to provide a (separate) set of solvers for the tridiagonal case, there is probably a legitimate reason for it (i.e., some aspect of the problem can be exploited when that structure is known a priori). So using them seems sensible, and in this case they appear to run faster.
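For completeness: if your size were not a power of 2, or if you wanted the numerically safer path, the pivoting variant of gtsv2 has the same call shape as the nopivot one used above. This is an untimed sketch reusing the arrays from the listing; only the function names change:

#ifdef USE_DOUBLE
    cstat = cusparseDgtsv2_bufferSizeExt(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, &bufferSize);
    // allocate dev_buffer of bufferSize bytes, as before, then:
    cstat = cusparseDgtsv2(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, (void *)dev_buffer);
#else
    cstat = cusparseSgtsv2_bufferSizeExt(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, &bufferSize);
    // allocate dev_buffer of bufferSize bytes, as before, then:
    cstat = cusparseSgtsv2(csphandle, n, 1, dev_dl, dev_d, dev_du, dev_b2, n, (void *)dev_buffer);
#endif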