CUDA cuSolver gesvdj with large matrix
I am running a slightly modified version of the code found here in the section "G.2. SVD with singular vectors (via Jacobi method)" on an NVIDIA P6000. The slight modifications are to dynamically allocate the A, U, and V arrays on the heap, and to fill an A array of a specified size with values that depend on the index into A. I also converted everything from doubles to floats. The final modification is a loop over the gesvdj call itself and the convergence check, for some number of iterations (10 in my case).
With these minor modifications I was able to get past the first hurdle of performing the SVD on symmetric arrays larger than roughly 1000x1000. Ultimately I need to run the SVD on an array of size 1048576x20.
Currently the algorithm runs for an array of size 10000x20 but fails when I move up to 50000x20.
The problem seems to stem from the gesvdj call itself. The synchronize call after gesvdj fails and returns a generic access error.
If I run the program with cuda-memcheck, I get a series of these errors, for different threads within the same block:
Invalid __global__ write of size 4
========= at 0x00000108 in void eye_kernel<float, int=5, int=3>(int, int, float*, int)
========= by thread (0,7,0) in block (16,1342,0)
========= Address 0x7f4ed40414c0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x269e85]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 [0x100c822]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 [0x100ca17]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 [0x1040dd5]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 [0x235c5d]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 (cusolverDnSgesvdj + 0x508) [0x21f9a8]
========= Host Frame:./gesvdj_example [0x4518]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x223d5]
========= Host Frame:./gesvdj_example [0x3ab9]
I'm wondering whether I'm running into some kind of internal cusolver limit? Does anyone have any ideas? I can provide my exact code if necessary, but it is very similar to the example, so I figured I would just point people there.
Thanks!
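Aside: one simple thing to rule out before suspecting a library limit is plain device-memory exhaustion; a minimal standalone check (not part of the example) that prints free versus total memory on the current device:
#include <stdio.h>
#include <cuda_runtime.h>
int main(void)
{
    size_t free_bytes = 0, total_bytes = 0;
    /* Runtime API query; implicitly initializes a context on the current device. */
    if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
        printf("device memory: %zu MiB free of %zu MiB total\n",
               free_bytes >> 20, total_bytes >> 20);
    }
    return 0;
}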
EDIT to add the offending code from the example I linked to. The algorithm fails at the assert(CUSOLVER_STATUS_SUCCESS == status); line. I'm really new to coding in C and CUDA, so I apologize if I'm leaving out some obvious debugging information.
/* step 5: compute SVD */
status = cusolverDnDgesvdj(
cusolverH,
jobz, /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
/* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
econ, /* econ = 1 for economy size */
m, /* number of rows of A, 0 <= m */
n, /* number of columns of A, 0 <= n */
d_A, /* m-by-n */
lda, /* leading dimension of A */
d_S, /* min(m,n) */
/* the singular values in descending order */
d_U, /* m-by-m if econ = 0 */
/* m-by-min(m,n) if econ = 1 */
lda, /* leading dimension of U, ldu >= max(1,m) */
d_V, /* n-by-n if econ = 0 */
/* n-by-min(m,n) if econ = 1 */
lda, /* leading dimension of V, ldv >= max(1,n) */
d_work,
lwork,
d_info,
gesvdj_params);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
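For reference, a minimal error-reporting sketch that prints the failing status and source location instead of only asserting; the macro names here are just illustrative, not from the example:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
/* Print the failing CUDA / cuSOLVER status and source location, then exit. */
#define CHECK_CUDA(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error %d (%s) at %s:%d\n", \
                (int)err_, cudaGetErrorString(err_), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
#define CHECK_CUSOLVER(call) do { \
    cusolverStatus_t st_ = (call); \
    if (st_ != CUSOLVER_STATUS_SUCCESS) { \
        fprintf(stderr, "cuSOLVER status %d at %s:%d\n", (int)st_, __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
/* Usage would be, e.g.: CHECK_CUSOLVER(cusolverDnSgesvdj(...)); CHECK_CUDA(cudaDeviceSynchronize()); */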
EDIT 2 to add my code...
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cusolverDn.h>
#include<iostream>
#include<iomanip>
#include<assert.h>
#include <cuda_runtime_api.h>
#include "cuda_runtime.h"
int main(int argc, char*argv[])
{
//Setting which device to run on
const int device = 0;
cudaSetDevice(device);
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
gesvdjInfo_t gesvdj_params = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
cudaError_t cudaStat5 = cudaSuccess;
const long int m = 50000;
const long int n = 20;
const int lda = m;
// --- Setting the host, Nrows x Ncols matrix
float *A = (float *)malloc(m * n * sizeof(float));
for(long int j = 0; j < m; j++)
for(long int i = 0; i < n; i++)
A[j + i*m] = sqrt((float)(i + j));
float *U = (float *)malloc(m * m * sizeof(float)); /* m-by-m unitary matrix, left singular vectors */
float *V = (float *)malloc(m * n * sizeof(float)); /* n-by-n unitary matrix, right singular vectors */
float S[n]; /* numerical singular value */
float *d_A = NULL; /* device copy of A */
float *d_S = NULL; /* singular values */
float *d_U = NULL; /* left singular vectors */
float *d_V = NULL; /* right singular vectors */
int *d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
float *d_work = NULL; /* device workspace for gesvdj */
int info = 0; /* host copy of error info */
/* configuration of gesvdj */
const double tol = 1.e-7;
const int max_sweeps = 100;
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute singular vectors.
const int econ = 0 ; /* econ = 1 for economy size */
/* numerical results of gesvdj */
double residual = 0;
int executed_sweeps = 0;
printf("tol = %E, default value is machine zero \n", tol);
printf("max. sweeps = %d, default value is 100\n", max_sweeps);
printf("econ = %d \n", econ);
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: configuration of gesvdj */
status = cusolverDnCreateGesvdjInfo(&gesvdj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXgesvdjSetTolerance(
gesvdj_params,
tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXgesvdjSetMaxSweeps(
gesvdj_params,
max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 3: copy A to device */
cudaStat1 = cudaMalloc ((void**)&d_A , sizeof(float)*lda*n);
cudaStat2 = cudaMalloc ((void**)&d_S , sizeof(float)*n);
cudaStat3 = cudaMalloc ((void**)&d_U , sizeof(float)*lda*m);
cudaStat4 = cudaMalloc ((void**)&d_V , sizeof(float)*lda*n);
cudaStat5 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
assert(cudaSuccess == cudaStat5);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(float)*lda*n, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
/* step 4: query workspace of SVD */
status = cusolverDnSgesvdj_bufferSize(
cusolverH,
jobz, /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
/* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
econ, /* econ = 1 for economy size */
m, /* number of rows of A, 0 <= m */
n, /* number of columns of A, 0 <= n */
d_A, /* m-by-n */
lda, /* leading dimension of A */
d_S, /* min(m,n) */
/* the singular values in descending order */
d_U, /* m-by-m if econ = 0 */
/* m-by-min(m,n) if econ = 1 */
lda, /* leading dimension of U, ldu >= max(1,m) */
d_V, /* n-by-n if econ = 0 */
/* n-by-min(m,n) if econ = 1 */
lda, /* leading dimension of V, ldv >= max(1,n) */
&lwork,
gesvdj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work , sizeof(float)*lwork);
assert(cudaSuccess == cudaStat1);
/* step 5: compute SVD */
//Iterating over SVD calculation, not part of example
int iters;
for (iters = 10; iters > 0; iters--){
status = cusolverDnSgesvdj(
cusolverH,
jobz, /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
/* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
econ, /* econ = 1 for economy size */
m, /* number of rows of A, 0 <= m */
n, /* number of columns of A, 0 <= n */
d_A, /* m-by-n */
lda, /* leading dimension of A */
d_S, /* min(m,n) */
/* the singular values in descending order */
d_U, /* m-by-m if econ = 0 */
/* m-by-min(m,n) if econ = 1 */
lda, /* leading dimension of U, ldu >= max(1,m) */
d_V, /* n-by-n if econ = 0 */
/* n-by-min(m,n) if econ = 1 */
lda, /* leading dimension of V, ldv >= max(1,n) */
d_work,
lwork,
d_info,
gesvdj_params);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(U, d_U, sizeof(float)*lda*m, cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(V, d_V, sizeof(float)*lda*n, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(S, d_S, sizeof(float)*n , cudaMemcpyDeviceToHost);
cudaStat4 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
cudaStat5 = cudaDeviceSynchronize();
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
assert(cudaSuccess == cudaStat5);
if ( 0 == info ){
printf("gesvdj converges \n");
}else if ( 0 > info ){
printf("%d-th parameter is wrong \n", -info);
exit(1);
}else{
printf("WARNING: info = %d : gesvdj does not converge \n", info );
}
}
/* step 6: measure error of singular value */
status = cusolverDnXgesvdjGetSweeps(
cusolverH,
gesvdj_params,
&executed_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
status = cusolverDnXgesvdjGetResidual(
cusolverH,
gesvdj_params,
&residual);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("residual |A - U*S*V**H|_F = %E \n", residual );
printf("number of executed sweeps = %d \n\n", executed_sweeps );
/* free resources */
if (A ) free(A);
if (V ) free(V);
if (U ) free(U);
if (d_A ) cudaFree(d_A);
if (d_S ) cudaFree(d_S);
if (d_U ) cudaFree(d_U);
if (d_V ) cudaFree(d_V);
if (d_info ) cudaFree(d_info);
if (d_work ) cudaFree(d_work);
if (cusolverH ) cusolverDnDestroy(cusolverH);
if (stream ) cudaStreamDestroy(stream);
if (gesvdj_params) cusolverDnDestroyGesvdjInfo(gesvdj_params);
cudaDeviceReset();
return 0;
}
Thanks to @talonmies for helping diagnose the problem. cusolver's gesvdj method has an economy mode that stores the U and V matrices in more economical arrays. The modifications I made to get the code working were simple (the size check after this list shows why they matter at these dimensions):
econ = 1
U array size (m x n)
V array size (n x n)
ldv parameter = n
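To put numbers on that (my own back-of-the-envelope sketch, not part of the fix): in float precision a full m-by-m U is already about 9.3 GiB at m = 50000 and about 4 TiB at m = 1048576, while the economy m-by-n U is only about 80 MiB.
#include <stdio.h>
/* Rough size comparison of the left singular vector buffer U in float precision. */
int main(void)
{
    const long long m = 1048576, n = 20;
    double full_gib = (double)m * (double)m * sizeof(float) / (1024.0 * 1024.0 * 1024.0);
    double econ_mib = (double)m * (double)n * sizeof(float) / (1024.0 * 1024.0);
    printf("full U (m x m): %.0f GiB\n", full_gib); /* about 4096 GiB */
    printf("econ U (m x n): %.0f MiB\n", econ_mib); /* about 80 MiB */
    return 0;
}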
The code is below:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cusolverDn.h>
#include<iostream>
#include<iomanip>
#include<assert.h>
#include <cuda_runtime_api.h>
#include "cuda_runtime.h"
int main(int argc, char*argv[])
{
//Setting which device to run on
const int device = 0;
cudaSetDevice(device);
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
gesvdjInfo_t gesvdj_params = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
cudaError_t cudaStat5 = cudaSuccess;
const long int m = 1048576;
const long int n = 20;
const int lda = m;
// --- Setting the host, Nrows x Ncols matrix
float *A = (float *)malloc(m * n * sizeof(float));
for(long int j = 0; j < m; j++)
for(long int i = 0; i < n; i++)
A[j + i*m] = sqrt((float)(i + j));
float *U = (float *)malloc(m * n * sizeof(float)); /* m-by-n, left singular vectors (economy size) */
float *V = (float *)malloc(n * n * sizeof(float)); /* n-by-n unitary matrix, right singular vectors */
float S[n]; /* numerical singular value */
float *d_A = NULL; /* device copy of A */
float *d_S = NULL; /* singular values */
float *d_U = NULL; /* left singular vectors */
float *d_V = NULL; /* right singular vectors */
int *d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
float *d_work = NULL; /* device workspace for gesvdj */
int info = 0; /* host copy of error info */
/* configuration of gesvdj */
const double tol = 1.e-7;
const int max_sweeps = 100;
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute singular vectors.
const int econ = 1 ; /* econ = 1 for economy size */
/* numerical results of gesvdj */
double residual = 0;
int executed_sweeps = 0;
printf("tol = %E, default value is machine zero \n", tol);
printf("max. sweeps = %d, default value is 100\n", max_sweeps);
printf("econ = %d \n", econ);
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: configuration of gesvdj */
status = cusolverDnCreateGesvdjInfo(&gesvdj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXgesvdjSetTolerance(
gesvdj_params,
tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXgesvdjSetMaxSweeps(
gesvdj_params,
max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 3: copy A to device */
cudaStat1 = cudaMalloc ((void**)&d_A , sizeof(float)*lda*n);
cudaStat2 = cudaMalloc ((void**)&d_S , sizeof(float)*n);
cudaStat3 = cudaMalloc ((void**)&d_U , sizeof(float)*lda*n);
cudaStat4 = cudaMalloc ((void**)&d_V , sizeof(float)*n*n);
cudaStat5 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
assert(cudaSuccess == cudaStat5);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(float)*lda*n, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
/* step 4: query workspace of SVD */
status = cusolverDnSgesvdj_bufferSize(
cusolverH,
jobz, /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
/* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
econ, /* econ = 1 for economy size */
m, /* number of rows of A, 0 <= m */
n, /* number of columns of A, 0 <= n */
d_A, /* m-by-n */
lda, /* leading dimension of A */
d_S, /* min(m,n) */
/* the singular values in descending order */
d_U, /* m-by-m if econ = 0 */
/* m-by-min(m,n) if econ = 1 */
lda, /* leading dimension of U, ldu >= max(1,m) */
d_V, /* n-by-n if econ = 0 */
/* n-by-min(m,n) if econ = 1 */
n, /* leading dimension of V, ldv >= max(1,n) */
&lwork,
gesvdj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work , sizeof(float)*lwork);
assert(cudaSuccess == cudaStat1);
/* step 5: compute SVD */
//Iterating over SVD calculation, not part of example
int iters;
for (iters = 10; iters > 0; iters--){
status = cusolverDnSgesvdj(
cusolverH,
jobz, /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
/* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
econ, /* econ = 1 for economy size */
m, /* number of rows of A, 0 <= m */
n, /* number of columns of A, 0 <= n */
d_A, /* m-by-n */
lda, /* leading dimension of A */
d_S, /* min(m,n) */
/* the singular values in descending order */
d_U, /* m-by-m if econ = 0 */
/* m-by-min(m,n) if econ = 1 */
lda, /* leading dimension of U, ldu >= max(1,m) */
d_V, /* n-by-n if econ = 0 */
/* n-by-min(m,n) if econ = 1 */
n, /* leading dimension of V, ldv >= max(1,n) */
d_work,
lwork,
d_info,
gesvdj_params);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(U, d_U, sizeof(float)*lda*n, cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(V, d_V, sizeof(float)*n*n, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(S, d_S, sizeof(float)*n , cudaMemcpyDeviceToHost);
cudaStat4 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
cudaStat5 = cudaDeviceSynchronize();
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
assert(cudaSuccess == cudaStat5);
if ( 0 == info ){
printf("gesvdj converges \n");
}else if ( 0 > info ){
printf("%d-th parameter is wrong \n", -info);
exit(1);
}else{
printf("WARNING: info = %d : gesvdj does not converge \n", info );
}
}
/* step 6: measure error of singular value */
status = cusolverDnXgesvdjGetSweeps(
cusolverH,
gesvdj_params,
&executed_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
status = cusolverDnXgesvdjGetResidual(
cusolverH,
gesvdj_params,
&residual);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("residual |A - U*S*V**H|_F = %E \n", residual );
printf("number of executed sweeps = %d \n\n", executed_sweeps );
/* free resources */
if (A ) free(A);
if (V ) free(V);
if (U ) free(U);
if (d_A ) cudaFree(d_A);
if (d_S ) cudaFree(d_S);
if (d_U ) cudaFree(d_U);
if (d_V ) cudaFree(d_V);
if (d_info ) cudaFree(d_info);
if (d_work ) cudaFree(d_work);
if (cusolverH ) cusolverDnDestroy(cusolverH);
if (stream ) cudaStreamDestroy(stream);
if (gesvdj_params) cusolverDnDestroyGesvdjInfo(gesvdj_params);
cudaDeviceReset();
return 0;
}
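For anyone reproducing this: the program builds with something along the lines of nvcc -o gesvdj_example gesvdj_example.cpp -lcusolver (the exact include and library paths depend on your CUDA installation).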