Contradiction between cublasDgetrfBatched and cublasDtrsmBatched when solving an array of linear systems using cuBLAS
I have many dense linear systems that I want to solve in the cuBLAS batched format. So my plan is to
use cublasDgetrfBatched for the batched LU factorization,
then use cublasDtrsmBatched to solve the batched lower-triangular and batched upper-triangular systems one after the other.
The code is:
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<cublas_v2.h>
const int N = 32;
const int Nmatrices = N;
__global__ void initiate(double *d_A, double *d_B)
{
    int i = threadIdx.x; int j = blockIdx.x;
    int id = j*N*N + i*N; int idb = j*N + i;
    for(int k = 0; k < N; k++)
    {
        d_A[id + k] = 0.0;
        if(k == i-2) d_A[id + k] = 1.0;
        if(k == i-1) d_A[id + k] = 2.0;
        if(k == i)   d_A[id + k] = 8.0;
        if(k == i+1) d_A[id + k] = 2.0;
        if(k == i+2) d_A[id + k] = 1.0;
    }
    d_B[idb] = 8.0;
}
int main()
{
    cublasHandle_t handle; cublasSafeCall(cublasCreate(&handle));
    // Allocate device space for the input matrices
    double *d_A_sys; cudaMalloc((void**)&d_A_sys, N*N*Nmatrices*sizeof(double));
    double *d_B_sys; cudaMalloc((void**)&d_B_sys, N*Nmatrices*sizeof(double));
    // Allocate host space for the solution
    double *h_B_sys = (double *)malloc(N*Nmatrices*sizeof(double));
    // kernel to initialize d_A_sys and d_B_sys
    initiate<<<Nmatrices, N>>>(d_A_sys, d_B_sys);
    // Creating the array of pointers needed as input/output to the batched getrf
    double **h_A_pointers = (double **)malloc(Nmatrices*sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_A_pointers[i] = d_A_sys + i*N*N;
    double **h_b_pointers = (double **)malloc(Nmatrices*sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_B_pointers[i] = d_B_sys + i*N;
    double **d_A_pointers;
    cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));
    cudaMemcpy(d_A_pointers, h_A_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);
    double **d_b_pointers;
    cudaMalloc((void**)&d_b_pointers, Nmatrices*sizeof(double *));
    cudaMemcpy(d_b_pointers, h_b_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);
    int *d_InfoArrays; cudaMalloc((void**)&d_InfoArrays, Nmatrices*sizeof(int));
    int *h_InfoArrays = (int *)malloc(Nmatrices*sizeof(int));
    // Batched LU decomposition
    cublasDgetrfBatched(handle, N, d_A_pointers, N, NULL, d_InfoArrays, Nmatrices));
    // Batched Lower triangular part
    cublasDtrsmBatched(handle,
                       CUBLAS_SIDE_LEFT,
                       CUBLAS_FILL_MODE_LOWER,
                       CUBLAS_OP_N,
                       CUBLAS_DIAG_UNIT,
                       N,
                       N,
                       &alpha,
                       d_A_pointers,
                       N,
                       d_b_pointers,
                       N,
                       Nmatrices);
    // Batched Upper triangular part
    cublasDtrsmBatched(handle,
                       CUBLAS_SIDE_LEFT,
                       CUBLAS_FILL_MODE_UPPER,
                       CUBLAS_OP_N,
                       CUBLAS_DIAG_NON_UNIT,
                       N,
                       N,
                       &alpha,
                       d_A_pointers,
                       N,
                       d_b_pointers,
                       N,
                       Nmatrices);
    cudaMemcpy(h_B_sys, d_B_sys, N*Nmatrices*sizeof(double), cudaMemcpyDeviceToHost);
    printf("Print out the solutions \n");
    cublasDestroy(handle);
    gpuErrchk(cudaDeviceReset());
    return 0;
}
cublasDgetrfBatched and cublasDtrsmBatched both demand that d_A_pointers be passed as an array of double pointers, but when I compile, the latter call gives me an error like the one shown in the picture:
How can I overcome this problem? Any help?
You can work around the const-correctness problem by doing something like this:
const double **d_A_pointers;
cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));
....
//Batched LU decomposition
cublasDgetrfBatched(handle, N, const_cast<double**>(d_A_pointers), N, NULL, d_InfoArrays, Nmatrices);
That is, cast away the constness in the call to cublasDgetrfBatched.
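The root of the clash is visible in the declarations. In the cuBLAS documentation the two routines are declared roughly as follows (a sketch only; the exact const qualification of the pointer-array arguments has varied between toolkit versions, so check the cublas_v2.h that ships with your toolkit):

cublasStatus_t cublasDgetrfBatched(cublasHandle_t handle,
                                   int n,
                                   double *const Aarray[],   // writable: the LU factors overwrite A
                                   int lda,
                                   int *PivotArray,
                                   int *infoArray,
                                   int batchSize);

cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle,
                                  cublasSideMode_t side,
                                  cublasFillMode_t uplo,
                                  cublasOperation_t trans,
                                  cublasDiagType_t diag,
                                  int m,
                                  int n,
                                  const double *alpha,
                                  const double *const A[],   // read-only triangular factors
                                  int lda,
                                  double *const B[],         // right-hand sides, overwritten with the solution
                                  int ldb,
                                  int batchCount);

So the same array of matrix pointers is treated as writable data by the factorization and as read-only data by the triangular solves, which is why one of the two calls needs a cast no matter how d_A_pointers is declared.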
The other thing that is obviously wrong in the code you posted is the input dimensions in the cublasDtrsmBatched calls. I believe you want something like this:
//Batched Lower triangular part
cublasDtrsmBatched(handle,
                   CUBLAS_SIDE_LEFT,
                   CUBLAS_FILL_MODE_LOWER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_UNIT,
                   N,
                   1,
                   &alpha,
                   d_A_pointers,
                   N,
                   d_b_pointers,
                   N,
                   Nmatrices);
That is, the input size of the right-hand-side matrices in your example is Nx1, not NxN (you are not solving N RHS problems for each factorized matrix, only one).
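Presumably the second, upper-triangular solve needs the same change to its right-hand-side width; a sketch based on the corresponding call from the question:

//Batched Upper triangular part
cublasDtrsmBatched(handle,
                   CUBLAS_SIDE_LEFT,
                   CUBLAS_FILL_MODE_UPPER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_NON_UNIT,
                   N,
                   1,
                   &alpha,
                   d_A_pointers,
                   N,
                   d_b_pointers,
                   N,
                   Nmatrices);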
There may be other problems in your code (note that cuBLAS, like most reference BLAS implementations, expects column-major-ordered inputs by default), and the code you posted in the question doesn't actually compile for other reasons, so it isn't possible to say much more with certainty.
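To illustrate those "other reasons": the posted code calls error-checking wrappers (cublasSafeCall, gpuErrchk) that are never defined, writes through h_B_pointers although only h_b_pointers is declared, and passes an alpha that is never declared. A minimal sketch of the kind of definitions that would be needed, assuming the usual error-checking macro idiom (these particular definitions are illustrative, not part of the original post):

// Assumed error-checking helpers; the original post never defines them.
#define gpuErrchk(call)                                                \
    do {                                                               \
        cudaError_t err = (call);                                      \
        if (err != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                    cudaGetErrorString(err), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

#define cublasSafeCall(call)                                           \
    do {                                                               \
        cublasStatus_t stat = (call);                                  \
        if (stat != CUBLAS_STATUS_SUCCESS) {                           \
            fprintf(stderr, "cuBLAS error %d at %s:%d\n",              \
                    (int)stat, __FILE__, __LINE__);                    \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

// The triangular solves also need a scale factor for the right-hand sides:
const double alpha = 1.0;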