在CUDA中乘以矢量化二维方阵和压缩三对角矩阵
Multiply vectorized 2D square matrix and compressed tridiagonal matrix in CUDA
我有两个矩阵
#define MATRIX_SIZE 20
#define BLOCK_SIZE 2
#define TILE_SIZE 2
double** A
double** B
矩阵A是稠密矩阵,矩阵B是三对角矩阵。我创建了 A
的矢量化表示
/* sz = A.rowlen = B.rowlen = A.collen = B.collen */
double* A1d = matrix_to_vector(sz, A);
我还使用以下函数创建了 B 的压缩表示
double* l_array = new double(sz - 1);
double* m_array = new double(sz);
double* r_array = new double(sz-1);
int current_l_idx = 0;
int current_m_idx = 0;
int current_r_idx = 0;
for (int i = 0; i < sz; i++) {
for (int j = 0; j < sz; j++) {
if ((i == j+1) || (i-1 == j)) {
l_array[current_l_idx] = B[i][j];
current_l_idx++;
}
else if ((i == j-1) || (i+1 == j)) {
r_array[current_r_idx] = B[i][j];
current_r_idx++;
}
else if (i == j) {
m_array[current_m_idx] = B[i][j];
current_m_idx++;
}
}
}
然后我为 CUDA
创建一个空的二维矢量化矩阵 E 以及我的所有对象
double* E1d = matrix_to_vector(sz, E);
double* d_A
double* d_B_l;
double* d_B_m;
double* d_B_r;
double* d_E;
size_t sizeA = sz * sz * sizeof(double);
size_t sizeB_lr = (sz - 1) * sizeof(double);
size_t sizeB_m = sz * sizeof(double);
cudaMalloc(&d_A, sizeA);
cudaMalloc(&d_B_l. sizeB_lr);
cudaMalloc(&d_B_m, sizeB_m);
cudaMalloc(&d_B_r, sizeB_lr);
cudaMalloc(&d_E, sizeA);
cudaMemcpy(d_A, A1d, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, l_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, m_array, sizeB_m, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, r_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_E, E1d, sizeA, cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(MATRIX_SIZE / threads.x, MATRIX_SIZE / threads.y);
cudakernel<<<grid, threads>>>(sz, d_A, d_B_l, d_B_m, d_B_r, d_E);
我可以连续执行这个乘法,但不幸的是,我不知道如何在 CUDA 设备上实现这个
假设
- A 和 B 总是正方形
- sz 总是能被 BLOCK_SIZE 和 TILE_SIZE
整除
- BLOCK_SIZE 永远等于 TILE_SIZE
根据您的设置代码,我怀疑您正在寻找一种平铺共享内存方法来处理这种矩阵乘法,而我并不是真的想为您做功课,所以我将演示一个不使用共享内存的示例。
如果你了解矩阵乘法的工作原理,并且你也了解如何创建一个普通的shared memory GPU matrix multiply kernel,将以下代码转换为使用共享内存应该相对简单:
#include <stdio.h>
#define DSIZE 256
#define BSIZE 32
#define TOL 0.0001
typedef double mytype;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// C = A x B
// A,B,C are all dense
template <typename T>
__global__ void mm(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
for (int i = 0; i < sz; i++)
temp += A[idy*sz+i]*B[i*sz+idx];
C[idy*sz+idx] = temp;}
}
// C = A x B
// A,C are dense, B is tridiagonal
template <typename T>
__global__ void mmt(const T * __restrict__ A, const T * __restrict__ B_l, const T * __restrict__ B_m, const T * __restrict__ B_r, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
if (idx > 0) temp += A[idy*sz+(idx-1)]*B_r[idx-1];
temp += A[idy*sz+(idx) ]*B_m[idx];
if (idx < (sz-1)) temp += A[idy*sz+(idx+1)]*B_l[idx];
C[idy*sz+idx] = temp;}
}
int main(){
mytype *d_A, *h_A, *d_B, *h_B, *d_C, *h_Cd, *h_Cs, *d_B_l, *h_B_l, *d_B_m, *h_B_m, *d_B_r, *h_B_r;
size_t msz = DSIZE*DSIZE;
size_t mszb = msz*sizeof(mytype);
// host side allocations
h_A = (mytype *)malloc(mszb);
h_B = (mytype *)malloc(mszb);
h_Cd =(mytype *)malloc(mszb);
h_Cs =(mytype *)malloc(mszb);
h_B_l = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_r = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_m = (mytype *)malloc( DSIZE*sizeof(mytype));
if (!h_A || !h_B || !h_Cd || !h_Cs || !h_B_l || !h_B_r || !h_B_m) {printf("malloc fail\n"); return -1;}
// device side allocations
cudaMalloc(&d_A, mszb);
cudaMalloc(&d_B, mszb);
cudaMalloc(&d_C, mszb);
cudaMalloc(&d_B_l, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_r, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_m, DSIZE*sizeof(mytype));
cudaCheckErrors("cudaMalloc fail");
// prepare A, B matrices
/*
|1 1 1 ...|
A = |2 2 2 ...|
|3 3 3 ...|
|4 4 4 ...|
|... |
|2 1 0 ...| B_l = left/lower subdiagonal (i.e. all 3's)
B = |3 2 1 ...| B_m = middle/main diagonal (i.e. all 2's)
|0 3 2 ...| B_r = right/upper superdiagonal (i.e. all 1's)
|0 0 3 ...|
|... |
*/
for (int i = 0; i < DSIZE; i++){
if (i < DSIZE-1){
h_B_r[i] = 1;
h_B_l[i] = 3;}
h_B_m[i] = 2;
for (int j = 0; j < DSIZE; j++){
h_A[i*DSIZE+j] = i+1;
if (j==i+1) h_B[i*DSIZE+j] = 1;
else if (j==i) h_B[i*DSIZE+j] = 2;
else if (j==i-1) h_B[i*DSIZE+j] = 3;
else h_B[i*DSIZE+j] = 0;}}
// copy data to device
cudaMemcpy(d_A, h_A, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, h_B_l, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, h_B_r, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, h_B_m, DSIZE*sizeof(mytype), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
// perform dense-dense multiply
dim3 block(BSIZE,BSIZE);
dim3 grid((DSIZE+block.x-1)/block.x, (DSIZE+block.y-1)/block.y);
cudaMemset(d_C, 0, mszb);
mm<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
cudaMemcpy(h_Cd, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2/kernel fail");
// perform dense-sparse multiply
cudaMemset(d_C, 0, mszb);
mmt<<<grid, block>>>(d_A, d_B_l, d_B_m, d_B_r, d_C, DSIZE);
cudaMemcpy(h_Cs, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 3/kernel fail");
// compare results
for (int i = 0; i < DSIZE; i++)
for (int j = 0; j < DSIZE; j++)
if (abs(h_Cs[i*DSIZE+j] - h_Cd[i*DSIZE+j]) > TOL) {printf("results mismatch at (%d, %d) dense: %f sparse: %f\n", i, j, h_Cd[i*DSIZE+j], h_Cs[i*DSIZE+j]); return -1;}
printf("Success!\n");
return 0;
}
备注:
mmt
内核中的所有全局内存访问(即 A
、B
向量和 C
)应该正确跨线程合并。因此,使用共享内存的转换也应该很容易产生对共享内存的非存储区冲突访问。
虽然研究这段代码可能对学习有用,但我建议任何严重的稀疏-密集矩阵乘法都使用 CUSPARSE 中的例程来完成,例如 csrmm。它几乎肯定会比上面的代码更有效(更快),并且可能比上面代码的任何共享内存转换更快。
我有两个矩阵
#define MATRIX_SIZE 20
#define BLOCK_SIZE 2
#define TILE_SIZE 2
double** A
double** B
矩阵A是稠密矩阵,矩阵B是三对角矩阵。我创建了 A
的矢量化表示/* sz = A.rowlen = B.rowlen = A.collen = B.collen */
double* A1d = matrix_to_vector(sz, A);
我还使用以下函数创建了 B 的压缩表示
double* l_array = new double(sz - 1);
double* m_array = new double(sz);
double* r_array = new double(sz-1);
int current_l_idx = 0;
int current_m_idx = 0;
int current_r_idx = 0;
for (int i = 0; i < sz; i++) {
for (int j = 0; j < sz; j++) {
if ((i == j+1) || (i-1 == j)) {
l_array[current_l_idx] = B[i][j];
current_l_idx++;
}
else if ((i == j-1) || (i+1 == j)) {
r_array[current_r_idx] = B[i][j];
current_r_idx++;
}
else if (i == j) {
m_array[current_m_idx] = B[i][j];
current_m_idx++;
}
}
}
然后我为 CUDA
double* E1d = matrix_to_vector(sz, E);
double* d_A
double* d_B_l;
double* d_B_m;
double* d_B_r;
double* d_E;
size_t sizeA = sz * sz * sizeof(double);
size_t sizeB_lr = (sz - 1) * sizeof(double);
size_t sizeB_m = sz * sizeof(double);
cudaMalloc(&d_A, sizeA);
cudaMalloc(&d_B_l. sizeB_lr);
cudaMalloc(&d_B_m, sizeB_m);
cudaMalloc(&d_B_r, sizeB_lr);
cudaMalloc(&d_E, sizeA);
cudaMemcpy(d_A, A1d, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, l_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, m_array, sizeB_m, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, r_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_E, E1d, sizeA, cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(MATRIX_SIZE / threads.x, MATRIX_SIZE / threads.y);
cudakernel<<<grid, threads>>>(sz, d_A, d_B_l, d_B_m, d_B_r, d_E);
我可以连续执行这个乘法,但不幸的是,我不知道如何在 CUDA 设备上实现这个
假设
- A 和 B 总是正方形
- sz 总是能被 BLOCK_SIZE 和 TILE_SIZE 整除
- BLOCK_SIZE 永远等于 TILE_SIZE
根据您的设置代码,我怀疑您正在寻找一种平铺共享内存方法来处理这种矩阵乘法,而我并不是真的想为您做功课,所以我将演示一个不使用共享内存的示例。
如果你了解矩阵乘法的工作原理,并且你也了解如何创建一个普通的shared memory GPU matrix multiply kernel,将以下代码转换为使用共享内存应该相对简单:
#include <stdio.h>
#define DSIZE 256
#define BSIZE 32
#define TOL 0.0001
typedef double mytype;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// C = A x B
// A,B,C are all dense
template <typename T>
__global__ void mm(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
for (int i = 0; i < sz; i++)
temp += A[idy*sz+i]*B[i*sz+idx];
C[idy*sz+idx] = temp;}
}
// C = A x B
// A,C are dense, B is tridiagonal
template <typename T>
__global__ void mmt(const T * __restrict__ A, const T * __restrict__ B_l, const T * __restrict__ B_m, const T * __restrict__ B_r, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
if (idx > 0) temp += A[idy*sz+(idx-1)]*B_r[idx-1];
temp += A[idy*sz+(idx) ]*B_m[idx];
if (idx < (sz-1)) temp += A[idy*sz+(idx+1)]*B_l[idx];
C[idy*sz+idx] = temp;}
}
int main(){
mytype *d_A, *h_A, *d_B, *h_B, *d_C, *h_Cd, *h_Cs, *d_B_l, *h_B_l, *d_B_m, *h_B_m, *d_B_r, *h_B_r;
size_t msz = DSIZE*DSIZE;
size_t mszb = msz*sizeof(mytype);
// host side allocations
h_A = (mytype *)malloc(mszb);
h_B = (mytype *)malloc(mszb);
h_Cd =(mytype *)malloc(mszb);
h_Cs =(mytype *)malloc(mszb);
h_B_l = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_r = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_m = (mytype *)malloc( DSIZE*sizeof(mytype));
if (!h_A || !h_B || !h_Cd || !h_Cs || !h_B_l || !h_B_r || !h_B_m) {printf("malloc fail\n"); return -1;}
// device side allocations
cudaMalloc(&d_A, mszb);
cudaMalloc(&d_B, mszb);
cudaMalloc(&d_C, mszb);
cudaMalloc(&d_B_l, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_r, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_m, DSIZE*sizeof(mytype));
cudaCheckErrors("cudaMalloc fail");
// prepare A, B matrices
/*
|1 1 1 ...|
A = |2 2 2 ...|
|3 3 3 ...|
|4 4 4 ...|
|... |
|2 1 0 ...| B_l = left/lower subdiagonal (i.e. all 3's)
B = |3 2 1 ...| B_m = middle/main diagonal (i.e. all 2's)
|0 3 2 ...| B_r = right/upper superdiagonal (i.e. all 1's)
|0 0 3 ...|
|... |
*/
for (int i = 0; i < DSIZE; i++){
if (i < DSIZE-1){
h_B_r[i] = 1;
h_B_l[i] = 3;}
h_B_m[i] = 2;
for (int j = 0; j < DSIZE; j++){
h_A[i*DSIZE+j] = i+1;
if (j==i+1) h_B[i*DSIZE+j] = 1;
else if (j==i) h_B[i*DSIZE+j] = 2;
else if (j==i-1) h_B[i*DSIZE+j] = 3;
else h_B[i*DSIZE+j] = 0;}}
// copy data to device
cudaMemcpy(d_A, h_A, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, h_B_l, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, h_B_r, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, h_B_m, DSIZE*sizeof(mytype), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
// perform dense-dense multiply
dim3 block(BSIZE,BSIZE);
dim3 grid((DSIZE+block.x-1)/block.x, (DSIZE+block.y-1)/block.y);
cudaMemset(d_C, 0, mszb);
mm<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
cudaMemcpy(h_Cd, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2/kernel fail");
// perform dense-sparse multiply
cudaMemset(d_C, 0, mszb);
mmt<<<grid, block>>>(d_A, d_B_l, d_B_m, d_B_r, d_C, DSIZE);
cudaMemcpy(h_Cs, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 3/kernel fail");
// compare results
for (int i = 0; i < DSIZE; i++)
for (int j = 0; j < DSIZE; j++)
if (abs(h_Cs[i*DSIZE+j] - h_Cd[i*DSIZE+j]) > TOL) {printf("results mismatch at (%d, %d) dense: %f sparse: %f\n", i, j, h_Cd[i*DSIZE+j], h_Cs[i*DSIZE+j]); return -1;}
printf("Success!\n");
return 0;
}
备注:
mmt
内核中的所有全局内存访问(即A
、B
向量和C
)应该正确跨线程合并。因此,使用共享内存的转换也应该很容易产生对共享内存的非存储区冲突访问。虽然研究这段代码可能对学习有用,但我建议任何严重的稀疏-密集矩阵乘法都使用 CUSPARSE 中的例程来完成,例如 csrmm。它几乎肯定会比上面的代码更有效(更快),并且可能比上面代码的任何共享内存转换更快。