Cublas 矩阵-矩阵乘法参数
Cublas matrix-matrix multiplication parameters
我正在尝试使用 Cublas 进行矩阵-矩阵乘法,但它仍然无法正常工作,我也没有弄清楚问题所在。
因为这是我第一次使用 Cublas,所以我不确定我是否设置了正确的参数,尤其是对于前导维度
例如:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);
// Entry point of the question's sample program: fills a 12-element array `a`
// and a 9-element array `b`, passes them to mulWithCuda and prints `c`.
int main(){
const int arraySize = 9;
const double a[12] = { 1, 2, 3, 4, 5, 6, 7, 8 ,9, 10, 11, 12 };
const double b[arraySize] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
double c[12] = { 0 };
// arraySize (9) is the only length handed over; it matches b, not a or c.
mulWithCuda(c, a, b, arraySize);
// Prints c as 4 lines of 3 values, reading element (i, j) from c[i * 3 + j],
// i.e. treating each row of 3 values as contiguous (row-major indexing).
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 3; j++) {
printf("%lf ", c[i * 3 + j]);
}
printf("\n");
}
return 0;
}
// Copies a and b into device buffers, calls cublasDgemm with m = 4, n = 3,
// k = 3, copies 12 result values back into c and releases the device buffers.
// None of the CUDA runtime or cuBLAS return statuses are checked.
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
double *dev_a = 0;
double *dev_b = 0;
double *dev_c = 0;
cudaMalloc((void**)&dev_c, 12 * sizeof(double));
// NOTE(review): dev_a receives `size` doubles (9 when called from main) although
// 12 doubles are copied into it below, while dev_b receives 12 doubles for a
// `size`-element copy -- the two allocation sizes look swapped.
cudaMalloc((void**)&dev_a, size * sizeof(double));
cudaMalloc((void**)&dev_b, 12 * sizeof(double));
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
// Scalars for C = alpha * A * B + beta * C; beta = 0 discards the old C.
double alpha = 1.0;
double beta = 0;
// The leading dimensions lda, ldb and ldc are all passed as 3.
// NOTE(review): cuBLAS assumes column-major storage, so a non-transposed A with
// m = 4 rows (and the 4-row C) needs a leading dimension of at least 4; lda is
// DGEMM parameter number 8, the one the reported error message names.
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 3, dev_b, 3, &beta, dev_c, 3);
cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
使用的两个矩阵是:
1 2 3
4 5 6
7 8 9
10 11 12
10 20 30
40 50 60
70 80 90
而输出是:
** On entry to DGEMM parameter number 8 had an illegal value
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
您的代码有 3 个问题。
您没有检查 CUDA 错误或 cuBLAS 错误。CUDA 错误检查的规范做法见问答:"What is the canonical way to check for errors using the CUDA runtime API?"
通过适当的错误检查,您会发现 cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
由于非法内存访问而失败。dev_a 和 dev_b 分配的大小有误:dev_a 应为 12 个元素,dev_b 应为 9 个元素。
你对矩阵的内存布局做出了错误的假设。 cuBLAS 使用列优先存储格式。 https://docs.nvidia.com/cuda/cublas/index.html#data-layout
这意味着A和C的前导维度是4,而不是3。这也意味着A和B是
1 5 9
2 6 10
3 7 11
4 8 12
和
10 40 70
20 50 80
30 60 90
(分别对应 A 和 B)。
C 的打印也必须更改以考虑列优先格式
工作代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);
// Demo driver: multiplies a 4x3 matrix by a 3x3 matrix through mulWithCuda
// and prints the 4x3 product, one matrix row per output line.
int main(){
    const int rhsCount = 9;
    const double lhs[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
    const double rhs[rhsCount] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
    double product[12] = { 0 };

    mulWithCuda(product, lhs, rhs, rhsCount);

    // The result is stored column by column: element (row, col) of the
    // 4-row product lives at flat index col * 4 + row.
    int row = 0;
    while (row < 4) {
        for (int col = 0; col != 3; ++col) {
            const double value = product[row + 4 * col];
            printf("%lf ", value);
        }
        printf("\n");
        ++row;
    }
    return 0;
}
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
double *dev_a = 0;
double *dev_b = 0;
double *dev_c = 0;
cudaMalloc((void**)&dev_c, 12 * sizeof(double));
cudaMalloc((void**)&dev_a, 12 * sizeof(double));
cudaMalloc((void**)&dev_b, size * sizeof(double));
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0;
double beta = 0;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 4, dev_b, 3, &beta, dev_c, 4);
cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
输出:
380.000000 830.000000 1280.000000
440.000000 980.000000 1520.000000
500.000000 1130.000000 1760.000000
560.000000 1280.000000 2000.000000
我正在尝试使用 Cublas 进行矩阵-矩阵乘法,但它仍然无法正常工作,我也没有弄清楚问题所在。 因为这是我第一次使用 Cublas,所以我不确定我是否设置了正确的参数,尤其是对于前导维度
例如:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);
// Entry point of the question's sample program: fills a 12-element array `a`
// and a 9-element array `b`, passes them to mulWithCuda and prints `c`.
int main(){
const int arraySize = 9;
const double a[12] = { 1, 2, 3, 4, 5, 6, 7, 8 ,9, 10, 11, 12 };
const double b[arraySize] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
double c[12] = { 0 };
// arraySize (9) is the only length handed over; it matches b, not a or c.
mulWithCuda(c, a, b, arraySize);
// Prints c as 4 lines of 3 values, reading element (i, j) from c[i * 3 + j],
// i.e. treating each row of 3 values as contiguous (row-major indexing).
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 3; j++) {
printf("%lf ", c[i * 3 + j]);
}
printf("\n");
}
return 0;
}
// Copies a and b into device buffers, calls cublasDgemm with m = 4, n = 3,
// k = 3, copies 12 result values back into c and releases the device buffers.
// None of the CUDA runtime or cuBLAS return statuses are checked.
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
double *dev_a = 0;
double *dev_b = 0;
double *dev_c = 0;
cudaMalloc((void**)&dev_c, 12 * sizeof(double));
// NOTE(review): dev_a receives `size` doubles (9 when called from main) although
// 12 doubles are copied into it below, while dev_b receives 12 doubles for a
// `size`-element copy -- the two allocation sizes look swapped.
cudaMalloc((void**)&dev_a, size * sizeof(double));
cudaMalloc((void**)&dev_b, 12 * sizeof(double));
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
// Scalars for C = alpha * A * B + beta * C; beta = 0 discards the old C.
double alpha = 1.0;
double beta = 0;
// The leading dimensions lda, ldb and ldc are all passed as 3.
// NOTE(review): cuBLAS assumes column-major storage, so a non-transposed A with
// m = 4 rows (and the 4-row C) needs a leading dimension of at least 4; lda is
// DGEMM parameter number 8, the one the reported error message names.
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 3, dev_b, 3, &beta, dev_c, 3);
cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
使用的两个矩阵是:
1 2 3
4 5 6
7 8 9
10 11 12
10 20 30
40 50 60
70 80 90
而输出是:
** On entry to DGEMM parameter number 8 had an illegal value
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
您的代码有 3 个问题。
您没有检查 CUDA 错误或 cuBLAS 错误。CUDA 错误检查的规范做法见问答:"What is the canonical way to check for errors using the CUDA runtime API?"
通过适当的错误检查,您会发现
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
由于非法内存访问而失败。dev_a 和 dev_b 分配的大小有误:dev_a 应为 12 个元素,dev_b 应为 9 个元素。
你对矩阵的内存布局做出了错误的假设。 cuBLAS 使用列优先存储格式。 https://docs.nvidia.com/cuda/cublas/index.html#data-layout
这意味着A和C的前导维度是4,而不是3。这也意味着A和B是
1 5 9
2 6 10
3 7 11
4 8 12
和
10 40 70
20 50 80
30 60 90
(分别对应 A 和 B)。
C 的打印也必须更改以考虑列优先格式
工作代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);
// Demo driver: multiplies a 4x3 matrix by a 3x3 matrix through mulWithCuda
// and prints the 4x3 product, one matrix row per output line.
int main(){
    const int rhsCount = 9;
    const double lhs[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
    const double rhs[rhsCount] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
    double product[12] = { 0 };

    mulWithCuda(product, lhs, rhs, rhsCount);

    // The result is stored column by column: element (row, col) of the
    // 4-row product lives at flat index col * 4 + row.
    int row = 0;
    while (row < 4) {
        for (int col = 0; col != 3; ++col) {
            const double value = product[row + 4 * col];
            printf("%lf ", value);
        }
        printf("\n");
        ++row;
    }
    return 0;
}
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
double *dev_a = 0;
double *dev_b = 0;
double *dev_c = 0;
cudaMalloc((void**)&dev_c, 12 * sizeof(double));
cudaMalloc((void**)&dev_a, 12 * sizeof(double));
cudaMalloc((void**)&dev_b, size * sizeof(double));
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0;
double beta = 0;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 4, dev_b, 3, &beta, dev_c, 4);
cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
输出:
380.000000 830.000000 1280.000000
440.000000 980.000000 1520.000000
500.000000 1130.000000 1760.000000
560.000000 1280.000000 2000.000000