Cublas 矩阵-矩阵乘法参数

Cublas matrix-matrix multiplication parameters

我正在尝试使用 Cublas 进行矩阵-矩阵乘法,但它仍然无法正常工作,我也没有弄清楚问题所在。 因为这是我第一次使用 Cublas,所以我不确定我是否设置了正确的参数,尤其是对于前导维度

例如:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>

void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);

// Builds two small host-side test arrays, multiplies them on the GPU through
// mulWithCuda, and prints the 12-element result as 4 lines of 3 values.
int main(){
    const int arraySize = 9;  // element count of b; forwarded to mulWithCuda as `size`
    const double a[12] = { 1, 2, 3, 4, 5, 6, 7, 8 ,9, 10, 11, 12 };
    const double b[arraySize] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
    double c[12] = { 0 };  // result buffer, zero-initialized

    mulWithCuda(c, a, b, arraySize);

    // Indexes c as row-major with 3 values per row (element (i, j) at c[i * 3 + j]).
    // NOTE(review): cuBLAS writes its result in column-major order, so this
    // indexing does not match the layout produced by cublasDgemm.
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 3; j++) {
            printf("%lf ", c[i * 3 + j]);
        }
        printf("\n");
    }

    return 0;
}

// Copies host arrays a and b to the device, calls cublasDgemm with
// m = 4, n = 3, k = 3, and copies 12 doubles of the result back into c.
//
// Parameters:
//   c    - host output buffer; 12 doubles are written to it.
//   a    - host input; 12 doubles are read from it.
//   b    - host input; `size` doubles are read from it.
//   size - element count used for the dev_a allocation and for the copy of b.
//
// None of the CUDA runtime or cuBLAS return codes are inspected here, so a
// failing call is not reported by this function.
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
    double *dev_a = 0;
    double *dev_b = 0;
    double *dev_c = 0;

    // NOTE(review): dev_a is allocated with `size` doubles and dev_b with 12,
    // while the copies below move 12 doubles into dev_a and `size` into dev_b.
    cudaMalloc((void**)&dev_c, 12 * sizeof(double));
    cudaMalloc((void**)&dev_a, size * sizeof(double));
    cudaMalloc((void**)&dev_b, 12 * sizeof(double));

    cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
    
    cublasHandle_t handle;
    cublasCreate(&handle);

    // With alpha = 1 and beta = 0 the GEMM computes C = A * B.
    double alpha = 1.0;
    double beta = 0;

    // Arguments after the handle: transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc.
    // NOTE(review): lda and ldc are passed as 3 while m is 4; cuBLAS uses
    // column-major storage, where the leading dimension of a non-transposed
    // m-row matrix must be at least m.
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 3, dev_b, 3, &beta, dev_c, 3);

    cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
   
    cublasDestroy(handle);

    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
}

使用的两个矩阵是:

1 2 3
4 5 6
7 8 9
10 11 12


10 20 30
40 50 60
70 80 90

而输出是:

 ** On entry to DGEMM  parameter number 8 had an illegal value
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000

您的代码有 3 个问题。

  1. 您没有检查 CUDA 错误或 cuBLAS 错误。CUDA 错误检查的标准做法见 Stack Overflow 问题 "What is the canonical way to check for errors using the CUDA runtime API?"。

  2. 通过适当的错误检查,您会发现 cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice); 会失败,因为它向只分配了 size(9)个 double 的 dev_a 写入了 12 个 double。dev_a 和 dev_b 分配了错误的大小:dev_a 应为 12 个元素,dev_b 应为 size(9)个元素。

  3. 你对矩阵的内存布局做出了错误的假设。 cuBLAS 使用列优先存储格式。 https://docs.nvidia.com/cuda/cublas/index.html#data-layout

这意味着A和C的前导维度是4,而不是3。这也意味着A和B是

1 5 9
2 6 10
3 7 11
4 8 12

和

10 40 70
20 50 80
30 60 90

(分别对应 A 和 B)。

C 的打印也必须更改以考虑列优先格式

工作代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>

void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);

// Host driver: multiplies a 4x3 matrix A by a 3x3 matrix B on the GPU and
// prints the 4x3 product. All three arrays are interpreted in column-major
// order, which is the layout the cublasDgemm call in mulWithCuda operates on.
int main(){
    const int bCount = 9;  // number of elements in b, forwarded as `size`
    const double a[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
    const double b[bCount] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
    double c[12] = { 0 };

    mulWithCuda(c, a, b, bCount);

    // Walk the result row by row. In column-major storage, consecutive
    // elements of one row are 4 entries (one full column) apart.
    for (int row = 0; row < 4; ++row) {
        int idx = row;
        for (int col = 0; col < 3; ++col) {
            printf("%lf ", c[idx]);
            idx += 4;
        }
        printf("\n");
    }

    return 0;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what){
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Reports a failed cuBLAS call on stderr.
// Returns true when `status` is CUBLAS_STATUS_SUCCESS, false otherwise.
static bool cublasCallOk(cublasStatus_t status, const char *what){
    if (status == CUBLAS_STATUS_SUCCESS) {
        return true;
    }
    fprintf(stderr, "cuBLAS error in %s: status %d\n", what, (int)status);
    return false;
}

// Computes C = A * B on the GPU with cublasDgemm, where A is 4x3, B is 3x3 and
// C is 4x3, all stored in column-major order.
//
// Parameters:
//   c    - host output buffer receiving the 12 doubles of C.
//   a    - host input holding the 12 doubles of A.
//   b    - host input holding `size` doubles; the first 9 are used as B.
//   size - number of elements in b; must be at least 9.
//
// Every CUDA runtime and cuBLAS status is checked. On the first failure a
// diagnostic is written to stderr, the remaining steps are skipped, and c is
// left untouched. Device buffers and the cuBLAS handle are released on every
// path.
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
    // GEMM dimensions: A is m x k, B is k x n, C is m x n.
    const int m = 4;
    const int n = 3;
    const int k = 3;

    const size_t aBytes = (size_t)m * k * sizeof(double);
    const size_t bBytes = (size_t)size * sizeof(double);
    const size_t cBytes = (size_t)m * n * sizeof(double);

    // The GEMM reads k * n elements of B, so a shorter buffer would be read
    // out of bounds on the device.
    if (size < (unsigned int)(k * n)) {
        fprintf(stderr, "mulWithCuda: b has %u elements, need at least %d\n", size, k * n);
        return;
    }

    double *dev_a = 0;
    double *dev_b = 0;
    double *dev_c = 0;
    cublasHandle_t handle = 0;
    bool handleCreated = false;

    // alpha = 1 and beta = 0 select a plain product with no accumulation into C.
    const double alpha = 1.0;
    const double beta = 0.0;

    // Short-circuit evaluation stops at the first failing call.
    bool ok =
        cudaCallOk(cudaMalloc((void**)&dev_c, cBytes), "cudaMalloc(dev_c)") &&
        cudaCallOk(cudaMalloc((void**)&dev_a, aBytes), "cudaMalloc(dev_a)") &&
        cudaCallOk(cudaMalloc((void**)&dev_b, bBytes), "cudaMalloc(dev_b)") &&
        cudaCallOk(cudaMemcpy(dev_a, a, aBytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)") &&
        cudaCallOk(cudaMemcpy(dev_b, b, bBytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

    if (ok) {
        handleCreated = cublasCallOk(cublasCreate(&handle), "cublasCreate");
        ok = handleCreated;
    }

    if (ok) {
        // Column-major storage: the leading dimension of a non-transposed
        // matrix is its row count, so lda = m, ldb = k and ldc = m.
        ok = cublasCallOk(
            cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                        &alpha, dev_a, m, dev_b, k, &beta, dev_c, m),
            "cublasDgemm");
    }

    if (ok) {
        cudaCallOk(cudaMemcpy(c, dev_c, cBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    // Cleanup runs on every path; cudaFree accepts a null pointer.
    if (handleCreated) {
        cublasCallOk(cublasDestroy(handle), "cublasDestroy");
    }
    cudaCallOk(cudaFree(dev_c), "cudaFree(dev_c)");
    cudaCallOk(cudaFree(dev_a), "cudaFree(dev_a)");
    cudaCallOk(cudaFree(dev_b), "cudaFree(dev_b)");
}


输出:
380.000000 830.000000 1280.000000 
440.000000 980.000000 1520.000000 
500.000000 1130.000000 1760.000000 
560.000000 1280.000000 2000.000000