行主矩阵的 cublassgemm

Question

我真的试着用 C 实现一个函数来乘以 cublas 中的行主矩阵。我不知道我哪里弄错了。下面的函数中A、B、C是正确指向一个行矩阵的指针分配。我想保留在执行产品之前翻译矩阵的选项。以下功能无效。

void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){

    cublasStatus_t stat ; // CUBLAS functions status
    float alfa = 1;
    float beta = 0;

    int
    ma = transA ? n:m,
    na = transA ? m:n,


    nb = transB ? k:n,
    mb = transB ? n:k;


    if(na!=mb){
        puts("Something wrong");
    }

    //(mb,nb)(ma,na) = (mb,na)
    stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
                nb,ma,mb,&alfa,
                B,k,
                A,n,&beta,
                C,m);

    switch (stat) {
        case CUBLAS_STATUS_SUCCESS:
            puts("Sucess");
            break;
        default:
            printf(">>>>ERRO  %d<<<<\n",stat);
            break;
    }

}

整个源代码

// Utilities and system includes
#include <assert.h>
#include <helper_string.h>  // helper for shared functions common to CUDA Samples

// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>

// CUDA and CUBLAS functions
#include <helper_functions.h>

void getFromDevice(float *h_A,float *d_A,int size){
    //printf("Copy input data from the host memory to the CUDA device\n");
    cudaError_t err  = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
//A = (m,n)
//B = (n,k)
//C = (m,k)
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){

    cublasStatus_t stat ; // CUBLAS functions status
    float alfa = 1;
    float beta = 0;

    int
    ma = transA ? n:m,
    na = transA ? m:n,


    nb = transB ? k:n,
    mb = transB ? n:k;


    if(na!=mb){
        puts("Something wrong");
    }

    //(mb,nb)(ma,na) = (mb,na)
    stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
                nb,ma,mb,&alfa,
                B,k,
                A,n,&beta,
                C,m);

    switch (stat) {
        case CUBLAS_STATUS_SUCCESS:
            puts("Sucess");
            break;
        default:
            printf(">>>>ERRO  %d<<<<\n",stat);
            break;
    }

}


float *mallocfDevice(int size){
    float *d_C = NULL;
    cudaError_t err  = cudaMalloc((void **)&d_C, size * sizeof(float));

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }else{
        size_t freeM,  total;
        cudaMemGetInfo  (   &freeM, &total);
        printf("MEM:%.3f\n",freeM,total,100 - ((double)freeM/total)*100 );
    }
    return d_C;
}



void printHostMatrix(int nl, int nc, float *h_s){
    for(int j = 0; j < nl ; j++) {
        for(int i = 0; i < (nc) ; i++){
            int idx = j*nc + i;
            printf("%.2f ", h_s[idx]);
        }
        printf("\n");
    }

}


void printfDeviceMatrix(float *d_s,int m, int p){
    float *h_s =(float*) malloc(sizeof(float)*m*p);
    getFromDevice(h_s,d_s,sizeof(float)*m*p);
    printHostMatrix(m,p,h_s);
    free(h_s);
}


void sendTofDevice(float *h_A,float *d_A,int size){
    //printf("Copy input data from the host memory to the CUDA device\n");
    cudaError_t err  = cudaMemcpy(d_A, h_A, size*sizeof(float), cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}


int main(int argc,char **argv){

    int ma = 2,
        na = 3,
        mb = 3,
        nb = 2;

    float A[] = { 1,2,3,
                 4,5,6};
    float B[] = {7, 8,
                9,10,
                11,12};

    float *C = new float[ma*nb];



    float *d_a = mallocfDevice(ma*mb),
          *d_b = mallocfDevice(mb*nb),
          *d_c = mallocfDevice(ma*nb);

    sendTofDevice(A,d_a,ma*na);
    sendTofDevice(B,d_b,mb*nb);

    cublasHandle_t handle ; // CUBLAS context
    cublasCreate (&handle );

    puts("A");
    printfDeviceMatrix(d_a,ma,na);
    puts("B");
    printfDeviceMatrix(d_b,mb,nb);

    matrixMul(handle,  d_a,d_b,d_c,
                       ma,na,nb,0,0);

    puts("AB=C");
    printfDeviceMatrix(d_c,ma,nb);


}

Answer 1

CUBLAS 假定设备中的矩阵存储在 major 列中：

” 其中 α 和 β 是标量，A、B 和 C 是以列优先格式存储的矩阵，维度分别为 op ( A ) m × k 、op ( B ) k × n 和 C m × n 。此外，对于矩阵 A

阅读更多内容：http://docs.nvidia.com/cuda/cublas/index.html#ixzz3mSDJTWrM“

这意味着矩阵在设备上的处理方式与在主机上的处理方式不同。

行主矩阵的 cublassgemm

cublassgemm for row-major matrix

cuda

cublas