C 主机代码调用 cublasSgemm 的结果不正确
Incorrect result calling cublasSgemm by a C host code
我在 return 从 C 主机代码调用 cuBLAS 库函数 cublasSgemm 时得到了一些奇怪的数字。它确实编译并且运行,但结果矩阵中的数字不正确。
C 主机代码调用这些函数的问题是 C 语言以行优先顺序读取矩阵,而 cuBLAS 函数是用 FORTRAN 编写的,它以列优先顺序读取矩阵。
我为 cublasSgemm 尝试了多种参数组合,但似乎没有一个能正常工作。
我需要在 m1 和 m2 之间执行矩阵乘积,所以我先传递 m2,然后传递 m1,所以 cublas 函数应该读取 (m2)T 和 (m1)T,其中 T 是转置形式;通过这样做,我应该得到结果 (r)T = (m2.m1)T。
我的 C 代码最终应该将 (r)T 读作 r,但我无法获得正确的数字...
这是代码:
cudaError_t vector_matrix_molt(float *m1, float *m2, float *r, int row1, int col1, int row2, int col2) {
//Device Memory allocation
float *d_m1;
float *d_m2;
float *d_r;
float a = 1.0f;
float b = 0.0f;
int stride = 1;
//CUDA stuff
cublasHandle_t handle;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void**)&d_m1, col1*row1*sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&d_m2, row2*col2*sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&d_r, row1*col2*sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cublasCreate(&handle);
// Copy Data to Device Memory
cudaStatus = cudaMemcpy(d_m1, m1, row1*col1*sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy 1 failed!");
goto Error;
}
cudaStatus = cudaMemcpy(d_m2, m2, row2*col2*sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy 2 failed!");
goto Error;
}
/*cublasStatus_t cublasSgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc
*/
//Calling cuBLAS library function...
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, col2, row1, col1, &a, d_m2, col2, d_m1, col1, &b, d_r, row1);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "moltKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching cublasSgemv!\n", cudaStatus);
//printf("Cuda Error: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(r, d_r, row1*col2* sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy 3 failed!");
goto Error;
}
Error:
cudaFree(d_m1);
cudaFree(d_m2);
cudaFree(d_r);
return cudaStatus;
}
您唯一需要更改的是 r
的前导暗淡。
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, col2, row1, col1, &a, d_m2, col2, d_m1, col1, &b, d_r, col2);
您可以参考这个答案以获得更详细的解释。
Transpose matrix multiplication in cuBLAS howto
我在 return 从 C 主机代码调用 cuBLAS 库函数 cublasSgemm 时得到了一些奇怪的数字。它确实编译并且运行,但结果矩阵中的数字不正确。
C 主机代码调用这些函数的问题是 C 语言以行优先顺序读取矩阵,而 cuBLAS 函数是用 FORTRAN 编写的,它以列优先顺序读取矩阵。
我为 cublasSgemm 尝试了多种参数组合,但似乎没有一个能正常工作。
我需要在 m1 和 m2 之间执行矩阵乘积,所以我先传递 m2,然后传递 m1,所以 cublas 函数应该读取 (m2)T 和 (m1)T,其中 T 是转置形式;通过这样做,我应该得到结果 (r)T = (m2.m1)T。 我的 C 代码最终应该将 (r)T 读作 r,但我无法获得正确的数字... 这是代码:
cudaError_t vector_matrix_molt(float *m1, float *m2, float *r, int row1, int col1, int row2, int col2) {
//Device Memory allocation
float *d_m1;
float *d_m2;
float *d_r;
float a = 1.0f;
float b = 0.0f;
int stride = 1;
//CUDA stuff
cublasHandle_t handle;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void**)&d_m1, col1*row1*sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&d_m2, row2*col2*sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&d_r, row1*col2*sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cublasCreate(&handle);
// Copy Data to Device Memory
cudaStatus = cudaMemcpy(d_m1, m1, row1*col1*sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy 1 failed!");
goto Error;
}
cudaStatus = cudaMemcpy(d_m2, m2, row2*col2*sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy 2 failed!");
goto Error;
}
/*cublasStatus_t cublasSgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc
*/
//Calling cuBLAS library function...
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, col2, row1, col1, &a, d_m2, col2, d_m1, col1, &b, d_r, row1);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "moltKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching cublasSgemv!\n", cudaStatus);
//printf("Cuda Error: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(r, d_r, row1*col2* sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy 3 failed!");
goto Error;
}
Error:
cudaFree(d_m1);
cudaFree(d_m2);
cudaFree(d_r);
return cudaStatus;
}
您唯一需要更改的是 r
的前导暗淡。
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, col2, row1, col1, &a, d_m2, col2, d_m1, col1, &b, d_r, col2);
您可以参考这个答案以获得更详细的解释。
Transpose matrix multiplication in cuBLAS howto