cublasGemmEx 结果始终为零
cublasGemmEx result is always zero
我尝试使用 cublasGemmEx 进行矩阵乘法。a 和 b 是 1×1 的半精度(half)矩阵。如果我将计算类型和输出数据类型设置为 CUDA_R_16F,结果始终为零;如果我将计算类型和输出数据类型设置为 CUDA_R_32F,结果是正确的。
有谁知道如果我将类型设置为 CUDA_R_16F 为什么结果为零?提前感谢您的回答。
我的cuda版本是10.2,gpu是T4。
我使用命令 'nvcc -arch=sm_75 test_cublas.cu -o test_cublas -lcublas'
构建以下代码
#include "cublas_v2.h"
#include "library_types.h"
#include <stdio.h>
// Device-side initializer for the 1x1 GEMM operands and results.
//   a       <- 1.0 (half)
//   b       <- 1.5 (half)
//   c_half  <- 0.0 (half)
//   c_float <- 0.0 (float)
// Each pointer is dereferenced as a single element; every thread that runs
// the kernel performs the same four stores (it is launched as <<<1, 1>>>).
__global__ void init_kernel(half *a, half *b, half *c_half, float *c_float)
{
    // Float literals: __float2half_rn takes a float, so a bare `1.0` would be
    // a double literal that is only implicitly narrowed.
    *a = __float2half_rn(1.0f);
    *b = __float2half_rn(1.5f);
    *c_half = __float2half_rn(0.0f);
    *c_float = 0.0f;
}
// Debug kernel: reads one element through each pointer and prints all four
// values with device-side printf. The half values are widened to float first.
__global__ void print_gpu_values(half *a, half *b, half *c_half, float *c_float)
{
    const float a_val = __half2float(*a);
    const float b_val = __half2float(*b);
    const float c_half_val = __half2float(*c_half);
    const float c_float_val = *c_float;

    printf("a %f, b %f, c_half %f, c_float %f\n", a_val, b_val, c_half_val, c_float_val);
}
// Reproduction program: multiplies two 1x1 half matrices with cublasGemmEx
// twice -- once with half output/compute type (result in c_half) and once
// with float output/compute type (result in c_float) -- then prints the
// operands and both results from a device kernel.
// Returns 0 on completion; exits with -1 if stream creation, cublasSetStream
// or either cublasGemmEx call reports failure.
int main(int argc, char **argv)
{
// All kernels and cuBLAS calls below are issued on this non-blocking stream.
cudaStream_t cudaStream;
if (cudaSuccess != cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking))
{
printf("create cuda stream failed\n");
exit(-1);
}
cublasHandle_t handle;
// NOTE(review): the status returned by cublasCreate is not checked.
cublasCreate(&handle);
// Bind the handle to the stream so the GEMMs are ordered after init_kernel.
if (CUBLAS_STATUS_SUCCESS != cublasSetStream(handle, cudaStream))
{
printf("cublas set stream failed\n");
exit(-1);
}
// One-element device buffers: two half inputs, one half and one float output.
half *a;
half *b;
half *c_half;
float *c_float;
// NOTE(review): the cudaMalloc return codes are not checked.
cudaMalloc(&a, sizeof(half));
cudaMalloc(&b, sizeof(half));
cudaMalloc(&c_half, sizeof(half));
cudaMalloc(&c_float,sizeof(float));
// Host-side scaling factors, passed by pointer to both GEMM calls.
// NOTE(review): these are float, but the first GEMM below uses CUDA_R_16F as
// its compute type; the cuBLAS documentation requires alpha/beta to be of the
// same type as computeType. Presumably this is why c_half stays zero: the low
// 16 bits of 1.0f (0x3F800000) are all zero, which reads as half 0.0 on a
// little-endian host -- TODO confirm.
float alpha = 1.0;
float beta = 1.0;
// Fill the device buffers: a = 1.0, b = 1.5, c_half = 0, c_float = 0.
init_kernel<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
// GEMM #1: m = n = k = 1, half inputs, half output, half compute type.
if (CUBLAS_STATUS_SUCCESS != cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
&alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1, &beta, c_half, CUDA_R_16F, 1, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP))
{
printf("cublasGemmEx failed\n");
exit(-1);
}
// GEMM #2: same half inputs, float output, float compute type (matches the
// float alpha/beta above).
if (CUBLAS_STATUS_SUCCESS != cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
&alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1, &beta, c_float, CUDA_R_32F, 1, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP))
{
printf("cublasGemmEx failed\n");
exit(-1);
}
// Print the inputs and both results from the device, on the same stream.
print_gpu_values<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
// Wait for all work queued on the stream before returning.
// NOTE(review): device buffers, the cuBLAS handle and the stream are not
// released before returning.
cudaStreamSynchronize(cudaStream);
return 0;
}
根据 cublasGemmEx 的文档,alpha 和 beta 参数的说明都是:
“of same type as computeType”(即与 computeType 的类型相同)
但是您的代码不满足该要求。对于(能正常工作的)CUDA_R_32F 情况,您的 alpha 和 beta 参数是 float 类型,与计算类型匹配;对于(不能正常工作的)CUDA_R_16F 情况,它们不匹配:此时 cuBLAS 要求传入 half 类型的 alpha 和 beta,而您传入的仍是指向 float 的指针,因此实际读到的缩放因子并不是 1.0。
当我使用该更改修改您的代码时,我在 CUDA 11.0 上得到了正确的结果:
# cat t3.cu
#include "cublas_v2.h"
#include "library_types.h"
#include <stdio.h>
// Device-side initializer for the 1x1 GEMM operands and results.
//   a       <- 1.0 (half)
//   b       <- 1.5 (half)
//   c_half  <- 0.0 (half)
//   c_float <- 0.0 (float)
// Each pointer is dereferenced as a single element; every thread that runs
// the kernel performs the same four stores (it is launched as <<<1, 1>>>).
__global__ void init_kernel(half *a, half *b, half *c_half, float *c_float)
{
    // Float literals: __float2half_rn takes a float, so a bare `1.0` would be
    // a double literal that is only implicitly narrowed.
    *a = __float2half_rn(1.0f);
    *b = __float2half_rn(1.5f);
    *c_half = __float2half_rn(0.0f);
    *c_float = 0.0f;
}
// Debug kernel: reads one element through each pointer and prints all four
// values with device-side printf. The half values are widened to float first.
__global__ void print_gpu_values(half *a, half *b, half *c_half, float *c_float)
{
    const float a_val = __half2float(*a);
    const float b_val = __half2float(*b);
    const float c_half_val = __half2float(*c_half);
    const float c_float_val = *c_float;

    printf("a %f, b %f, c_half %f, c_float %f\n", a_val, b_val, c_half_val, c_float_val);
}
// Prints `msg` (plus the CUDA error string on stderr) and exits with -1 when a
// CUDA runtime call did not return cudaSuccess.
static void check_cuda(cudaError_t status, const char *msg)
{
    if (status == cudaSuccess)
    {
        return;
    }
    printf("%s\n", msg);
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    exit(-1);
}

// Prints `msg` (plus the numeric cuBLAS status on stderr) and exits with -1
// when a cuBLAS call did not return CUBLAS_STATUS_SUCCESS.
static void check_cublas(cublasStatus_t status, const char *msg)
{
    if (status == CUBLAS_STATUS_SUCCESS)
    {
        return;
    }
    printf("%s\n", msg);
    fprintf(stderr, "cuBLAS status: %d\n", (int)status);
    exit(-1);
}

// Multiplies two 1x1 half matrices with cublasGemmEx twice -- once with half
// output/compute type (result in c_half) and once with float output/compute
// type (result in c_float) -- and prints the operands and both results.
//
// alpha/beta are passed in the type matching each call's compute type:
// half scalars for the CUDA_R_16F call, float scalars for the CUDA_R_32F call.
//
// Returns 0 on success; exits with -1 as soon as any CUDA or cuBLAS call fails.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // All kernels and cuBLAS calls below are issued on this non-blocking stream.
    cudaStream_t cudaStream;
    check_cuda(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking),
               "create cuda stream failed");

    cublasHandle_t handle;
    check_cublas(cublasCreate(&handle), "cublas create failed");
    // Bind the handle to the stream so the GEMMs are ordered after init_kernel.
    check_cublas(cublasSetStream(handle, cudaStream), "cublas set stream failed");

    // One-element device buffers: two half inputs, one half and one float output.
    half *a = NULL;
    half *b = NULL;
    half *c_half = NULL;
    float *c_float = NULL;
    check_cuda(cudaMalloc(&a, sizeof(half)), "cudaMalloc a failed");
    check_cuda(cudaMalloc(&b, sizeof(half)), "cudaMalloc b failed");
    check_cuda(cudaMalloc(&c_half, sizeof(half)), "cudaMalloc c_half failed");
    check_cuda(cudaMalloc(&c_float, sizeof(float)), "cudaMalloc c_float failed");

    // Scaling factors for the float-compute GEMM ...
    const float alpha = 1.0f;
    const float beta = 1.0f;
    // ... and the same values as half for the half-compute GEMM.
    const half halpha = __float2half_rn(alpha);
    const half hbeta = __float2half_rn(beta);

    // Fill the device buffers: a = 1.0, b = 1.5, c_half = 0, c_float = 0.
    init_kernel<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
    check_cuda(cudaGetLastError(), "init_kernel launch failed");

    // GEMM #1: m = n = k = 1, half inputs, half output, half compute type,
    // therefore half alpha/beta.
    check_cublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
                              &halpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1,
                              &hbeta, c_half, CUDA_R_16F, 1,
                              CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP),
                 "cublasGemmEx failed");

    // GEMM #2: same half inputs, float output, float compute type,
    // therefore float alpha/beta.
    check_cublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
                              &alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1,
                              &beta, c_float, CUDA_R_32F, 1,
                              CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP),
                 "cublasGemmEx failed");

    // Print the inputs and both results from the device, on the same stream.
    print_gpu_values<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
    check_cuda(cudaGetLastError(), "print_gpu_values launch failed");

    // Wait for all queued work; asynchronous execution errors surface here.
    check_cuda(cudaStreamSynchronize(cudaStream), "cuda stream synchronize failed");

    // Release everything acquired above.
    check_cuda(cudaFree(a), "cudaFree a failed");
    check_cuda(cudaFree(b), "cudaFree b failed");
    check_cuda(cudaFree(c_half), "cudaFree c_half failed");
    check_cuda(cudaFree(c_float), "cudaFree c_float failed");
    check_cublas(cublasDestroy(handle), "cublas destroy failed");
    check_cuda(cudaStreamDestroy(cudaStream), "destroy cuda stream failed");
    return 0;
}
# nvcc t3.cu -o t3 -lcublas
# cuda-memcheck ./t3
========= CUDA-MEMCHECK
a 1.000000, b 1.500000, c_half 1.500000, c_float 1.500000
========= ERROR SUMMARY: 0 errors
# nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0
#
我尝试使用 cublasGemmEx 进行矩阵乘法。a 和 b 是 1×1 的半精度(half)矩阵。如果我将计算类型和输出数据类型设置为 CUDA_R_16F,结果始终为零;如果我将计算类型和输出数据类型设置为 CUDA_R_32F,结果是正确的。
有谁知道如果我将类型设置为 CUDA_R_16F 为什么结果为零?提前感谢您的回答。
我的 CUDA 版本是 10.2,GPU 是 T4。我使用命令 'nvcc -arch=sm_75 test_cublas.cu -o test_cublas -lcublas' 构建以下代码:
#include "cublas_v2.h"
#include "library_types.h"
#include <stdio.h>
// Device-side initializer for the 1x1 GEMM operands and results.
//   a       <- 1.0 (half)
//   b       <- 1.5 (half)
//   c_half  <- 0.0 (half)
//   c_float <- 0.0 (float)
// Each pointer is dereferenced as a single element; every thread that runs
// the kernel performs the same four stores (it is launched as <<<1, 1>>>).
__global__ void init_kernel(half *a, half *b, half *c_half, float *c_float)
{
    // Float literals: __float2half_rn takes a float, so a bare `1.0` would be
    // a double literal that is only implicitly narrowed.
    *a = __float2half_rn(1.0f);
    *b = __float2half_rn(1.5f);
    *c_half = __float2half_rn(0.0f);
    *c_float = 0.0f;
}
// Debug kernel: reads one element through each pointer and prints all four
// values with device-side printf. The half values are widened to float first.
__global__ void print_gpu_values(half *a, half *b, half *c_half, float *c_float)
{
    const float a_val = __half2float(*a);
    const float b_val = __half2float(*b);
    const float c_half_val = __half2float(*c_half);
    const float c_float_val = *c_float;

    printf("a %f, b %f, c_half %f, c_float %f\n", a_val, b_val, c_half_val, c_float_val);
}
// Reproduction program: multiplies two 1x1 half matrices with cublasGemmEx
// twice -- once with half output/compute type (result in c_half) and once
// with float output/compute type (result in c_float) -- then prints the
// operands and both results from a device kernel.
// Returns 0 on completion; exits with -1 if stream creation, cublasSetStream
// or either cublasGemmEx call reports failure.
int main(int argc, char **argv)
{
// All kernels and cuBLAS calls below are issued on this non-blocking stream.
cudaStream_t cudaStream;
if (cudaSuccess != cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking))
{
printf("create cuda stream failed\n");
exit(-1);
}
cublasHandle_t handle;
// NOTE(review): the status returned by cublasCreate is not checked.
cublasCreate(&handle);
// Bind the handle to the stream so the GEMMs are ordered after init_kernel.
if (CUBLAS_STATUS_SUCCESS != cublasSetStream(handle, cudaStream))
{
printf("cublas set stream failed\n");
exit(-1);
}
// One-element device buffers: two half inputs, one half and one float output.
half *a;
half *b;
half *c_half;
float *c_float;
// NOTE(review): the cudaMalloc return codes are not checked.
cudaMalloc(&a, sizeof(half));
cudaMalloc(&b, sizeof(half));
cudaMalloc(&c_half, sizeof(half));
cudaMalloc(&c_float,sizeof(float));
// Host-side scaling factors, passed by pointer to both GEMM calls.
// NOTE(review): these are float, but the first GEMM below uses CUDA_R_16F as
// its compute type; the cuBLAS documentation requires alpha/beta to be of the
// same type as computeType. Presumably this is why c_half stays zero: the low
// 16 bits of 1.0f (0x3F800000) are all zero, which reads as half 0.0 on a
// little-endian host -- TODO confirm.
float alpha = 1.0;
float beta = 1.0;
// Fill the device buffers: a = 1.0, b = 1.5, c_half = 0, c_float = 0.
init_kernel<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
// GEMM #1: m = n = k = 1, half inputs, half output, half compute type.
if (CUBLAS_STATUS_SUCCESS != cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
&alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1, &beta, c_half, CUDA_R_16F, 1, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP))
{
printf("cublasGemmEx failed\n");
exit(-1);
}
// GEMM #2: same half inputs, float output, float compute type (matches the
// float alpha/beta above).
if (CUBLAS_STATUS_SUCCESS != cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
&alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1, &beta, c_float, CUDA_R_32F, 1, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP))
{
printf("cublasGemmEx failed\n");
exit(-1);
}
// Print the inputs and both results from the device, on the same stream.
print_gpu_values<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
// Wait for all work queued on the stream before returning.
// NOTE(review): device buffers, the cuBLAS handle and the stream are not
// released before returning.
cudaStreamSynchronize(cudaStream);
return 0;
}
根据 cublasGemmEx 的文档,alpha 和 beta 参数的说明都是:
“of same type as computeType”(即与 computeType 的类型相同)
但是您的代码不满足该要求。对于(能正常工作的)CUDA_R_32F 情况,您的 alpha 和 beta 参数是 float 类型,与计算类型匹配;对于(不能正常工作的)CUDA_R_16F 情况,它们不匹配:此时 cuBLAS 要求传入 half 类型的 alpha 和 beta,而您传入的仍是指向 float 的指针,因此实际读到的缩放因子并不是 1.0。
当我使用该更改修改您的代码时,我在 CUDA 11.0 上得到了正确的结果:
# cat t3.cu
#include "cublas_v2.h"
#include "library_types.h"
#include <stdio.h>
// Device-side initializer for the 1x1 GEMM operands and results.
//   a       <- 1.0 (half)
//   b       <- 1.5 (half)
//   c_half  <- 0.0 (half)
//   c_float <- 0.0 (float)
// Each pointer is dereferenced as a single element; every thread that runs
// the kernel performs the same four stores (it is launched as <<<1, 1>>>).
__global__ void init_kernel(half *a, half *b, half *c_half, float *c_float)
{
    // Float literals: __float2half_rn takes a float, so a bare `1.0` would be
    // a double literal that is only implicitly narrowed.
    *a = __float2half_rn(1.0f);
    *b = __float2half_rn(1.5f);
    *c_half = __float2half_rn(0.0f);
    *c_float = 0.0f;
}
// Debug kernel: reads one element through each pointer and prints all four
// values with device-side printf. The half values are widened to float first.
__global__ void print_gpu_values(half *a, half *b, half *c_half, float *c_float)
{
    const float a_val = __half2float(*a);
    const float b_val = __half2float(*b);
    const float c_half_val = __half2float(*c_half);
    const float c_float_val = *c_float;

    printf("a %f, b %f, c_half %f, c_float %f\n", a_val, b_val, c_half_val, c_float_val);
}
// Prints `msg` (plus the CUDA error string on stderr) and exits with -1 when a
// CUDA runtime call did not return cudaSuccess.
static void check_cuda(cudaError_t status, const char *msg)
{
    if (status == cudaSuccess)
    {
        return;
    }
    printf("%s\n", msg);
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    exit(-1);
}

// Prints `msg` (plus the numeric cuBLAS status on stderr) and exits with -1
// when a cuBLAS call did not return CUBLAS_STATUS_SUCCESS.
static void check_cublas(cublasStatus_t status, const char *msg)
{
    if (status == CUBLAS_STATUS_SUCCESS)
    {
        return;
    }
    printf("%s\n", msg);
    fprintf(stderr, "cuBLAS status: %d\n", (int)status);
    exit(-1);
}

// Multiplies two 1x1 half matrices with cublasGemmEx twice -- once with half
// output/compute type (result in c_half) and once with float output/compute
// type (result in c_float) -- and prints the operands and both results.
//
// alpha/beta are passed in the type matching each call's compute type:
// half scalars for the CUDA_R_16F call, float scalars for the CUDA_R_32F call.
//
// Returns 0 on success; exits with -1 as soon as any CUDA or cuBLAS call fails.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // All kernels and cuBLAS calls below are issued on this non-blocking stream.
    cudaStream_t cudaStream;
    check_cuda(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking),
               "create cuda stream failed");

    cublasHandle_t handle;
    check_cublas(cublasCreate(&handle), "cublas create failed");
    // Bind the handle to the stream so the GEMMs are ordered after init_kernel.
    check_cublas(cublasSetStream(handle, cudaStream), "cublas set stream failed");

    // One-element device buffers: two half inputs, one half and one float output.
    half *a = NULL;
    half *b = NULL;
    half *c_half = NULL;
    float *c_float = NULL;
    check_cuda(cudaMalloc(&a, sizeof(half)), "cudaMalloc a failed");
    check_cuda(cudaMalloc(&b, sizeof(half)), "cudaMalloc b failed");
    check_cuda(cudaMalloc(&c_half, sizeof(half)), "cudaMalloc c_half failed");
    check_cuda(cudaMalloc(&c_float, sizeof(float)), "cudaMalloc c_float failed");

    // Scaling factors for the float-compute GEMM ...
    const float alpha = 1.0f;
    const float beta = 1.0f;
    // ... and the same values as half for the half-compute GEMM.
    const half halpha = __float2half_rn(alpha);
    const half hbeta = __float2half_rn(beta);

    // Fill the device buffers: a = 1.0, b = 1.5, c_half = 0, c_float = 0.
    init_kernel<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
    check_cuda(cudaGetLastError(), "init_kernel launch failed");

    // GEMM #1: m = n = k = 1, half inputs, half output, half compute type,
    // therefore half alpha/beta.
    check_cublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
                              &halpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1,
                              &hbeta, c_half, CUDA_R_16F, 1,
                              CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP),
                 "cublasGemmEx failed");

    // GEMM #2: same half inputs, float output, float compute type,
    // therefore float alpha/beta.
    check_cublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
                              &alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1,
                              &beta, c_float, CUDA_R_32F, 1,
                              CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP),
                 "cublasGemmEx failed");

    // Print the inputs and both results from the device, on the same stream.
    print_gpu_values<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
    check_cuda(cudaGetLastError(), "print_gpu_values launch failed");

    // Wait for all queued work; asynchronous execution errors surface here.
    check_cuda(cudaStreamSynchronize(cudaStream), "cuda stream synchronize failed");

    // Release everything acquired above.
    check_cuda(cudaFree(a), "cudaFree a failed");
    check_cuda(cudaFree(b), "cudaFree b failed");
    check_cuda(cudaFree(c_half), "cudaFree c_half failed");
    check_cuda(cudaFree(c_float), "cudaFree c_float failed");
    check_cublas(cublasDestroy(handle), "cublas destroy failed");
    check_cuda(cudaStreamDestroy(cudaStream), "destroy cuda stream failed");
    return 0;
}
# nvcc t3.cu -o t3 -lcublas
# cuda-memcheck ./t3
========= CUDA-MEMCHECK
a 1.000000, b 1.500000, c_half 1.500000, c_float 1.500000
========= ERROR SUMMARY: 0 errors
# nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0
#