half2在CUDA中的使用
Use of half2 in CUDA
我正在尝试使用 half2,但编译时出现了如下错误:
error: class "__half2" has no member "y"
出现错误的那段代码如下:
uint8_t V_ [128]; // some elements (uint8), to save space
float V_C[128]; // storing the diff to use later
half2 *C_ = C.elements; // D halfs stored as half2, to be read
Cvalue = 0.0;
for (d = 0; d < D; d+=2)
{
V_C [d ] = V_[d] - __half2float(C_[d/2].x) ;
V_C [d+1] = V_[d+1] - __half2float(C_[d/2].y) ;
Cvalue += V_C [d] * V_C [d] ;
Cvalue += V_C [d+1] * V_C [d+1];
}
有人能帮忙看看吗?
更新:
谢谢您的帮助!我最终使用了以下代码:
uint8_t V_ [128] ;
float V_C[128] ;
const half2 *C_ = C.elements;
Cvalue = 0.0;
float2 temp_;
for (d = 0; d < D; d+=2)
{
temp_ = __half22float2(C_[d/2]);
V_C [d ] = V_[d] - temp_.x ;
V_C [d+1] = V_[d+1] - temp_.y ;
Cvalue += V_C [d] * V_C [d] ;
Cvalue += V_C [d+1] * V_C [d+1];
}
我的应用程序因此略有加速,因为从全局内存读取(load)数据正是瓶颈所在……
您不能使用点运算符访问 half2 的各个分量,而应该使用下面这些内置函数(intrinsics):
__CUDA_FP16_DECL__ float __high2float ( const __half2 a )
Converts high 16 bits of half2 to float and returns the result.
__CUDA_FP16_DECL__ __half __high2half ( const __half2 a )
Returns high 16 bits of half2 input.
__CUDA_FP16_DECL__ __half2 __high2half2 ( const __half2 a )
Extracts high 16 bits from half2 input.
__CUDA_FP16_DECL__ __half2 __highs2half2 ( const __half2 a, const __half2 b )
Extracts high 16 bits from each of the two half2 inputs and combines into one half2 number.
__CUDA_FP16_DECL__ float __low2float ( const __half2 a )
Converts low 16 bits of half2 to float and returns the result.
__CUDA_FP16_DECL__ __half __low2half ( const __half2 a )
Returns low 16 bits of half2 input.
__CUDA_FP16_DECL__ __half2 __low2half2 ( const __half2 a )
Extracts low 16 bits from half2 input.
__CUDA_FP16_DECL__ __half2 __lowhigh2highlow ( const __half2 a )
Swaps both halves of the half2 input.
__CUDA_FP16_DECL__ __half2 __lows2half2 ( const __half2 a, const __half2 b )
Extracts low 16 bits from each of the two half2 inputs and combines into one half2 number.
不仅如此,取决于 C.elements 是什么类型,下面这一行
half2 *C_ = C.elements; // D halfs stored as half2, to be read
可能是错误的(如果 C.elements 是 half* 的话;这里的注释并不清楚)。
half2 并不是一对 half。
事实上,在当前的实现中,half2 只是一个包裹在结构体中的 unsigned int:
// cuda_fp16.h
typedef struct __align__(2) {
unsigned short x;
} __half;
typedef struct __align__(4) {
unsigned int x;
} __half2;
#ifndef CUDA_NO_HALF
typedef __half half;
typedef __half2 half2;
#endif /*CUDA_NO_HALF*/
没有任何规定保证 half 数组可以被当作 half2 数组来访问。
我正在尝试使用 half2,但编译时出现了如下错误:
error: class "__half2" has no member "y"
出现错误的那段代码如下:
uint8_t V_ [128]; // some elements (uint8), to save space
float V_C[128]; // storing the diff to use later
half2 *C_ = C.elements; // D halfs stored as half2, to be read
Cvalue = 0.0;
for (d = 0; d < D; d+=2)
{
V_C [d ] = V_[d] - __half2float(C_[d/2].x) ;
V_C [d+1] = V_[d+1] - __half2float(C_[d/2].y) ;
Cvalue += V_C [d] * V_C [d] ;
Cvalue += V_C [d+1] * V_C [d+1];
}
有人能帮忙看看吗?
更新:谢谢您的帮助!我最终使用了以下代码:
uint8_t V_ [128] ;
float V_C[128] ;
const half2 *C_ = C.elements;
Cvalue = 0.0;
float2 temp_;
for (d = 0; d < D; d+=2)
{
temp_ = __half22float2(C_[d/2]);
V_C [d ] = V_[d] - temp_.x ;
V_C [d+1] = V_[d+1] - temp_.y ;
Cvalue += V_C [d] * V_C [d] ;
Cvalue += V_C [d+1] * V_C [d+1];
}
我的应用程序因此略有加速,因为从全局内存读取(load)数据正是瓶颈所在……
您不能使用点运算符访问 half2 的各个分量,而应该使用下面这些内置函数(intrinsics):
__CUDA_FP16_DECL__ float __high2float ( const __half2 a )
Converts high 16 bits of half2 to float and returns the result.
__CUDA_FP16_DECL__ __half __high2half ( const __half2 a )
Returns high 16 bits of half2 input.
__CUDA_FP16_DECL__ __half2 __high2half2 ( const __half2 a )
Extracts high 16 bits from half2 input.
__CUDA_FP16_DECL__ __half2 __highs2half2 ( const __half2 a, const __half2 b )
Extracts high 16 bits from each of the two half2 inputs and combines into one half2 number.
__CUDA_FP16_DECL__ float __low2float ( const __half2 a )
Converts low 16 bits of half2 to float and returns the result.
__CUDA_FP16_DECL__ __half __low2half ( const __half2 a )
Returns low 16 bits of half2 input.
__CUDA_FP16_DECL__ __half2 __low2half2 ( const __half2 a )
Extracts low 16 bits from half2 input.
__CUDA_FP16_DECL__ __half2 __lowhigh2highlow ( const __half2 a )
Swaps both halves of the half2 input.
__CUDA_FP16_DECL__ __half2 __lows2half2 ( const __half2 a, const __half2 b )
Extracts low 16 bits from each of the two half2 inputs and combines into one half2 number.
不仅如此,取决于 C.elements 是什么类型,下面这一行
half2 *C_ = C.elements; // D halfs stored as half2, to be read
可能是错误的(如果 C.elements 是 half* 的话;这里的注释并不清楚)。
half2 并不是一对 half。
事实上,在当前的实现中,half2 只是一个包裹在结构体中的 unsigned int:
// cuda_fp16.h
typedef struct __align__(2) {
unsigned short x;
} __half;
typedef struct __align__(4) {
unsigned int x;
} __half2;
#ifndef CUDA_NO_HALF
typedef __half half;
typedef __half2 half2;
#endif /*CUDA_NO_HALF*/
没有任何规定保证 half 数组可以被当作 half2 数组来访问。