将半浮点数转换为浮点数
Bit shifting a half-float into a float
我别无选择,只能读入构成半浮点数的 2 个字节。我想以 4 字节浮点数的形式使用它。我做了一些研究,唯一能想到的办法就是位移。我唯一的问题是,我不完全理解如何只取出其中的几位并把它们放进一个 float 中。我写了下面这个函数,但它不起作用:
/* Asker's attempt at turning the two bytes of a half-float into a float.
   The accompanying text states that it does not work; the comments below
   point out what the expression actually computes.

   v1, v2: the two bytes read from the input.
   NOTE(review): the text does not say which byte is the high-order one.
   NOTE(review): plain char has implementation-defined signedness; if it is
   signed, a negative v1 makes v1 << 6 undefined and v1 >> 2
   implementation-defined -- confirm and prefer unsigned bytes.

   Returns the integer result of the OR expression converted to float.
*/
float ToShortFloat(char v1, char v2) {
/* Everything on the right-hand side is integer arithmetic. (0x00) << 3 and
   (0x00) << 13 are both zero and contribute nothing to the OR. The
   initialization then converts the integer's numeric value to float; it does
   not reinterpret the bit pattern as a floating-point encoding.
*/
float f = ((v1 << 6) | (0x00) << 3 | (v1 >> 2) | v2 | (0x00) << 13);
return f;
}
这是16位(2字节)结构
这是典型的 32 位(4 字节)浮点数
如果您要为我编写代码,请详细说明。我想了解位运算符和位放置的真实情况。
这是演示 16 位 floating-point 到 32 位 floating-point 转换的代码以及一个测试程序。测试程序需要Clang的__fp16
类型,但是转换代码不需要。未测试 NaN 有效负载和 signaling/non-signaling 语义的处理。
#include <stdint.h>
// Yield a 32-bit word in which only bit n is set. n must be less than 32.
#define Bit(n) ((uint32_t) 1 << (n))
// Yield a 32-bit word whose n lowest bits are set. n must be less than 32.
#define Mask(n) (Bit(n) - 1)
/*  Translate an IEEE-754 16-bit binary floating-point encoding into the
    IEEE-754 32-bit binary floating-point encoding of the same value.

    x is the 16-bit encoding: 1 sign bit (bit 15), 5 exponent bits (bits
    10-14), and 10 fraction bits (bits 0-9).

    The return value is the 32-bit encoding (1 sign bit, 8 exponent bits,
    23 fraction bits) as an integer, not as a float object.
*/
uint32_t Float16ToFloat32(uint16_t x)
{
    // Difference between the 32-bit exponent bias and the 16-bit one.
    enum { BiasDelta = 127 - 15 };

    // Pull the three fields out of the 16-bit encoding.
    uint32_t sign = x >> 15;
    uint32_t exponent = (x >> 10) & Mask(5);
    // Take the 10 fraction bits and left-justify them in a 23-bit field.
    uint32_t fraction = (uint32_t) (x & Mask(10)) << (23 - 10);

    if (exponent == 31)
    {
        // Infinity or NaN: use the all-ones 32-bit exponent code.
        exponent = 255;
    }
    else if (exponent != 0)
    {
        // Normal number: only the exponent bias changes.
        exponent += BiasDelta;
    }
    else if (fraction != 0)
    {
        /*  Subnormal number (zero needs no changes): start from the 32-bit
            exponent code that matches the 16-bit subnormal exponent, then
            shift the significand left, lowering the exponent each time,
            until its leading bit has moved just out of the 23-bit field.
        */
        for (exponent = 1 + BiasDelta; fraction < Bit(23); fraction <<= 1)
        {
            exponent -= 1;
        }
        // Drop the leading bit, which is implicit in a normal encoding.
        fraction &= Mask(23);
    }

    // Put the three fields together into the 32-bit encoding.
    return sign << 31 | exponent << 23 | fraction;
}
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/*  Test driver: runs every 16-bit encoding through Float16ToFloat32 and
    compares the result with the value the compiler's __fp16 type gives for
    the same encoding. On the first mismatch it prints both encodings and
    values and exits with EXIT_FAILURE; otherwise it returns EXIT_SUCCESS.
*/
int main(void)
{
    // Unions give access to both the raw encoding and the value it denotes.
    union { uint16_t enc; __fp16 value; } x;
    union { uint32_t enc; float value; } y;
    /*  Visit all 16-bit encodings. The counter is wider than 16 bits so that
        it can reach the bound Bit(16) and end the loop.
    */
    for (uint32_t i = 0; i < Bit(16); ++i)
    {
        x.enc = (uint16_t) i;
        y.enc = Float16ToFloat32(x.enc);
        /*  isnan only promises zero versus nonzero, so both results are
            normalized with ! before being compared; comparing the raw
            return values could report a mismatch between two NaNs.
            Non-NaN values must compare equal.
        */
        if (!isnan(x.value) != !isnan(y.value) ||
            (!isnan(x.value) && x.value != y.value))
        {
            printf("Failure:\n");
            printf("\tx encoding = 0x%04" PRIx16 ", value = %.99g.\n",
                x.enc, x.value);
            printf("\ty encoding = 0x%08" PRIx32 ", value = %.99g.\n",
                y.enc, y.value);
            exit(EXIT_FAILURE);
        }
    }
    return EXIT_SUCCESS;
}
正如 chtz 指出的那样,我们可以使用 32 位 floating-point 运算来处理正常值和次正常值的缩放调整。为此,请将 Float16ToFloat32 中 f <<= 23 - 10; 之后的代码替换为:
/* Replacement tail for Float16ToFloat32. At this point s is the sign bit,
   e is the 5-bit exponent code, and f is the fraction already shifted into
   the 23-bit field.
*/
// Infinities and NaNs (exponent code 31): emit the 32-bit exponent code 255
// and keep the shifted fraction bits as they are.
if (e == 31)
return s << 31 | 255 << 23 | f;
/* For finite values, reassemble with shifted fields, reusing the 16-bit
   exponent code unchanged, and use a floating-point multiply by 2^112
   (that is, 2^(127 - 15)) to adjust for the changed exponent bias.
   NOTE(review): a zero exponent code makes the assembled value a 32-bit
   subnormal, so this presumably relies on the multiply honoring subnormal
   operands (no flush-to-zero mode) -- confirm for the target.
*/
union { uint32_t enc; float value; } y = { .enc = s << 31 | e << 23 | f };
y.value *= 0x1p112f;
// Return the encoding of the rescaled value.
return y.enc;
虽然这个问题已经有了给出正确实现的回答,但转换还可以做得更快。这里提供了更快的 IEEE-754 FP32<->FP16 转换算法,没有任何循环或分支。这些算法能处理正常数和次正常数,并放弃对 NaN/Inf 的支持,以换取两倍的数值范围。
我别无选择,只能读入构成半浮点数的 2 个字节。我想以 4 字节浮点数的形式使用它。我做了一些研究,唯一能想到的办法就是位移。我唯一的问题是,我不完全理解如何只取出其中的几位并把它们放进一个 float 中。我写了下面这个函数,但它不起作用:
/* Asker's attempt at turning the two bytes of a half-float into a float.
   The accompanying text states that it does not work; the comments below
   point out what the expression actually computes.

   v1, v2: the two bytes read from the input.
   NOTE(review): the text does not say which byte is the high-order one.
   NOTE(review): plain char has implementation-defined signedness; if it is
   signed, a negative v1 makes v1 << 6 undefined and v1 >> 2
   implementation-defined -- confirm and prefer unsigned bytes.

   Returns the integer result of the OR expression converted to float.
*/
float ToShortFloat(char v1, char v2) {
/* Everything on the right-hand side is integer arithmetic. (0x00) << 3 and
   (0x00) << 13 are both zero and contribute nothing to the OR. The
   initialization then converts the integer's numeric value to float; it does
   not reinterpret the bit pattern as a floating-point encoding.
*/
float f = ((v1 << 6) | (0x00) << 3 | (v1 >> 2) | v2 | (0x00) << 13);
return f;
}
这是16位(2字节)结构
如果您要为我编写代码,请详细说明。我想了解位运算符和位放置的真实情况。
这是演示 16 位 floating-point 到 32 位 floating-point 转换的代码以及一个测试程序。测试程序需要Clang的__fp16
类型,但是转换代码不需要。未测试 NaN 有效负载和 signaling/non-signaling 语义的处理。
#include <stdint.h>
// Yield a 32-bit word in which only bit n is set. n must be less than 32.
#define Bit(n) ((uint32_t) 1 << (n))
// Yield a 32-bit word whose n lowest bits are set. n must be less than 32.
#define Mask(n) (Bit(n) - 1)
/*  Translate an IEEE-754 16-bit binary floating-point encoding into the
    IEEE-754 32-bit binary floating-point encoding of the same value.

    x is the 16-bit encoding: 1 sign bit (bit 15), 5 exponent bits (bits
    10-14), and 10 fraction bits (bits 0-9).

    The return value is the 32-bit encoding (1 sign bit, 8 exponent bits,
    23 fraction bits) as an integer, not as a float object.
*/
uint32_t Float16ToFloat32(uint16_t x)
{
    // Difference between the 32-bit exponent bias and the 16-bit one.
    enum { BiasDelta = 127 - 15 };

    // Pull the three fields out of the 16-bit encoding.
    uint32_t sign = x >> 15;
    uint32_t exponent = (x >> 10) & Mask(5);
    // Take the 10 fraction bits and left-justify them in a 23-bit field.
    uint32_t fraction = (uint32_t) (x & Mask(10)) << (23 - 10);

    if (exponent == 31)
    {
        // Infinity or NaN: use the all-ones 32-bit exponent code.
        exponent = 255;
    }
    else if (exponent != 0)
    {
        // Normal number: only the exponent bias changes.
        exponent += BiasDelta;
    }
    else if (fraction != 0)
    {
        /*  Subnormal number (zero needs no changes): start from the 32-bit
            exponent code that matches the 16-bit subnormal exponent, then
            shift the significand left, lowering the exponent each time,
            until its leading bit has moved just out of the 23-bit field.
        */
        for (exponent = 1 + BiasDelta; fraction < Bit(23); fraction <<= 1)
        {
            exponent -= 1;
        }
        // Drop the leading bit, which is implicit in a normal encoding.
        fraction &= Mask(23);
    }

    // Put the three fields together into the 32-bit encoding.
    return sign << 31 | exponent << 23 | fraction;
}
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/*  Test driver: runs every 16-bit encoding through Float16ToFloat32 and
    compares the result with the value the compiler's __fp16 type gives for
    the same encoding. On the first mismatch it prints both encodings and
    values and exits with EXIT_FAILURE; otherwise it returns EXIT_SUCCESS.
*/
int main(void)
{
    // Unions give access to both the raw encoding and the value it denotes.
    union { uint16_t enc; __fp16 value; } x;
    union { uint32_t enc; float value; } y;
    /*  Visit all 16-bit encodings. The counter is wider than 16 bits so that
        it can reach the bound Bit(16) and end the loop.
    */
    for (uint32_t i = 0; i < Bit(16); ++i)
    {
        x.enc = (uint16_t) i;
        y.enc = Float16ToFloat32(x.enc);
        /*  isnan only promises zero versus nonzero, so both results are
            normalized with ! before being compared; comparing the raw
            return values could report a mismatch between two NaNs.
            Non-NaN values must compare equal.
        */
        if (!isnan(x.value) != !isnan(y.value) ||
            (!isnan(x.value) && x.value != y.value))
        {
            printf("Failure:\n");
            printf("\tx encoding = 0x%04" PRIx16 ", value = %.99g.\n",
                x.enc, x.value);
            printf("\ty encoding = 0x%08" PRIx32 ", value = %.99g.\n",
                y.enc, y.value);
            exit(EXIT_FAILURE);
        }
    }
    return EXIT_SUCCESS;
}
正如 chtz 指出的那样,我们可以使用 32 位 floating-point 运算来处理正常值和次正常值的缩放调整。为此,请将 Float16ToFloat32 中 f <<= 23 - 10; 之后的代码替换为:
/* Replacement tail for Float16ToFloat32. At this point s is the sign bit,
   e is the 5-bit exponent code, and f is the fraction already shifted into
   the 23-bit field.
*/
// Infinities and NaNs (exponent code 31): emit the 32-bit exponent code 255
// and keep the shifted fraction bits as they are.
if (e == 31)
return s << 31 | 255 << 23 | f;
/* For finite values, reassemble with shifted fields, reusing the 16-bit
   exponent code unchanged, and use a floating-point multiply by 2^112
   (that is, 2^(127 - 15)) to adjust for the changed exponent bias.
   NOTE(review): a zero exponent code makes the assembled value a 32-bit
   subnormal, so this presumably relies on the multiply honoring subnormal
   operands (no flush-to-zero mode) -- confirm for the target.
*/
union { uint32_t enc; float value; } y = { .enc = s << 31 | e << 23 | f };
y.value *= 0x1p112f;
// Return the encoding of the rescaled value.
return y.enc;
虽然这个问题已经有了给出正确实现的回答,但转换还可以做得更快。这里提供了更快的 IEEE-754 FP32<->FP16 转换算法,没有任何循环或分支。这些算法能处理正常数和次正常数,并放弃对 NaN/Inf 的支持,以换取两倍的数值范围。