使用最接近的舍入从 IEEE-754 转换为定点

Converting from IEEE-754 to Fixed Point with nearest rounding

我正在 FPGA 中使用 S15.16 实现 IEEE 754 32 位到定点的转换器。 IEEE-754 标准将数字表示为:

其中s代表符号,exp是非规格化指数,m是尾数。所有这些值分别用定点表示。

嗯,最简单的方法是将 IEEE-754 值乘以 2**16。最后,四舍五入到最接近的位置,截断误差更小。

问题:我是在FPGA设备上做的,所以,不能这样做。

解决方案:使用值的二进制表示通过按位运算执行转换

从前面的表达式来看,在指数和尾数都在定点的条件下,逻辑告诉我可以这样执行:

因为 2 的幂是不动点的移位,可以将表达式重写为(使用 Verilog 表示法):

x_fixed = ({1'b1, m[22:7]}) << (exp - 126)

好的,这很完美,但不是所有时候...这里的问题是:如何应用最近的舍入?我已经进行了实验,看看在不同的范围内会发生什么。范围包含在 2 的幂内。我想说:

以此类推包含在以下 2 的幂中的值...当包含 1 到 2 的值时,我已经能够毫无问题地舍入 2 个以下位的行为在尾数中丢弃。这些位表明:

if 00: Rounding is not necessary
if 01 or 10: Adding one to the shifted mantissa
if 11: adding two to the shifted mantissa.

为了执行实验,我在 Python 中使用按位运算实现了一个最小的解决方案。代码是:

# Get the bits of sign, exponent and mantissa
def FLOAT_2_BIN(num):
    bits, = struct.unpack('!I', struct.pack('!f', num))
    N = "{:032b}".format(bits)
    a = N[0]        # sign
    b = N[1:9]      # exponent
    c = "1" + N[9:] # mantissa with hidden bit
    return {'sign': a, 'exp': b, 'mantissa': c}

# Convert the floating point value to fixed via
# bitwise operations
def FLOAT_2_FIXED(x):
    # Get the IEEE-754 bit representation
    IEEE754 = FLOAT_2_BIN(x)
    
    # Exponent minus 127 to normalize
    shift = int(IEEE754['exp'],2) - 126
    
    # Get 16 MSB from mantissa
    MSB_mnts = IEEE754['mantissa'][0:16]
    
    # Convert value from binary to int
    value = int(MSB_mnts, 2)
    
    # Get the rounding bits: similars to guard bits???
    rnd_bits = IEEE754['mantissa'][16:18]
            
    # Shifted value by exponent
    value_shift = value << shift
    
    # Control to rounding nearest
    # Only works with values from 1 to 2 
    if rnd_bits == '00':
        rnd = 0
    elif rnd_bits == '01' or rnd_bits == '10':
        rnd = 1
    else:
        rnd = 2
    return value_shift + rnd

值介于 0 和 1 之间的测试给出以下结果:

Test for values from 1 <= x < 2

 FLOAT 32 VALUE    16 MSB MANTISSA    THEORICAL FIXED    PRACTICAL FIXED    RND BITS    DIFS    4 LSB MANTISSA
----------------  -----------------  -----------------  -----------------  ----------  ------  ----------------
       1          1000000000000000         65536              65536            00        0           0000
      1.1         1000110011001100         72090              72090            11        0           1101
      1.2         1001100110011001         78643              78643            10        0           1010
      1.3         1010011001100110         85197              85197            01        0           0110
      1.4         1011001100110011         91750              91750            00        0           0011
      1.5         1100000000000000         98304              98304            00        0           0000
      1.6         1100110011001100        104858             104858            11        0           1101
      1.7         1101100110011001        111411             111411            10        0           1010
      1.8         1110011001100110        117965             117965            01        0           0110
      1.9         1111001100110011        124518             124518            00        0           0011

显然:如果我取的值的小数部分是 2 的幂的倍数,则不需要四舍五入:

In this case the values have an increment of 1/32

FLOAT 32 VALUE    16 MSB MANTISSA    THEORICAL FIXED    PRACTICAL FIXED    RND BITS    DIFS    4 LSB MANTISSA
----------------  -----------------  -----------------  -----------------  ----------  ------  ----------------
       10         1010000000000000        655360             655360            00        0           0000
    10.0312       1010000010000000        657408             657408            00        0           0000
    10.0625       1010000100000000        659456             659456            00        0           0000
    10.0938       1010000110000000        661504             661504            00        0           0000
     10.125       1010001000000000        663552             663552            00        0           0000
    10.1562       1010001010000000        665600             665600            00        0           0000
    10.1875       1010001100000000        667648             667648            00        0           0000
    10.2188       1010001110000000        669696             669696            00        0           0000
     10.25        1010010000000000        671744             671744            00        0           0000
    10.2812       1010010010000000        673792             673792            00        0           0000
    10.3125       1010010100000000        675840             675840            00        0           0000
    10.3438       1010010110000000        677888             677888            00        0           0000
     10.375       1010011000000000        679936             679936            00        0           0000
    10.4062       1010011010000000        681984             681984            00        0           0000
    10.4375       1010011100000000        684032             684032            00        0           0000
    10.4688       1010011110000000        686080             686080            00        0           0000
      10.5        1010100000000000        688128             688128            00        0           0000
    10.5312       1010100010000000        690176             690176            00        0           0000
    10.5625       1010100100000000        692224             692224            00        0           0000
    10.5938       1010100110000000        694272             694272            00        0           0000
     10.625       1010101000000000        696320             696320            00        0           0000
    10.6562       1010101010000000        698368             698368            00        0           0000
    10.6875       1010101100000000        700416             700416            00        0           0000
    10.7188       1010101110000000        702464             702464            00        0           0000
     10.75        1010110000000000        704512             704512            00        0           0000
    10.7812       1010110010000000        706560             706560            00        0           0000
    10.8125       1010110100000000        708608             708608            00        0           0000
    10.8438       1010110110000000        710656             710656            00        0           0000
     10.875       1010111000000000        712704             712704            00        0           0000
    10.9062       1010111010000000        714752             714752            00        0           0000
    10.9375       1010111100000000        716800             716800            00        0           0000
    10.9688       1010111110000000        718848             718848            00        0           0000

但是,如果 2 <= x < 4 并且增量不是 2 的幂的倍数:

Test for values from 2 <= x < 4. Increment is 0.1
Here, I am not applying the rounding in order to show how the rounding error 
increase with the exponent. e.g: shift**2 - 1, where shift is exponent - 126

FLOAT 32 VALUE    16 MSB MANTISSA    THEORICAL FIXED    PRACTICAL FIXED    RND BITS    DIFS    4 LSB MANTISSA
----------------  -----------------  -----------------  -----------------  ----------  ------  ----------------
       2          1000000000000000        131072             131072            00        0           0000
      2.1         1000011001100110        137626             137624            01        -2          0110
      2.2         1000110011001100        144179             144176            11        -3          1101
      2.3         1001001100110011        150733             150732            00        -1          0011
      2.4         1001100110011001        157286             157284            10        -2          1010
      2.5         1010000000000000        163840             163840            00        0           0000
      2.6         1010011001100110        170394             170392            01        -2          0110
      2.7         1010110011001100        176947             176944            11        -3          1101
      2.8         1011001100110011        183501             183500            00        -1          0011
      2.9         1011100110011001        190054             190052            10        -2          1010
       3          1100000000000000        196608             196608            00        0           0000
      3.1         1100011001100110        203162             203160            01        -2          0110
      3.2         1100110011001100        209715             209712            11        -3          1101
      3.3         1101001100110011        216269             216268            00        -1          0011
      3.4         1101100110011001        222822             222820            10        -2          1010
      3.5         1110000000000000        229376             229376            00        0           0000
      3.6         1110011001100110        235930             235928            01        -2          0110
      3.7         1110110011001100        242483             242480            11        -3          1101
      3.8         1111001100110011        249037             249036            00        -1          0011
      3.9         1111100110011001        255590             255588            10        -2          1010

很明显四舍五入是不正确的,而且我发现定点的最大四舍五入误差总是2**shift - 1

有什么想法或建议吗?我认为这里的问题是我没有考虑保护位:GSR,但另一方面,如果实际上问题是这样的:当必要的舍入高于 1 时会发生什么,例如:2、3、4...?

下面的 ISO-C99 代码演示了一种可能的转换方式。 binary32 参数的有效数(尾数)位构成 s15.16 结果的位。指数位告诉我们是否需要将这些位右移或左移以将最低有效整数位移动到位 16。如果需要左移,则不需要舍入。如果需要右移,我们需要捕获丢弃的任何不太重要的位。最重要的丢弃位是 round 位,所有其他位共同表示 sticky 位。使用舍入模式的字面定义,如果 (1) 设置了舍入位和粘性位,或者 (2) 设置了舍入位并且清除了粘性位(即,我们有平局例),但中间结果的最低有效位是奇数。

请注意,实际的硬件实现通常会偏离舍入模式逻辑的这种文字应用。一种常见的方案是在设置 round 位时首先递增结果。然后,如果出现这样的增量,如果未设置 sticky 位,则清除结果的最低有效位。不难看出,这里枚举了round bit、sticky bit、result LSB所有可能的组合,达到了同样的效果。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define USE_LITERAL_RND_DEF  (1)

uint32_t float_as_uint32 (float a)
{
    uint32_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

#define FP32_MANT_FRAC_BITS  (23)
#define FP32_EXPO_BITS       (8)
#define FP32_EXPO_MASK       ((1u << FP32_EXPO_BITS) - 1)
#define FP32_MANT_MASK       ((1u << FP32_MANT_FRAC_BITS) - 1)
#define FP32_MANT_INT_BIT    (1u << FP32_MANT_FRAC_BITS)
#define FP32_SIGN_BIT        (1u << (FP32_MANT_FRAC_BITS + FP32_EXPO_BITS))
#define FP32_EXPO_BIAS       (127)
#define FX15P16_FRAC_BITS    (16)
#define FRAC_BITS_DIFF       (FP32_MANT_FRAC_BITS - FX15P16_FRAC_BITS)

int32_t fp32_to_fixed (float a)
{
    /* split binary32 operand into constituent parts */
    uint32_t ia = float_as_uint32 (a);
    uint32_t expo = (ia >> FP32_MANT_FRAC_BITS) & FP32_EXPO_MASK;
    uint32_t mant = expo ? ((ia & FP32_MANT_MASK) | FP32_MANT_INT_BIT) : 0;
    int32_t sign = ia & FP32_SIGN_BIT;
    /* compute and clamp shift count */
    int32_t shift = (expo - FP32_EXPO_BIAS) - FRAC_BITS_DIFF;
    shift = (shift < (-31)) ? (-31) : shift;
    shift = (shift > ( 31)) ? ( 31) : shift;
    /* shift left or right so least significant integer bit becomes bit 16 */
    uint32_t shifted_right = mant >> (-shift);
    uint32_t shifted_left = mant << shift;
    /* capture discarded bits if right shift */
    uint32_t discard = mant << (32 + shift);
    /* round to nearest or even if right shift */
    uint32_t round = (discard & 0x80000000) ? 1 : 0;
    uint32_t sticky = (discard & 0x7fffffff) ? 1 : 0;
#if USE_LITERAL_RND_DEF
    uint32_t odd = shifted_right & 1;
    shifted_right = (round & (sticky | odd)) ? (shifted_right + 1) : shifted_right;
#else // USE_LITERAL_RND_DEF
    shifted_right = (round) ? (shifted_right + 1) : shifted_right;
    shifted_right = (round & ~sticky) ? (shifted_right & ~1) : shifted_right;
#endif // USE_LITERAL_RND_DEF
    /* make final selection between left shifted and right shifted */
    int32_t res = (shift < 0) ? shifted_right : shifted_left;
    /* negate if negative */
    return (sign < 0) ? (-res) : res;
}

int main (void)
{
    int32_t res, ref;
    float x;

    printf ("IEEE-754 binary32 to S15.16 fixed-point conversion in RNE mode\n");
    printf ("use %s implementation of round to nearest or even\n",
            USE_LITERAL_RND_DEF ? "literal" : "alternate");

    /* test positive half-plane */
    x = 0.0f;
    while (x < 0x1.0p15f) {
        ref = (int32_t) rint ((double)x * 65536);
        res = fp32_to_fixed(x);
        if (res != ref) {
            printf ("error @ x = % 14.6a: res=%08x ref=%08x\n", x, res, ref);
            printf ("Test FAILED\n");
            return EXIT_FAILURE;
        }
        x = nextafterf (x, INFINITY);
    }
    
    /* test negative half-plane */
    x = -1.0f * 0.0f;
    while (x >= -0x1.0p15f) {
        ref = (int32_t) rint ((double)x * 65536);
        res = fp32_to_fixed(x);
        if (res != ref) {
            printf ("error @ x = % 14.6a: res=%08x ref=%08x\n", x, res, ref);
            printf ("Test FAILED\n");
            return EXIT_FAILURE;
        }
        x = nextafterf (x, -INFINITY);
    }
    printf ("Test PASSED\n");
    return EXIT_SUCCESS;
}