是否可以在 w/o 获得二进制表示后实现 next?

Is it possible to implement nextafter w/o obtaining a binary representation?

通常nextafter是这样实现的:

double nextafter(double x, double y)
{
    // handle corner cases

    int delta = ((x > 0) == (x < y)) ? 1 : -1;
    unsigned long long mant = __mant(x);  // get mantissa as int
    mant += delta;
    ...
}

这里使用__mant(x).

得到二进制表示

出于好奇:是否可以在不获得二进制表示的情况下实现 nextafter?例如,使用算术浮点运算序列。

考虑 FP may/may 不支持次法线、无穷大、大多数值具有唯一值(例如,对 double 使用 2 float)、支持 +/= 0、不支持对 FP 编码的深入了解,使用像 mant += delta; 这样的假设是 next 值会导致可移植性失败 - 即使在使用二进制操作时也是如此。仅使用 FP 操作需要对 FP 编码进行许多假设。

我认为更有用的方法是 post 使用“一系列算术浮点运算”的候选代码,然后询问 1) 它在哪些条件下失败 2) 如何改进?

下面的代码在 IEEE-754 算法的有限值的升序方向上实现 nextafter 舍入到最近的关系到偶数。 NaN、无穷大和下降方向的处理是显而易见的。

在不假设 IEEE-754 或舍入到最近的情况下,C 2018 5.2.4.2.2 充分描述了浮点属性,我们可以通过这种方式实现 nextafter(同样在升序方向):

  • 如果输入是 NaN,return 它,如果是信号 NaN,则报告错误。
  • 如果输入为-∞,return -DBL_MAX.
  • 如果输入是 -DBL_TRUE_MIN,return 零。
  • 如果输入为零,return +DBL_TRUE_MIN.
  • 如果输入是+DBL_MAX, return +∞.
  • 如果输入是+∞,return+∞。 (请注意,完整的 nextafter(x, y) 实现不会发生这种情况,因为它将第一个参数移向第二个参数的方向,因此我们永远不会从 +∞ 上升,因为我们永远不会收到大于 +∞ 的第二个参数。)
  • 否则,如果是正数,使用logb求指数e。如果 e 小于 DBL_MIN,则 return 输入加上 DBL_TRUE_MIN(次正规的 ULP 和最低的正规)。如果e不小于DBL_MIN,return输入加scalb(1, e + 1 - DBL_MANT_DIG)(输入的具体ULP)。舍入方法无关紧要,因为这些加法是精确的。
  • 否则,输入为负数。使用上面的方法,除非输入恰好是 FLT_RADIX 的幂(输入等于 scalb(1, e)),将 scalb 的第二个参数减一(因为这个 nextafter 步骤转换从更大的指数到更低的指数)。

注意上面的FLT_RADIX是正确的;没有DBL_RADIX;所有浮点格式都使用相同的基数。

如果您想将logbscalb视为操作浮点表示的函数,则可以将它们替换为普通算术。 log 可以找到一个可以快速细化为真实指数的快速近似值,并且 scalb 可以通过多种方式实现,可能只是 table 查找。如果 log 仍然令人反感,那么试验比较就足够了。

以上处理有或没有次正规的格式,因为如果支持次正规,它会以递减量进入它们,如果不支持次正规,最小法线幅度为 DBL_TRUE_MIN,所以它是在上面识别为我们下一步要归零的点。

有一个警告; C 标准允许“不确定”一个实现是否支持次正规,“如果浮点运算不一致地将次正规表示解释为零,也不解释为非零。”在那种情况下,我没有看到标准指定标准 nextafter 的作用,因此我们无需在实现中匹配它。假设有时支持次正规,DBL_TRUE_MIN 必须是次正规值,并且上面将尝试工作,就好像次正规支持当前处于打开状态(例如,刷新为零已关闭),如果关闭,你会得到你所得到的。

#include <float.h>
#include <math.h>


/*  Return the next floating-point value after the finite value q.

    This was inspired by Algorithm 3.5 in Siegfried M. Rump, Takeshi Ogita, and
    Shin'ichi Oishi, "Accurate Floating-Point Summation", _Technical Report
    05.12_, Faculty for Information and Communication Sciences, Hamburg
    University of Technology, November 13, 2005.
    
    IEEE-754 and the default rounding mode,
    round-to-nearest-ties-to-even, may be required.
*/
double NextAfter(double q)
{
    /*  Scale is .625 ULP, so multiplying it by any significand in [1, 2)
        yields something in [.625 ULP, 1.25 ULP].
    */
    static const double Scale = 0.625 * DBL_EPSILON;

    /*  Either of the following may be used, according to preference and
        performance characteristics.  In either case, use a fused multiply-add
        (fma) to add to q a number that is in [.625 ULP, 1.25 ULP].  When this
        is rounded to the floating-point format, it must produce the next
        number after q.
    */
#if 0
    // SmallestPositive is the smallest positive floating-point number.
    static const double SmallestPositive = DBL_EPSILON * DBL_MIN;

    if (fabs(q) < 2*DBL_MIN)
        return q + SmallestPositive;

    return fma(fabs(q), Scale, q);
#else
    return fma(fmax(fabs(q), DBL_MIN), Scale, q);
#endif
}


#if defined CompileMain


#include <stdio.h>
#include <stdlib.h>


#define NumberOf(a) (sizeof (a) / sizeof *(a))


int main(void)
{
    int status = EXIT_SUCCESS;

    static const struct { double in, out; } cases[] =
    {
        {  INFINITY,                 INFINITY                },
        {  0x1.fffffffffffffp1023,   INFINITY                },
        {  0x1.ffffffffffffep1023,   0x1.fffffffffffffp1023  },
        {  0x1.ffffffffffffdp1023,   0x1.ffffffffffffep1023  },
        {  0x1.ffffffffffffcp1023,   0x1.ffffffffffffdp1023  },
        {  0x1.0000000000003p1023,   0x1.0000000000004p1023  },
        {  0x1.0000000000002p1023,   0x1.0000000000003p1023  },
        {  0x1.0000000000001p1023,   0x1.0000000000002p1023  },
        {  0x1.0000000000000p1023,   0x1.0000000000001p1023  },

        {  0x1.fffffffffffffp1022,   0x1.0000000000000p1023  },

        {  0x1.fffffffffffffp1,      0x1.0000000000000p2     },
        {  0x1.ffffffffffffep1,      0x1.fffffffffffffp1     },
        {  0x1.ffffffffffffdp1,      0x1.ffffffffffffep1     },
        {  0x1.ffffffffffffcp1,      0x1.ffffffffffffdp1     },
        {  0x1.0000000000003p1,      0x1.0000000000004p1     },
        {  0x1.0000000000002p1,      0x1.0000000000003p1     },
        {  0x1.0000000000001p1,      0x1.0000000000002p1     },
        {  0x1.0000000000000p1,      0x1.0000000000001p1     },

        {  0x1.fffffffffffffp-1022,  0x1.0000000000000p-1021 },
        {  0x1.ffffffffffffep-1022,  0x1.fffffffffffffp-1022 },
        {  0x1.ffffffffffffdp-1022,  0x1.ffffffffffffep-1022 },
        {  0x1.ffffffffffffcp-1022,  0x1.ffffffffffffdp-1022 },
        {  0x1.0000000000003p-1022,  0x1.0000000000004p-1022 },
        {  0x1.0000000000002p-1022,  0x1.0000000000003p-1022 },
        {  0x1.0000000000001p-1022,  0x1.0000000000002p-1022 },
        {  0x1.0000000000000p-1022,  0x1.0000000000001p-1022 },

        {  0x0.fffffffffffffp-1022,  0x1.0000000000000p-1022 },
        {  0x0.ffffffffffffep-1022,  0x0.fffffffffffffp-1022 },
        {  0x0.ffffffffffffdp-1022,  0x0.ffffffffffffep-1022 },
        {  0x0.ffffffffffffcp-1022,  0x0.ffffffffffffdp-1022 },
        {  0x0.0000000000003p-1022,  0x0.0000000000004p-1022 },
        {  0x0.0000000000002p-1022,  0x0.0000000000003p-1022 },
        {  0x0.0000000000001p-1022,  0x0.0000000000002p-1022 },
        {  0x0.0000000000000p-1022,  0x0.0000000000001p-1022 },

        { -0x1.fffffffffffffp1023,  -0x1.ffffffffffffep1023  },
        { -0x1.ffffffffffffep1023,  -0x1.ffffffffffffdp1023  },
        { -0x1.ffffffffffffdp1023,  -0x1.ffffffffffffcp1023  },
        { -0x1.0000000000004p1023,  -0x1.0000000000003p1023  },
        { -0x1.0000000000003p1023,  -0x1.0000000000002p1023  },
        { -0x1.0000000000002p1023,  -0x1.0000000000001p1023  },
        { -0x1.0000000000001p1023,  -0x1.0000000000000p1023  },

        { -0x1.0000000000000p1023,  -0x1.fffffffffffffp1022  },

        { -0x1.0000000000000p2,     -0x1.fffffffffffffp1     },
        { -0x1.fffffffffffffp1,     -0x1.ffffffffffffep1     },
        { -0x1.ffffffffffffep1,     -0x1.ffffffffffffdp1     },
        { -0x1.ffffffffffffdp1,     -0x1.ffffffffffffcp1     },
        { -0x1.0000000000004p1,     -0x1.0000000000003p1     },
        { -0x1.0000000000003p1,     -0x1.0000000000002p1     },
        { -0x1.0000000000002p1,     -0x1.0000000000001p1     },
        { -0x1.0000000000001p1,     -0x1.0000000000000p1     },

        { -0x1.0000000000000p-1021, -0x1.fffffffffffffp-1022 },
        { -0x1.fffffffffffffp-1022, -0x1.ffffffffffffep-1022 },
        { -0x1.ffffffffffffep-1022, -0x1.ffffffffffffdp-1022 },
        { -0x1.ffffffffffffdp-1022, -0x1.ffffffffffffcp-1022 },
        { -0x1.0000000000004p-1022, -0x1.0000000000003p-1022 },
        { -0x1.0000000000003p-1022, -0x1.0000000000002p-1022 },
        { -0x1.0000000000002p-1022, -0x1.0000000000001p-1022 },
        { -0x1.0000000000001p-1022, -0x1.0000000000000p-1022 },

        { -0x1.0000000000000p-1022, -0x0.fffffffffffffp-1022 },
        { -0x0.fffffffffffffp-1022, -0x0.ffffffffffffep-1022 },
        { -0x0.ffffffffffffep-1022, -0x0.ffffffffffffdp-1022 },
        { -0x0.ffffffffffffdp-1022, -0x0.ffffffffffffcp-1022 },
        { -0x0.0000000000004p-1022, -0x0.0000000000003p-1022 },
        { -0x0.0000000000003p-1022, -0x0.0000000000002p-1022 },
        { -0x0.0000000000002p-1022, -0x0.0000000000001p-1022 },
        { -0x0.0000000000001p-1022, -0x0.0000000000000p-1022 },
    };

    for (int i = 0; i < NumberOf(cases); ++i)
    {
        double in = cases[i].in, expected = cases[i].out;
        double observed = NextAfter(in);
        printf("NextAfter(%a) = %a.\n", in, observed);
        if (! (observed == expected))
        {
            printf("\tError, expected %a.\n", expected);
            status = EXIT_FAILURE;
        }
    }

    return status;
}


#endif  // defined CompileMain

据我所知,这只有在融合乘加 (FMA) 可用时才有可能。对于我下面的示例 ISO-C99 实现,我使用 float 映射到 IEEE-754 binary32 因为这样我可以获得更好的测试覆盖率。假设 C 的 IEEE-754 绑定有效(因此 C 浮点类型绑定到 IEEE-754 二进制类型,支持次正规等),有效的舍入模式是舍入到最近或偶数, 和 ISO-C 标准 (FE_INEXACT, FE_OVERFLOW, FE_UNDERFLOW) 指定的异常信号要求被放弃(传递屏蔽响应)。

代码可能比需要的更复杂;我简单的把各个操作数类分离出来,一个一个处理。我使用了具有最严格的浮点设置的英特尔编译器进行编译。来自英特尔数学库的 nextafterf() 的实现被用作黄金参考。我认为我的实现中存在与英特尔库实现中的错误相匹配的错误的可能性很小,但显然并非不可能。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <float.h>
#include <string.h>
#include <fenv.h>
#include <math.h>

float my_nextafterf (float a, float b)
{
    const float FP32_MIN_NORMAL = 0x1.000000p-126f;
    const float FP32_MAX_NORMAL = 0x1.fffffep+127f;
    const float FP32_EPSILON = 0x1.0p-23f;
    const float FP32_ONE = 1.0f;
    const float FP32_HALF = 0.5f;
    const float FP32_SUBNORM_SCALE = FP32_ONE / FP32_MIN_NORMAL;
    const float FP32_INC = (FP32_ONE + FP32_EPSILON) * FP32_EPSILON * FP32_HALF;
    const float FP32_INT_SCALE = FP32_ONE / FP32_EPSILON;

    float r;
    if ((!(fabsf(a) <= INFINITY)) || (!(fabsf(b) <= INFINITY))) { // unordered
        r = a + b;
    }
    else if (a == b) { // equal
        r = b;
    }
    else if (fabsf (a) == INFINITY) { // infinity
        r = (a >= 0) ? FP32_MAX_NORMAL : (-FP32_MAX_NORMAL);
    }
    else if (fabsf (a) >= FP32_MIN_NORMAL) { // normal
        float factor = ((a >= 0) != (a > b)) ? FP32_INC : (-FP32_INC);
        r = fmaf (factor, a, a);
    } else { // subnormal or zero
        float scal = (a >= 0) ? FP32_INT_SCALE : (-FP32_INT_SCALE);
        float incr = ((a >= 0) != (a > b)) ? FP32_ONE : (-FP32_ONE);
        r = (a * scal * FP32_SUBNORM_SCALE + incr) / scal / FP32_SUBNORM_SCALE;
    }
    return r;
}

float uint32_as_float (uint32_t a)
{
    float r;
    memcpy (&r, &a, sizeof r);
    return r;
}

uint32_t float_as_uint32 (float a)
{
    uint32_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static uint32_t kiss_z = 362436069;
static uint32_t kiss_w = 521288629;
static uint32_t kiss_jsr = 123456789;
static uint32_t kiss_jcong = 380116160;
#define znew (kiss_z=36969*(kiss_z&0xffff)+(kiss_z>>16))
#define wnew (kiss_w=18000*(kiss_w&0xffff)+(kiss_w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (kiss_jsr^=(kiss_jsr<<13),kiss_jsr^=(kiss_jsr>>17),kiss_jsr^=(kiss_jsr<<5))
#define CONG (kiss_jcong=69069*kiss_jcong+13579)
#define KISS ((MWC^CONG)+SHR3)

int main (void)
{
    float a, b, res, ref;
    uint32_t ia, ib, ires, iref;
    unsigned long long int count = 0;
    const uint32_t FP32_QNAN_BIT = 0x00400000;
    const uint32_t FP32_SIGN_BIT = 0x80000000;
    const uint32_t FP32_INFINITY = 0x7f800000;
    
    printf ("Testing nextafterf()\n");

    while (1) {
        ia = KISS;
        ib = KISS;
        /* increase likelihood of zeros, infinities, equality */
        if (!(KISS & 0xfff)) ia = ia & FP32_SIGN_BIT;
        if (!(KISS & 0xfff)) ib = ib & FP32_SIGN_BIT;
        if (!(KISS & 0xfff)) ia = ia | FP32_INFINITY;
        if (!(KISS & 0xfff)) ib = ib | FP32_INFINITY;
        if (!(KISS & 0xffffff)) ib = ia;
        a = uint32_as_float (ia);
        b = uint32_as_float (ib);

        res = my_nextafterf (a, b);
        ref = nextafterf (a, b);
        ires = float_as_uint32 (res);
        iref = float_as_uint32 (ref);

        if (ires != iref) {
            /* if both 'from' and 'to' are NaN, result may be either NaN, quietened */
            if (!(isnan (a) && isnan (b) && 
                  ((ires == (ia | FP32_QNAN_BIT)) || (ires == (ib | FP32_QNAN_BIT))))) {
                printf ("error: a=%08x b=%08x  res=%08x  ref=%08x\n", ia, ib, ires, iref);
            }
        }
        count++;
        if (!(count & 0xffffff)) printf ("\rcount = 0x%llx", count);
    }
    return EXIT_SUCCESS;
}