如何在支持 FMA 的 GPU 上改进浮点除法?

How to refine floating-point division on FMA-capable GPUs?

当使用 APIs 为 GPU 编写计算代码时,计算着色器是通过 SPIR-V(特别是 Vulkan)转换的,我保证浮点除法的 ULP 错误最多为 3 .其他基本算术(加法、乘法)正确四舍五入。

如何在这些情况下有效地正确实施舍入除法?假设 FMA 可用且正确舍入。

非正规化会发生什么取决于底层硬件。 Vulkan API 允许查询设备是否可以保留非正规化以及是否可以将它们刷新为零(因此不完全支持非正规化的 GPU 将具有“canPreserve: false, canFlush: true”)。因此,让我们另外假设 GPU 可以产生和处理非正规化,而不会将它们刷新为零(否则试图产生一个正确的四舍五入的结果是次正规的似乎是徒劳的)。

现代基于 FMA 的除法算法通常在很大程度上依赖于广泛研究此问题的 Peter Markstein 的工作。规范参考是:

Peter Markstein,“IA-64 和初等函数:速度和精度”,Prentice-Hall 2000。

下面的 C 代码也是基于 Markstein 的作品。它基本上由一条尽可能快地处理常见情况的快速路径和一条处理所有讨厌的特殊情况的慢速路径组成。

涉及的概念非常简单,但确实需要仔细的数学分析以确保正确的舍入结果。第一步是计算除数的准确倒数。这需要精确计算近似误差,FMA 是最好的工具。来自(基于硬件的)近似的倒数的细化通常采用具有二次收敛的单个 Newton-Raphson 迭代或具有三次收敛的单个 Halley 迭代的形式,两者都完美映射到 FMA。

将除数的倒数乘以被除数就可以得到商的近似值。这同样基于基于 FMA 的残差计算进行了改进。在这个阶段,人们通常会使用一种趋向统一的被除数版本,以防止中间计算中的上溢和下溢,并允许在除法算法的快速路径中使用尽可能广泛的操作数。这也意味着最后需要一个最终的缩放步骤来调整商的大小。

代码中两条路径的一个有利安排是,先无条件地执行快路径计算,最后检查是否满足快路径使用条件,如果不满足,则调用慢路径代码。这允许与快速路径计算同时进行必要条件的计算,并允许慢速路径计算的概述,这有助于将该代码放置在很少使用的内存页面中,其中各种收集了慢速路径或异常处理代码。

请将下面的代码视为算法草图。包含的基于随机测试向量的“冒烟”级测试远非严格的测试框架。调用慢速路径的条件既没有被证明是尽可能严格的,也不是必要的严格的,也不是最有效的安排。倒数的细化使用单个 Newton-Raphson 步骤,但这对于给定 GPU 的倒数近似可能是不够的,并且可能需要以额外的 FMA 为代价来替代 Halley 迭代。最后,我省略了整个慢速路径的构造,因为这会超出 Whosebug 答案的限制,无论是在设计工作量还是文本描述量方面。

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

float uint32_as_float (uint32_t a)
{
    float r;
    memcpy (&r, &a, sizeof r);
    return r;
}

uint32_t float_as_uint32 (float a)
{
    uint32_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

float rcp_approx (float a)
{
    return 1.0f / a; // fake it for now
}

float fp32_div_slowpath (float dividend, float divisor)
{
    return dividend / divisor; // fake it for now
}

float fp32_div (float dividend, float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_HI_LIMIT = 0x7e800000u; // 0x1.0p+126
    const uint32_t FP32_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx (divisor); 
    float recip_err = fmaf (recip_approx, -divisor, 1.0f);
    float dividend_mant = uint32_as_float ((float_as_uint32 (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = uint32_as_float (float_as_uint32 (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_recip = fmaf (recip_approx, recip_err, recip_approx);
    float quotient_mant = refined_recip * dividend_mant;
    float quotient_residual = fmaf (quotient_mant, -divisor, dividend_mant);
    float refined_quotient_mant = fmaf (refined_recip, quotient_residual, quotient_mant);
    float refined_quotient_residual = fmaf (refined_quotient_mant, -divisor, dividend_mant);
    float final_quotient_mant = fmaf (refined_recip, refined_quotient_residual, refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t id = float_as_uint32 (divisor) & ~FP32_SIGN_MASK;
    uint32_t iq = float_as_uint32 (final_quotient) & ~FP32_SIGN_MASK;
    if ((id > FP32_HI_LIMIT) || ((iq - FP32_LO_LIMIT) > (FP32_HI_LIMIT - FP32_LO_LIMIT))) {
        final_quotient = fp32_div_slowpath (dividend, divisor);
    }

    return final_quotient;
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)

int main (void)
{
    uint64_t count = 0;
    float divisor, dividend, res, ref;

    do {
        if (!(count & 0xffffffULL)) printf ("\rcount: %llu", count);

        dividend = uint32_as_float (KISS);
        divisor  = uint32_as_float (KISS);
        res = fp32_div (dividend, divisor);
        ref = dividend / divisor;
        if (float_as_uint32 (res) != float_as_uint32 (ref)) {
            printf ("error @ dividend=% 16.6a divisor=% 16.6a (% 15.8e, % 15.8e): res=% 16.6a ref=% 16.6a (% 15.8e, % 15.8e)\n", 
                    dividend, divisor, dividend, divisor, res, ref, res, ref);
            return EXIT_FAILURE;
        }

        count++;
    } while (count);
    return EXIT_SUCCESS;
}

重新认识这个问题后的一些补充说明。如果硬件的倒数逼近指令将次正规值视为零是有利的。这允许在调用慢速路径代码时消除除数大小检查,因为任何大于 2126 的除数将导致零的倒数和无穷大的商,因此触发商检查。

至于商检查,任何表示法线的快速路径结果都可以接受,但最小法线除外,它可能表示来自次正规结果的不正确舍入。也就是说,绝对值在[0x1.000002p-126,0x1.fffffep+127]中的商不应该被慢路径代码触发重新计算。

只要使用足够准确的倒数,就不需要商细化步骤。涵盖 [1,2] 中的所有被除数和 [1,2] 中的所有除数以及有效数(尾数)模式的所有可能组合的详尽测试对于现代硬件是可行的,需要 246 测试用例。即使在单个 CPU 核心上使用标量代码 运行,它也能在不到四天内完成,并且没有报告任何错误。

在实际使用中,如果可能,人们会希望强制编译器内联 fp32_div(),同时强制 fp_div_slowpath() 进入被调用的子程序。

下面我使用上面讨论的简化方法制作了一个基于 AVX 的单精度除法版本的原型。它通过了我所有的测试。由于基于 AVX 的硬件倒数的精度较低,因此需要哈雷迭代将倒数细化为完全单精度,从而提供最大误差接近 0.5 ulp 的结果。

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "immintrin.h"

#define MODE_RANDOM           (1)
#define MODE_MANT_EXHAUSTIVE  (2)
#define TEST_MODE             (MODE_RANDOM)

float uint32_as_float (uint32_t a) { float r; memcpy (&r, &a, sizeof r); return r; }
uint32_t float_as_uint32 (float a) { uint32_t r; memcpy (&r, &a, sizeof r); return r; }
float fp32_div_slowpath (float dividend, float divisor) { return dividend / divisor; }

/* 12-bit approximation. Subnormal arguments and results are treated as zero */
float rcp_approx_avx (float divisor)
{
    float r;
    __m256 t = _mm256_set1_ps (divisor);
    t = _mm256_rcp_ps (t);
    memcpy (&r, &t, sizeof r);
    return r;
}

float fp32_div (float dividend, float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_QUOTIENT_HI_LIMIT = 0x7f7fffffu; // 0x1.0p+128 - ulp
    const uint32_t FP32_QUOTIENT_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx_avx (divisor);
    float recip_err = fmaf (recip_approx, -divisor, 1.0f);
    float recip_err2 = fmaf (recip_err, recip_err, recip_err);
    float refined_recip = fmaf (recip_approx, recip_err2, recip_approx);
    float dividend_mant = uint32_as_float ((float_as_uint32 (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = uint32_as_float (float_as_uint32 (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_quotient_mant = refined_recip * dividend_mant;
    float refined_quotient_residual = fmaf (refined_quotient_mant, -divisor, dividend_mant);
    float final_quotient_mant = fmaf (refined_recip, refined_quotient_residual, refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t iq = float_as_uint32 (final_quotient) & ~FP32_SIGN_MASK;
    if ((iq - FP32_QUOTIENT_LO_LIMIT) > (FP32_QUOTIENT_HI_LIMIT - FP32_QUOTIENT_LO_LIMIT)) {
        final_quotient = fp32_div_slowpath (dividend, divisor);
    }
    return final_quotient;
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)

int main (void)
{
    uint64_t count = 0;
    float divisor, dividend, res, ref;

#if TEST_MODE == MODE_RANDOM
    do {
        if (!(count & 0xffffffULL)) printf ("\rcount: %llu", count);

        dividend = uint32_as_float (KISS);
        divisor  = uint32_as_float (KISS);
        res = fp32_div (dividend, divisor);
        ref = dividend / divisor;
        if (float_as_uint32 (res) != float_as_uint32 (ref)) {
            printf ("error @ dividend=% 16.6a divisor=% 16.6a (% 15.8e, % 15.8e): res=% 16.6a ref=% 16.6a (% 15.8e, % 15.8e)\n", 
                    dividend, divisor,  dividend, divisor, res, ref, res, ref);
            return EXIT_FAILURE;
        }

        count++;
    } while (count);
#else
    for (dividend = 1.0f; dividend <= 2.0f; dividend = uint32_as_float (float_as_uint32 (dividend) + 1)) {
        printf ("\rdividend=%08x", float_as_uint32 (dividend));
        for (divisor = 1.0f; divisor <= 2.0f; divisor = uint32_as_float (float_as_uint32 (divisor) + 1)) {
            res = fp32_div (dividend, divisor);
            ref = dividend / divisor;
            if (float_as_uint32 (res) != float_as_uint32 (ref)) {
                printf ("error @ dividend=% 16.6a divisor=% 16.6a (% 15.8e, % 15.8e): res=% 16.6a ref=% 16.6a (% 15.8e, % 15.8e)\n", 
                        dividend, divisor,  dividend, divisor, res, ref, res, ref);
                return EXIT_FAILURE;
            }
        }
    }
#endif

    return EXIT_SUCCESS;
}

NVIDIA GPU 的相应 CUDA 代码(已测试)如下所示:

__noinline__ __device__ float fp32_div_slowpath (float dividend, float divisor) 
{ 
    return dividend / divisor; 
}

/* Subnormal arguments and results are treated as zero */
__forceinline__ __device__ float rcp_approx_gpu (float divisor)
{
    float r;
    asm ("rcp.approx.ftz.f32 %0,%1;\n\t" : "=f"(r) : "f"(divisor));
    return r;
}

__forceinline__ __device__ float fp32_div (float dividend, float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_QUOTIENT_HI_LIMIT = 0x7f7fffffu; // 0x1.0p+128 - ulp
    const uint32_t FP32_QUOTIENT_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx_gpu (divisor);
    float recip_err = fmaf (recip_approx, -divisor, 1.0f);
    float refined_recip = fmaf (recip_approx, recip_err, recip_approx);
    float dividend_mant = __int_as_float ((__float_as_int (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = __int_as_float (__float_as_int (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_quotient_mant = refined_recip * dividend_mant;
    float refined_quotient_residual = fmaf (refined_quotient_mant, -divisor, dividend_mant);
    float final_quotient_mant = fmaf (refined_recip, refined_quotient_residual, refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t iq = __float_as_int (final_quotient) & ~FP32_SIGN_MASK;
    if ((iq - FP32_QUOTIENT_LO_LIMIT) > (FP32_QUOTIENT_HI_LIMIT - FP32_QUOTIENT_LO_LIMIT)) {
        final_quotient = fp32_div_slowpath (dividend, divisor);
    }
    return final_quotient;
}