如何在 OpenGL ES 中使用两个浮点数模拟双精度？

Question

我正在为 Mandelbrot 集创建深度缩放，您可能知道 OpenGL ES 不支持 double 数据类型。它可以提供的最高精度是 IEEE 754 float。在谷歌搜索和大量搜索之后，我发现了这个博客：https://blog.cyclemap.link/2011-06-09-glsl-part2-emu/，它完全致力于这个主题。但是，不幸的是，我无法理解那里提供的加法、减法和乘法代码。尤其是纠错和携带的部分我看不懂。如果您能向我解释错误检查的深度以及从低位到高位的位转移，那将非常有帮助。所以，到目前为止，我只了解将双精度拆分为两个浮点数的基本概念。但是，我不清楚基本操作的实现。如果使用二进制数的上下文来解释，那将非常有帮助。

Answer 1

首先是用来处理这个的基本原理。一旦您添加或减去具有高指数差异的数字，结果就会四舍五入：

12345600000 + 8.76542995683848E-4 = 12345600000

现在，正如我在中向您展示的那样，我们可以将我们的数字存储为更多浮点数的总和，例如 vec2, vec3, vec4，它们仍然是浮点数，但一起可以组合成更大的整体尾数位宽。您问题中的 link 不像我那样使用指数范围，而是使用四舍五入和非四舍五入结果之间的差异。然而，linked 库仅使用 vec2，这仍然比原生 64 位 double 精确得多，因为 double 的尾数有 53 位float 只有 24 位，所以 24+24 = 48 < 53 这就是我决定使用 vec3 的原因。现在的诀窍是获得舍入误差。对于与上面相同的示例：

a=12345600000 
b=8.76542995683848E-4
c=a+b=12345600000

a,b 是 + 操作的 float 个操作数，c 是舍入结果。所以差异 e 可以这样得到：

e=c-a; // e= 0
e-=b;  // e=-8.76542995683848E-4
e=-e;  // e=+8.76542995683848E-4

其中 e 是应该添加到 c 以获得不四舍五入结果的内容。

现在，如果我们在 vec3 的每个组件中存储数字的一部分，那么我们可以尝试将此 e 添加到所有组件中（始终从 e 中删除添加的部分）直到 e 为零。

因此，如果 c.x+e 确实存在，我们将其添加到 c.y 等等......基于此，我设法编写了这个：

//---------------------------------------------------------------------------
//--- High Precision float ver: 1.000 ---------------------------------------
//---------------------------------------------------------------------------
#ifndef _GLSL_HP32
#define _GLSL_HP32
//---------------------------------------------------------------------------
// helper functions (internals)
void hp32_nor(vec3 &c)          // bubble sort c coordinates desc by magnitude
    {
    float x;
    if (fabs(c.x)<fabs(c.y)){ x=c.x; c.x=c.y; c.y=x; }
    if (fabs(c.x)<fabs(c.z)){ x=c.x; c.x=c.z; c.z=x; }
    if (fabs(c.y)<fabs(c.z)){ x=c.y; c.y=c.z; c.z=x; }
    }
void hp32_err(vec3 &c,vec3 &e)  // c+=e; apply rounding error e corection to c
    {
    float q;
    q=c.x; c.x+=e.x; e.x=e.x-(c.x-q);
    q=c.x; c.x+=e.y; e.y=e.y-(c.x-q);
    q=c.x; c.x+=e.z; e.z=e.z-(c.x-q);
    q=c.y; c.y+=e.x; e.x=e.x-(c.y-q);
    q=c.y; c.y+=e.y; e.y=e.y-(c.y-q);
    q=c.y; c.y+=e.z; e.z=e.z-(c.y-q);
    q=c.z; c.z+=e.x; e.x=e.x-(c.z-q);
    q=c.z; c.z+=e.y; e.y=e.y-(c.z-q);
    q=c.z; c.z+=e.z; e.z=e.z-(c.z-q);
    hp32_nor(c);
    }
void hp32_split(vec3 &h,vec3 &l,vec3 a) // (h+l)=a; split mantissas to half
    {
    const float n=8193.0; // 10000000000001 bin uses ~half of mantissa bits
    h=a*n;  // this shifts the a left by half of mantissa (almost no rounding yet)
    l=h-a;  // this will round half of mantissa as h,a have half of mantisa bits exponent difference
    h-=l;   // this will get rid of the `n*` part from number leaving just high half of mantisa from original a
    l=a-h;  // this is just the differenc ebetween original a and h ... so lower half of mantisa beware might change sign
    }
//---------------------------------------------------------------------------
// double api (comment it out if double not present)
vec3 hp32_set(double a)                                             // double -> vec2
    {
    vec3 c;
    c.x=a; a-=c.x;
    c.y=a; a-=c.y;
    c.z=a; hp32_nor(c);
    return c;
    }
double hp32_getl(vec3 a){ double c; c=a.z+a.y; c+=a.x; return c; }  // vec2 -> double
//---------------------------------------------------------------------------
// normal api
vec3 hp32_set(float a){ return vec3(a,0.0,0.0); }                   // float -> vec2
float hp32_get(vec3 a){ float c; c=a.z+a.y; c+=a.x; return c; }     // vec2 -> float
vec3 hp32_add(vec3 a,vec3 b)                                        // = a+b
    {
    // c=a+b; addition
    vec3 c=a+b,e; float q;
    // e=(a+b)-c; rounding error
    c.x=a.x+b.x; e.x=c.x-a.x; e.x-=b.x;
    c.y=a.y+b.y; e.y=c.y-a.y; e.y-=b.y;
    c.z=a.z+b.z; e.z=c.z-a.z; e.z-=b.z;
    e=-e; hp32_err(c,e);
    return c;
    }
vec3 hp32_sub(vec3 a,vec3 b)                                        // = a-b
    {
    // c=a-b; substraction
    vec3 c=a-b,e; float q;
    // e=(a-b)-c; rounding error
    c.x=a.x+b.x; e.x=c.x-a.x; e.x+=b.x;
    c.y=a.y+b.y; e.y=c.y-a.y; e.y+=b.y;
    c.z=a.z+b.z; e.z=c.z-a.z; e.z+=b.z;
    e=-e; hp32_err(c,e);
    return c;
    }
vec3 hp32_mul_half(vec3 a,vec3 b)                                   // = a*b where a,b are just half of mantissas !!! internal call do not use this !!!
    {
    //  c = (a.x+a.y+a.z)*(b.x+b.y+b.z)     // multiplication of 2 expresions
    //  c = (a.x*b.x)+(a.x*b.y)+(a.x*b.z)   // expanded
    //     +(a.y*b.x)+(a.y*b.y)+(a.y*b.z)
    //     +(a.z*b.x)+(a.z*b.y)+(a.z*b.z)
    //  c = (a.x*b.x)                       // ordered desc by magnitude (x>=y>=z)
    //     +(a.x*b.y)+(a.y*b.x)
    //     +(a.x*b.z)+(a.z*b.x)+(a.y*b.y)
    //     +(a.y*b.z)+(a.z*b.y)
    //     +(a.z*b.z)
    vec3 c,e,f; float q,r;
    // c=a*b; (e,f)=(a*b)-c; multiplication
    c.x=(a.x*b.x);
    r=(a.x*b.y); q=c.x; c.x+=r; e.x=r-(c.x-q);
    r=(a.y*b.x); q=c.x; c.x+=r; e.y=r-(c.x-q);
    c.y=(a.x*b.z);
    r=(a.z*b.x); q=c.y; c.y+=r; e.z=r-(c.y-q);
    r=(a.y*b.y); q=c.y; c.y+=r; f.x=r-(c.y-q);
    c.z=(a.y*b.z);
    r=(a.z*b.y); q=c.z; c.z+=r; f.y=r-(c.z-q);
    r=(a.z*b.z); q=c.z; c.z+=r; f.z=r-(c.z-q);
    e=+hp32_add(e,f); hp32_err(c,e);
    return c;
    }
vec3 hp32_mul(vec3 a,vec3 b)                                        // = a*b
    {
    vec3 ah,al,bh,bl,c;
    // split operands to halves of mantissa
    hp32_split(ah,al,a);
    hp32_split(bh,bl,b);
    //  c = (ah+al)*(bh+bl) = ah*bh + ah*bl + al*bh + al*bl
    c=           hp32_mul_half(ah,bh) ;
    c=hp32_add(c,hp32_mul_half(ah,bl));
    c=hp32_add(c,hp32_mul_half(al,bh));
    c=hp32_add(c,hp32_mul_half(al,bl));
    return c;
    }
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------

现在我只在 CPU 端（C++）测试了这个。为了在 GLSL 中使用它，只需注释掉或删除我用来验证准确性的双 api 函数。并将 fabs 更改为 abs 或声明：

float fabs(float x){ return abs(x); }

同样，我有一些归一化函数 hp32_nor，它按大小对分量进行排序，因此 fabs(x)>=fabs(y)>=fabs(z) 需要返回 float 和乘法。 +,-不需要。

hp32_err就像上面描述的正常数和舍入误差差的加法（有点可怕，但它尽可能保持准确性）。

我还没有广泛测试！！！ 与 double 相比，看起来 +,-,* 操作更精确。

乘法实现有点复杂，因为浮点数的 a*b 将尾数的结果位宽作为操作数位宽的总和。因此，为了避免舍入，我们需要先将操作数分成两半。可以这样做（从你 linked 的库分析）：

// split float into h,l
float a,h,l,n;
n=8193.0;   // n =               10000000000001.00000000000000000000b
a=123.4567; // a =                      1111011.01110100111010101000b
h=a*n;      // h =         11110110111100011000.11000000000000000000b
l=h-a;      // l =         11110110111010011101.01010000000000000000b
h-=l;       // h =                      1111011.01110000000000000000b
l=a-h;      // l =                            0.00000100111010101000b

所以浮点数有 24 位尾数，8193 是 (24/2)+1=13 位宽。因此，一旦您将任何浮点数与它相乘，结果就需要比现在多一半的尾数位并四舍五入。这只是回到原始操作数规模并将另一半作为新的一半与原始浮点值之间的差异的问题。所有这些都是在辅助函数 hp32_split.

中完成的

现在对半的乘法 c=a*b 看起来像这样：

c = (ah+al)*(bh+bl) = ah*bh + ah*bl + al*bh + al*bl

每个半乘法 a?*b? 就像这样：

c = (a.x+a.y+a.z)*(b.x+b.y+b.z)     // multiplication of 2 expresions
c = (a.x*b.x)+(a.x*b.y)+(a.x*b.z)   // expanded
   +(a.y*b.x)+(a.y*b.y)+(a.y*b.z)
   +(a.z*b.x)+(a.z*b.y)+(a.z*b.z)
c = (a.x*b.x)                       // ordered desc by magnitude (x>=y>=z)
   +(a.x*b.y)+(a.y*b.x)
   +(a.x*b.z)+(a.z*b.x)+(a.y*b.y)
   +(a.y*b.z)+(a.z*b.y)
   +(a.z*b.z)

所以我将 c 的每个组成部分分成 3 个括号项。重要的是按大小对项进行排序以尽可能防止舍入错误。随着项的求和，我只是像加法一样累积错误。

如何在 OpenGL ES 中使用两个浮点数模拟双精度？

How to emulate double precision using two floats in OpenGL ES?

floating-point

opengl-es

glsl