"Big Float" Mandelbrot 在 GPU 上的运行速度比 CPU 慢

"Big Float" Mandelbrot runs slower on GPU than CPU

我刚刚将我的“大浮动”实现移植到 OpenCL,但事实证明 GPU 比 CPU 慢?这非常令人惊讶,因为我认为 Mandelbrot 是一个令人尴尬的并行问题。

GPU
Preview render completed. Time taken: 86.031792ms. Iterations: 32
Preview render completed. Time taken: 268.726288ms. Iterations: 96
Preview render completed. Time taken: 461.045258ms. Iterations: 160
Full render completed. Time taken: 5908.507812ms. Iterations: 160

CPU
Preview render completed. Time taken: 101.530037ms. Iterations: 32
Preview render completed. Time taken: 199.125870ms. Iterations: 96
Preview render completed. Time taken: 281.195496ms. Iterations: 160
Full render completed. Time taken: 3264.943115ms. Iterations: 160

有没有人有过在 GPU 上做大浮点数的经验?高效使用 GPU 是否过于复杂,因此最好在 CPU 上完成?

似乎没有办法在 M1 上分析 OpenCL Mac 所以我自己无法真正回答这个问题:(

详情:

GPU 版本代码:

enum sign
{
    SIGN_NEG,
    SIGN_ZERO,
    SIGN_POS
};

struct __attribute__ ((packed)) fp256
{
    enum sign sign;
    CL_UINT man[8];
};

struct __attribute__ ((packed)) fp512
{
    enum sign sign;
    CL_UINT man[16];
};

struct fp256 fp_uadd256(struct fp256 a, struct fp256 b)
{
    struct fp256 c;
    char carry = 0;
    for (int i = 7; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] + (CL_ULONG)b.man[i] + (CL_ULONG)carry;
        carry = (char)(temp >> 32); // Note that the highest 31 bits of temp are all 0.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

struct fp256 fp_usub256(struct fp256 a, struct fp256 b)
{
    struct fp256 c;
    char carry = 0;
    for (int i = 7; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] - (CL_ULONG)b.man[i] - (CL_ULONG)carry;
        carry = (char)(temp >> 63); // Check if wrapped around.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

struct fp512 fp_uadd512(struct fp512 a, struct fp512 b)
{
    struct fp512 c;
    char carry = 0;
    for (int i = 15; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] + (CL_ULONG)b.man[i] + (CL_ULONG)carry;
        carry = (char)(temp >> 32); // Note that the highest 31 bits of temp are all 0.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

enum cmp
{
    CMP_SAME,
    CMP_A_BIG,
    CMP_B_BIG
};

enum cmp fp_ucmp256(struct fp256 a, struct fp256 b)
{
    struct fp256 c = fp_usub256(a, b);
    bool is_negative = (c.man[0] >> 31) == 1;
    if (is_negative)
        return CMP_B_BIG;
    else
    {

// Here, we check if the difference is 0. If so, then both a and b are the same,
// but if not, then the difference must be positive, and thus a > b.

        for (int i = 0; i < 8; i++)
            if (c.man[i] != 0)
                return CMP_A_BIG;
        return CMP_SAME;
    }
}

struct fp256 fp_sadd256(struct fp256 a, struct fp256 b)
{
    if (a.sign == SIGN_ZERO && b.sign == SIGN_ZERO)
        return a;
    if (b.sign == SIGN_ZERO)
        return a;
    if (a.sign == SIGN_ZERO)
        return b;
    if ((a.sign == SIGN_POS && b.sign == SIGN_POS) ||
        (a.sign == SIGN_NEG && b.sign == SIGN_NEG))
    {
        struct fp256 c = fp_uadd256(a, b);
        c.sign = a.sign;
        return c;
    }

    //assert((a.sign == SIGN_POS && b.sign == SIGN_NEG) ||
    //       (a.sign == SIGN_NEG && b.sign == SIGN_POS));

    enum cmp cmp = fp_ucmp256(a, b);
    if (cmp == CMP_SAME)
        return (struct fp256) { SIGN_ZERO, {0} };
    
    if (a.sign == SIGN_POS && b.sign == SIGN_NEG)
    {
        if (cmp == CMP_A_BIG)
        {
            struct fp256 c = fp_usub256(a, b);
            c.sign = SIGN_POS;
            return c;
        }
        else
        {
            struct fp256 c = fp_usub256(b, a);
            c.sign = SIGN_NEG;
            return c;
        }
    }
    else
    {
        if (cmp == CMP_A_BIG)
        {
            struct fp256 c = fp_usub256(a, b);
            c.sign = SIGN_NEG;
            return c;
        }
        else
        {
            struct fp256 c = fp_usub256(b, a);
            c.sign = SIGN_POS;
            return c;
        }
    }
}

struct fp256 fp_sinv256(struct fp256 a)
{
    if (a.sign == SIGN_POS) a.sign = SIGN_NEG;
    else if (a.sign == SIGN_NEG) a.sign = SIGN_POS;
    return a;
}

struct fp256 fp_ssub256(struct fp256 a, struct fp256 b)
{
    return fp_sadd256(a, fp_sinv256(b));
}

struct fp256 fp_smul256(struct fp256 a, struct fp256 b)
{
    if (a.sign == SIGN_ZERO || b.sign == SIGN_ZERO)
        return (struct fp256) { SIGN_ZERO, {0} };

    enum sign sign;
    if (a.sign == SIGN_NEG && b.sign == SIGN_NEG)
        sign = SIGN_POS;
    else if (a.sign == SIGN_NEG || b.sign == SIGN_NEG)
        sign = SIGN_NEG;
    else
        sign = SIGN_POS;

    struct fp512 c = {0};
    for (int i = 7; i >= 0; i--) // a
    {
        for (int j = 7; j >= 0; j--) // b
        {
            int low_offset = 15 - (7 - i) - (7 - j);
            //assert(low_offset >= 1);
            int high_offset = low_offset - 1;

            CL_ULONG mult = (CL_ULONG)a.man[i] * (CL_ULONG)b.man[j];
            struct fp512 temp = {0};
            temp.man[low_offset] = (CL_UINT)mult;
            temp.man[high_offset] = mult >> 32;

            c = fp_uadd512(c, temp);
        }
    }

    struct fp256 c256;
    c256.sign = sign;
    for (int i = 1; i <= 8; i++)
        c256.man[i - 1] = c.man[i];

    return c256;
}

struct fp256 fp_ssqr256(struct fp256 a)
{
    return fp_smul256(a, a);
}

struct fp256 fp_asr256(struct fp256 a)
{
    for (int i = 7; i >= 1; i--)
    {
        a.man[i] >>= 1;
        a.man[i] |= (a.man[i - 1] & 0x1) << 31;
    }
    a.man[0] >>= 1;
    return a;
}

struct fp256 int_to_fp256(int a)
{
    if (a == 0)
        return (struct fp256){ SIGN_ZERO, {0} };
    
    struct fp256 b = {0};
    if (a < 0)
    {
        b.sign = SIGN_NEG;
        a = -a;
    }
    else
        b.sign = SIGN_POS;
    
    b.man[0] = (CL_UINT)a;
    return b;
}

That's quite surprising as I thought that Mandelbrot was an embarrassingly parallel problem.

是的,但是两个维度上工作项之间的工作映射不稳定。有些工作项仅在 5 次迭代中完成,而有些则需要 100 次迭代。那些提前完成的只是开始空闲,直到计算出同一图块中的最后一个(200 次迭代?)像素。

要优化等待繁忙像素(高迭代像素)期间的空闲周期,您可以这样做:

  • 让 1 个 GPU 工作项占用 8x8 像素块。 (800x800 图像的 10k 个工作项)
  • 使用 GPU 工作项 运行 内核
  • 对图块的 4 个角进行“采样”
  • 如果采样迭代值彼此相对接近,则通过该工作项生成一个迷你内核(使用动态并行性)并在 64 个工作项(GPU 线程)上使用该迷你内核计算 8x8 区域
  • 如果采样迭代值彼此之间差异很大,则让父工作项计算整个 8x8 区域,像素计算之间没有任何空闲周期。

另一种选择可能是将那些“发散”的图块发送到 CPU 并在 GPU 上仅计算“稳定”的图块。这种方式可以将两全其美与相对较少的 GPU 周期浪费相结合。或者,您可以根据猜测的平均迭代值对所有图块进行排序,运行 最高的仅在 GPU 上,最低的仅在 CPU.

对 8x8 图块的角点进行采样大约占其总计算量的 1/16。所以有了相对便宜的预处理,就可以公平分配了。