"Big Float" Mandelbrot 在 GPU 上的运行速度比 CPU 慢
"Big Float" Mandelbrot runs slower on GPU than CPU
I've just ported my "big float" implementation to OpenCL, but it turns out the GPU is slower than the CPU? That's quite surprising, as I thought Mandelbrot was an embarrassingly parallel problem.
GPU
Preview render completed. Time taken: 86.031792ms. Iterations: 32
Preview render completed. Time taken: 268.726288ms. Iterations: 96
Preview render completed. Time taken: 461.045258ms. Iterations: 160
Full render completed. Time taken: 5908.507812ms. Iterations: 160
CPU
Preview render completed. Time taken: 101.530037ms. Iterations: 32
Preview render completed. Time taken: 199.125870ms. Iterations: 96
Preview render completed. Time taken: 281.195496ms. Iterations: 160
Full render completed. Time taken: 3264.943115ms. Iterations: 160
Does anyone have experience doing big-float arithmetic on the GPU? Is using the GPU efficiently too complicated here, so that this kind of work is better done on the CPU?
There doesn't seem to be any way to profile OpenCL on an M1 Mac, so I can't really answer that myself :(
Details:
The GPU uses 32-bit integers, whereas the CPU uses 64-bit ones. That means the GPU needs 8 limbs for a 256-bit number while the CPU only needs 4. I did it this way because in OpenCL I can't use inline ASM for ADC, nor 128-bit integers, whereas I can do both on the CPU.
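For comparison, this is a minimal sketch of what a 64-bit-limb add can look like on the CPU side using GCC/Clang's unsigned __int128 (not the actual CPU code from this project; u256 and u256_add are illustrative names):

#include <stdint.h>

/* 256-bit unsigned add with 64-bit limbs, most significant limb first,
 * mirroring the layout of fp256.man[] but with 4 limbs instead of 8.
 * unsigned __int128 provides the carry "for free"; OpenCL C has no such
 * type, which is why the GPU version falls back to 32-bit limbs with
 * 64-bit temporaries. */
typedef struct { uint64_t man[4]; } u256;

static u256 u256_add(u256 a, u256 b)
{
    u256 c;
    unsigned __int128 carry = 0;
    for (int i = 3; i >= 0; i--)
    {
        unsigned __int128 t = (unsigned __int128)a.man[i] + b.man[i] + carry;
        c.man[i] = (uint64_t)t; // low 64 bits
        carry    = t >> 64;     // 0 or 1
    }
    return c;
}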
Right now, profiling and optimising on the CPU seems a lot easier, and I'm wondering whether that's simply the better option.
I have confirmed that the kernel is the bottleneck, not the CPU-GPU transfer (1 ms) or blitting the image via an SDL bitmap (1 ms).
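For reference, per-kernel time can still be measured with OpenCL events even without a full profiler. A minimal host-side sketch, assuming a queue created with CL_QUEUE_PROFILING_ENABLE (not necessarily how the numbers above were obtained):

#include <stdio.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

/* Times one kernel dispatch using the command's device timestamps.
 * 'queue' must have been created with CL_QUEUE_PROFILING_ENABLE. */
static void time_kernel(cl_command_queue queue, cl_kernel kernel,
                        size_t global_size)
{
    cl_event ev;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL,
                           0, NULL, &ev);
    clWaitForEvents(1, &ev);

    cl_ulong start = 0, end = 0; // nanoseconds
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                            sizeof start, &start, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                            sizeof end, &end, NULL);
    printf("Kernel time: %fms\n", (end - start) / 1e6);
    clReleaseEvent(ev);
}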
GPU version code:
enum sign
{
    SIGN_NEG,
    SIGN_ZERO,
    SIGN_POS
};
struct __attribute__ ((packed)) fp256
{
    enum sign sign;
    CL_UINT man[8];
};
struct __attribute__ ((packed)) fp512
{
    enum sign sign;
    CL_UINT man[16];
};
struct fp256 fp_uadd256(struct fp256 a, struct fp256 b)
{
    struct fp256 c;
    char carry = 0;
    for (int i = 7; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] + (CL_ULONG)b.man[i] + (CL_ULONG)carry;
        carry = (char)(temp >> 32); // Note that the highest 31 bits of temp are all 0.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}
struct fp256 fp_usub256(struct fp256 a, struct fp256 b)
{
    struct fp256 c;
    char carry = 0;
    for (int i = 7; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] - (CL_ULONG)b.man[i] - (CL_ULONG)carry;
        carry = (char)(temp >> 63); // Check if wrapped around.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}
struct fp512 fp_uadd512(struct fp512 a, struct fp512 b)
{
    struct fp512 c;
    char carry = 0;
    for (int i = 15; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] + (CL_ULONG)b.man[i] + (CL_ULONG)carry;
        carry = (char)(temp >> 32); // Note that the highest 31 bits of temp are all 0.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}
enum cmp
{
    CMP_SAME,
    CMP_A_BIG,
    CMP_B_BIG
};
enum cmp fp_ucmp256(struct fp256 a, struct fp256 b)
{
    struct fp256 c = fp_usub256(a, b);
    bool is_negative = (c.man[0] >> 31) == 1;
    if (is_negative)
        return CMP_B_BIG;
    else
    {
        // Here, we check if the difference is 0. If so, then both a and b are the same,
        // but if not, then the difference must be positive, and thus a > b.
        for (int i = 0; i < 8; i++)
            if (c.man[i] != 0)
                return CMP_A_BIG;
        return CMP_SAME;
    }
}
struct fp256 fp_sadd256(struct fp256 a, struct fp256 b)
{
    if (a.sign == SIGN_ZERO && b.sign == SIGN_ZERO)
        return a;
    if (b.sign == SIGN_ZERO)
        return a;
    if (a.sign == SIGN_ZERO)
        return b;
    if ((a.sign == SIGN_POS && b.sign == SIGN_POS) ||
        (a.sign == SIGN_NEG && b.sign == SIGN_NEG))
    {
        struct fp256 c = fp_uadd256(a, b);
        c.sign = a.sign;
        return c;
    }
    //assert((a.sign == SIGN_POS && b.sign == SIGN_NEG) ||
    //       (a.sign == SIGN_NEG && b.sign == SIGN_POS));
    enum cmp cmp = fp_ucmp256(a, b);
    if (cmp == CMP_SAME)
        return (struct fp256) { SIGN_ZERO, {0} };
    if (a.sign == SIGN_POS && b.sign == SIGN_NEG)
    {
        if (cmp == CMP_A_BIG)
        {
            struct fp256 c = fp_usub256(a, b);
            c.sign = SIGN_POS;
            return c;
        }
        else
        {
            struct fp256 c = fp_usub256(b, a);
            c.sign = SIGN_NEG;
            return c;
        }
    }
    else
    {
        if (cmp == CMP_A_BIG)
        {
            struct fp256 c = fp_usub256(a, b);
            c.sign = SIGN_NEG;
            return c;
        }
        else
        {
            struct fp256 c = fp_usub256(b, a);
            c.sign = SIGN_POS;
            return c;
        }
    }
}
struct fp256 fp_sinv256(struct fp256 a)
{
    if (a.sign == SIGN_POS) a.sign = SIGN_NEG;
    else if (a.sign == SIGN_NEG) a.sign = SIGN_POS;
    return a;
}
struct fp256 fp_ssub256(struct fp256 a, struct fp256 b)
{
    return fp_sadd256(a, fp_sinv256(b));
}
struct fp256 fp_smul256(struct fp256 a, struct fp256 b)
{
    if (a.sign == SIGN_ZERO || b.sign == SIGN_ZERO)
        return (struct fp256) { SIGN_ZERO, {0} };
    enum sign sign;
    if (a.sign == SIGN_NEG && b.sign == SIGN_NEG)
        sign = SIGN_POS;
    else if (a.sign == SIGN_NEG || b.sign == SIGN_NEG)
        sign = SIGN_NEG;
    else
        sign = SIGN_POS;
    struct fp512 c = {0};
    for (int i = 7; i >= 0; i--) // a
    {
        for (int j = 7; j >= 0; j--) // b
        {
            int low_offset = 15 - (7 - i) - (7 - j);
            //assert(low_offset >= 1);
            int high_offset = low_offset - 1;
            CL_ULONG mult = (CL_ULONG)a.man[i] * (CL_ULONG)b.man[j];
            struct fp512 temp = {0};
            temp.man[low_offset] = (CL_UINT)mult;
            temp.man[high_offset] = mult >> 32;
            c = fp_uadd512(c, temp);
        }
    }
    struct fp256 c256;
    c256.sign = sign;
    for (int i = 1; i <= 8; i++)
        c256.man[i - 1] = c.man[i];
    return c256;
}
struct fp256 fp_ssqr256(struct fp256 a)
{
    return fp_smul256(a, a);
}
struct fp256 fp_asr256(struct fp256 a)
{
    for (int i = 7; i >= 1; i--)
    {
        a.man[i] >>= 1;
        a.man[i] |= (a.man[i - 1] & 0x1) << 31;
    }
    a.man[0] >>= 1;
    return a;
}
struct fp256 int_to_fp256(int a)
{
    if (a == 0)
        return (struct fp256){ SIGN_ZERO, {0} };
    struct fp256 b = {0};
    if (a < 0)
    {
        b.sign = SIGN_NEG;
        a = -a;
    }
    else
        b.sign = SIGN_POS;
    b.man[0] = (CL_UINT)a;
    return b;
}
That's quite surprising as I thought that Mandelbrot was an embarrassingly parallel problem.
Yes, but the work is mapped unevenly onto the work-items in both dimensions. Some work-items finish after only 5 iterations while others need 100. The ones that finish early just sit idle until the last pixel in the same tile (200 iterations?) has been computed.
To cut down the idle cycles spent waiting on busy (high-iteration) pixels, you could do something like this:
- Have one GPU work-item take an 8x8 block of pixels (10k work-items for an 800x800 image).
- Use that work-item to run a kernel that "samples" the 4 corners of the tile.
- If the sampled iteration values are reasonably close to each other, have that work-item spawn a mini-kernel (using dynamic parallelism) and compute the 8x8 region with it across 64 work-items (GPU threads); see the sketch after this list.
- If the sampled iteration values differ a lot, have the parent work-item compute the whole 8x8 region itself, with no idle cycles between pixel computations.
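A rough sketch of that per-tile decision in OpenCL-style C (iterate_pixel, TILE and SPREAD_LIMIT are made-up placeholders, and the device-side enqueue path needs OpenCL 2.x dynamic parallelism, which may not be available on this platform):

#define TILE 8
#define SPREAD_LIMIT 8

int iterate_pixel(int x, int y); // placeholder: per-pixel escape-time count

__kernel void tile_dispatch(__global int *iters, int width)
{
    int tx = get_global_id(0) * TILE; // tile origin in pixels
    int ty = get_global_id(1) * TILE;

    // Sample the 4 corners of the tile.
    int c0 = iterate_pixel(tx,            ty);
    int c1 = iterate_pixel(tx + TILE - 1, ty);
    int c2 = iterate_pixel(tx,            ty + TILE - 1);
    int c3 = iterate_pixel(tx + TILE - 1, ty + TILE - 1);

    int lo = min(min(c0, c1), min(c2, c3));
    int hi = max(max(c0, c1), max(c2, c3));

    if (hi - lo <= SPREAD_LIMIT)
    {
        // Corners agree: hand the tile to 64 work-items via device-side
        // enqueue (enqueue_kernel, OpenCL 2.x) so they retire together.
    }
    else
    {
        // Corners disagree: this work-item grinds through the whole tile
        // itself, so no lane idles waiting for a slow neighbour.
        for (int y = ty; y < ty + TILE; y++)
            for (int x = tx; x < tx + TILE; x++)
                iters[y * width + x] = iterate_pixel(x, y);
    }
}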
Another option could be to send the "divergent" tiles to the CPU and compute only the "stable" tiles on the GPU. That way you combine the best of both worlds with relatively few wasted GPU cycles. Alternatively, you could sort all the tiles by their guessed average iteration value and run the highest ones only on the GPU and the lowest ones only on the CPU.
Sampling the 4 corners of an 8x8 tile costs roughly 1/16 of its total work (4 of 64 pixels), so with that relatively cheap preprocessing you can get a fair distribution.
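Host-side, the sort-and-split variant could look like this minimal sketch in plain C (tile_est, by_est_desc and gpu_share are illustrative names; the estimates would come from the corner-sampling pass):

#include <stdlib.h>

/* One entry per 8x8 tile: its index and a guessed average iteration count
 * (for example the mean of the 4 sampled corners). */
struct tile_est { int tile; int est; };

static int by_est_desc(const void *a, const void *b)
{
    return ((const struct tile_est *)b)->est - ((const struct tile_est *)a)->est;
}

/* Sort tiles by estimated cost and send the heaviest fraction to the GPU,
 * the rest to the CPU. gpu_share is a tuning knob, not a derived value. */
static void split_tiles(struct tile_est *t, int n, double gpu_share,
                        int *gpu_count /* first *gpu_count entries go to the GPU */)
{
    qsort(t, (size_t)n, sizeof *t, by_est_desc);
    *gpu_count = (int)(n * gpu_share);
}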