在 Clang 下解决缺少 Yz 机器约束的问题?
Work around lack of Yz machine constraint under Clang?
如果 __SHA__
未定义,我们使用内联汇编使 SHA 指令可用。在 GCC 下我们使用:
GCC_INLINE __m128i GCC_INLINE_ATTRIB
MM_SHA256RNDS2_EPU32(__m128i a, const __m128i b, const __m128i c)
{
asm ("sha256rnds2 %2, %1, %0" : "+x"(a) : "xm"(b), "Yz" (c));
return a;
}
Clang 不支持 GCC 的 Yz 约束(参见 Clang 3.2 Issue 13199 和 Clang 3.9 Issue 32727),而 sha256rnds2 指令恰恰需要这个约束:
Yz
First SSE register (%xmm0).
在 Clang 下,我们额外添加了一条 mov 指令:
asm ("mov %2, %%xmm0; sha256rnds2 %%xmm0, %1, %0" : "+x"(a) : "xm"(b), "x" (c) : "xmm0");
性能每字节下降大约 3 个周期。在我的 2.2 GHz Celeron J3455 测试机(带有 SHA 扩展的 Goldmont)上,这相当于损失约 230 MiB/s 的吞吐量。这个差距相当显著。
查看反汇编,当执行两轮时,Clang 没有围绕 SHA k
进行优化:
Breakpoint 2, SHA256_SSE_SHA_HashBlocks (state=0xaaa3a0,
data=0xaaa340, length=0x40) at sha.cpp:1101
1101 STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
(gdb) disass
Dump of assembler code for function SHA256_SSE_SHA_HashBlocks(unsigned int*, unsigned int const*, unsigned long):
0x000000000068cdd0 <+0>: sub $0x308,%rsp
0x000000000068cdd7 <+7>: movdqu (%rdi),%xmm0
0x000000000068cddb <+11>: movdqu 0x10(%rdi),%xmm1
...
0x000000000068ce49 <+121>: movq %xmm2,%xmm0
0x000000000068ce4d <+125>: sha256rnds2 %xmm0,0x2f0(%rsp),%xmm1
0x000000000068ce56 <+134>: pshufd $0xe,%xmm2,%xmm3
0x000000000068ce5b <+139>: movdqa %xmm13,%xmm2
0x000000000068ce60 <+144>: movaps %xmm1,0x2e0(%rsp)
0x000000000068ce68 <+152>: movq %xmm3,%xmm0
0x000000000068ce6c <+156>: sha256rnds2 %xmm0,0x2e0(%rsp),%xmm2
0x000000000068ce75 <+165>: movdqu 0x10(%rsi),%xmm3
0x000000000068ce7a <+170>: pshufb %xmm8,%xmm3
0x000000000068ce80 <+176>: movaps %xmm2,0x2d0(%rsp)
0x000000000068ce88 <+184>: movdqa %xmm3,%xmm4
0x000000000068ce8c <+188>: paddd 0x6729c(%rip),%xmm4 # 0x6f4130
0x000000000068ce94 <+196>: movq %xmm4,%xmm0
0x000000000068ce98 <+200>: sha256rnds2 %xmm0,0x2d0(%rsp),%xmm1
...
例如,从 0068ce8c 到 0068ce98 的这几条指令本应是:
paddd 0x6729c(%rip),%xmm0 # 0x6f4130
sha256rnds2 %xmm0,0x2d0(%rsp),%xmm1
我猜我们选择的内联 asm 指令有点不对。
我们如何解决 Clang 下缺少 Yz
机器约束的问题?什么模式避免了优化代码中的中间移动?
正在尝试使用 Explicit Register Variable:
const __m128i k asm("xmm0") = c;
asm ("sha256rnds2 %2, %1, %0" : "+x"(a) : "xm"(b), "x" (k));
return a;
结果:
In file included from sha.cpp:24:
./cpu.h:831:22: warning: ignored asm label 'xmm0' on automatic variable
const __m128i k asm("xmm0") = c;
^
./cpu.h:833:7: error: invalid operand for instruction
asm ("sha256rnds2 %2, %1, %0" : "+x"(a) : "xm"(b), "x" (k));
^
<inline asm>:1:21: note: instantiated into assembly here
sha256rnds2 %xmm1, 752(%rsp), %xmm0
^~~~~~~~~~
In file included from sha.cpp:24:
./cpu.h:833:7: error: invalid operand for instruction
asm ("sha256rnds2 %2, %1, %0" : "+x"(a) : "xm"(b), "x" (k));
^
<inline asm>:1:21: note: instantiated into assembly here
sha256rnds2 %xmm3, 736(%rsp), %xmm1
^~~~~~~~~~
...
我是根据 inline assembly 标签撰写这个回答的,问题中并未指定具体的语言。扩展汇编模板(Extended assembly templates)本身就已经假定使用了语言扩展。
如果 Yz 约束不可用,您可以尝试创建一个临时变量,用它(而不是约束)来告诉 Clang 使用哪个寄存器。您可以通过所谓的 Explicit Register Variable(显式寄存器变量)来做到这一点:
You can define a local register variable and associate it with a specified register like this:
register int *foo asm ("r12");
Here r12 is the name of the register that should be used. Note that this is the same syntax used for defining global register variables, but for a local variable the declaration appears within a function. The register keyword is required, and cannot be combined with static. The register name must be a valid register name for the target platform.
在您的情况下,您希望强制使用 xmm0 寄存器。您可以把 c 参数赋给一个绑定到显式寄存器的临时变量,并将该临时变量用作扩展内联汇编的操作数。这正是 GCC/Clang 中显式寄存器变量的主要用途。
GCC_INLINE __m128i GCC_INLINE_ATTRIB
MM_SHA256RNDS2_EPU32(__m128i a, const __m128i b, const __m128i c)
{
register const __m128i tmpc asm("xmm0") = c;
__asm__("sha256rnds2 %2, %1, %0" : "+x"(a) : "x"(b), "x" (tmpc));
return a;
}
编译器现在应该能够提供一些优化,因为它对如何使用 xmm0
寄存器有更多的了解。
当您把 mov %2, %%xmm0; 写进模板时,Clang(和 GCC)不会对模板中的指令做任何优化。Basic Assembly 和 Extended Assembly 模板对编译器来说是一个黑盒,编译器只知道如何根据约束向其中做基本的操作数替换。
下面是使用上述方法后得到的反汇编。它是用 clang++ 和 -std=c++03 编译的。多余的 mov 指令已不再出现:
Breakpoint 1, SHA256_SSE_SHA_HashBlocks (state=0x7fffffffae60,
data=0x7fffffffae00, length=0x40) at sha.cpp:1101
1101 STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
(gdb) disass
Dump of assembler code for function SHA256_SSE_SHA_HashBlocks(unsigned int*, unsigned int const*, unsigned long):
0x000000000068cf60 <+0>: sub $0x308,%rsp
0x000000000068cf67 <+7>: movdqu (%rdi),%xmm0
0x000000000068cf6b <+11>: movdqu 0x10(%rdi),%xmm1
...
0x000000000068cfe6 <+134>: paddd 0x670e2(%rip),%xmm0 # 0x6f40d0
0x000000000068cfee <+142>: sha256rnds2 %xmm0,0x2f0(%rsp),%xmm2
0x000000000068cff7 <+151>: pshufd $0xe,%xmm0,%xmm1
0x000000000068cffc <+156>: movdqa %xmm1,%xmm0
0x000000000068d000 <+160>: movaps %xmm2,0x2e0(%rsp)
0x000000000068d008 <+168>: sha256rnds2 %xmm0,0x2e0(%rsp),%xmm3
0x000000000068d011 <+177>: movdqu 0x10(%rsi),%xmm5
0x000000000068d016 <+182>: pshufb %xmm9,%xmm5
0x000000000068d01c <+188>: movaps %xmm3,0x2d0(%rsp)
0x000000000068d024 <+196>: movdqa %xmm5,%xmm0
0x000000000068d028 <+200>: paddd 0x670b0(%rip),%xmm0 # 0x6f40e0
0x000000000068d030 <+208>: sha256rnds2 %xmm0,0x2d0(%rsp),%xmm2
...
如果 __SHA__
未定义,我们使用内联汇编使 SHA 指令可用。在 GCC 下我们使用:
GCC_INLINE __m128i GCC_INLINE_ATTRIB
MM_SHA256RNDS2_EPU32(__m128i a, const __m128i b, const __m128i c)
{
asm ("sha256rnds2 %2, %1, %0" : "+x"(a) : "xm"(b), "Yz" (c));
return a;
}
Clang 不支持 GCC 的 Yz 约束(参见 Clang 3.2 Issue 13199 和 Clang 3.9 Issue 32727),而 sha256rnds2 指令恰恰需要这个约束:
Yz First SSE register (%xmm0).
在 Clang 下,我们额外添加了一条 mov 指令:
asm ("mov %2, %%xmm0; sha256rnds2 %%xmm0, %1, %0" : "+x"(a) : "xm"(b), "x" (c) : "xmm0");
性能每字节下降大约 3 个周期。在我的 2.2 GHz Celeron J3455 测试机(带有 SHA 扩展的 Goldmont)上,这相当于损失约 230 MiB/s 的吞吐量。这个差距相当显著。
查看反汇编,当执行两轮时,Clang 没有围绕 SHA k
进行优化:
Breakpoint 2, SHA256_SSE_SHA_HashBlocks (state=0xaaa3a0,
data=0xaaa340, length=0x40) at sha.cpp:1101
1101 STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
(gdb) disass
Dump of assembler code for function SHA256_SSE_SHA_HashBlocks(unsigned int*, unsigned int const*, unsigned long):
0x000000000068cdd0 <+0>: sub $0x308,%rsp
0x000000000068cdd7 <+7>: movdqu (%rdi),%xmm0
0x000000000068cddb <+11>: movdqu 0x10(%rdi),%xmm1
...
0x000000000068ce49 <+121>: movq %xmm2,%xmm0
0x000000000068ce4d <+125>: sha256rnds2 %xmm0,0x2f0(%rsp),%xmm1
0x000000000068ce56 <+134>: pshufd $0xe,%xmm2,%xmm3
0x000000000068ce5b <+139>: movdqa %xmm13,%xmm2
0x000000000068ce60 <+144>: movaps %xmm1,0x2e0(%rsp)
0x000000000068ce68 <+152>: movq %xmm3,%xmm0
0x000000000068ce6c <+156>: sha256rnds2 %xmm0,0x2e0(%rsp),%xmm2
0x000000000068ce75 <+165>: movdqu 0x10(%rsi),%xmm3
0x000000000068ce7a <+170>: pshufb %xmm8,%xmm3
0x000000000068ce80 <+176>: movaps %xmm2,0x2d0(%rsp)
0x000000000068ce88 <+184>: movdqa %xmm3,%xmm4
0x000000000068ce8c <+188>: paddd 0x6729c(%rip),%xmm4 # 0x6f4130
0x000000000068ce94 <+196>: movq %xmm4,%xmm0
0x000000000068ce98 <+200>: sha256rnds2 %xmm0,0x2d0(%rsp),%xmm1
...
例如,从 0068ce8c 到 0068ce98 的这几条指令本应是:
paddd 0x6729c(%rip),%xmm0 # 0x6f4130
sha256rnds2 %xmm0,0x2d0(%rsp),%xmm1
我猜我们选择的内联 asm 指令有点不对。
我们如何解决 Clang 下缺少 Yz
机器约束的问题?什么模式避免了优化代码中的中间移动?
正在尝试使用 Explicit Register Variable:
const __m128i k asm("xmm0") = c;
asm ("sha256rnds2 %2, %1, %0" : "+x"(a) : "xm"(b), "x" (k));
return a;
结果:
In file included from sha.cpp:24:
./cpu.h:831:22: warning: ignored asm label 'xmm0' on automatic variable
const __m128i k asm("xmm0") = c;
^
./cpu.h:833:7: error: invalid operand for instruction
asm ("sha256rnds2 %2, %1, %0" : "+x"(a) : "xm"(b), "x" (k));
^
<inline asm>:1:21: note: instantiated into assembly here
sha256rnds2 %xmm1, 752(%rsp), %xmm0
^~~~~~~~~~
In file included from sha.cpp:24:
./cpu.h:833:7: error: invalid operand for instruction
asm ("sha256rnds2 %2, %1, %0" : "+x"(a) : "xm"(b), "x" (k));
^
<inline asm>:1:21: note: instantiated into assembly here
sha256rnds2 %xmm3, 736(%rsp), %xmm1
^~~~~~~~~~
...
我是根据 inline assembly 标签撰写这个回答的,问题中并未指定具体的语言。扩展汇编模板(Extended assembly templates)本身就已经假定使用了语言扩展。
如果 Yz 约束不可用,您可以尝试创建一个临时变量,用它(而不是约束)来告诉 Clang 使用哪个寄存器。您可以通过所谓的 Explicit Register Variable(显式寄存器变量)来做到这一点:
You can define a local register variable and associate it with a specified register like this:
register int *foo asm ("r12");
Here r12 is the name of the register that should be used. Note that this is the same syntax used for defining global register variables, but for a local variable the declaration appears within a function. The register keyword is required, and cannot be combined with static. The register name must be a valid register name for the target platform.
在您的情况下,您希望强制使用 xmm0 寄存器。您可以把 c 参数赋给一个绑定到显式寄存器的临时变量,并将该临时变量用作扩展内联汇编的操作数。这正是 GCC/Clang 中显式寄存器变量的主要用途。
GCC_INLINE __m128i GCC_INLINE_ATTRIB
MM_SHA256RNDS2_EPU32(__m128i a, const __m128i b, const __m128i c)
{
register const __m128i tmpc asm("xmm0") = c;
__asm__("sha256rnds2 %2, %1, %0" : "+x"(a) : "x"(b), "x" (tmpc));
return a;
}
编译器现在应该能够提供一些优化,因为它对如何使用 xmm0
寄存器有更多的了解。
当您把 mov %2, %%xmm0; 写进模板时,Clang(和 GCC)不会对模板中的指令做任何优化。Basic Assembly 和 Extended Assembly 模板对编译器来说是一个黑盒,编译器只知道如何根据约束向其中做基本的操作数替换。
下面是使用上述方法后得到的反汇编。它是用 clang++ 和 -std=c++03 编译的。多余的 mov 指令已不再出现:
Breakpoint 1, SHA256_SSE_SHA_HashBlocks (state=0x7fffffffae60,
data=0x7fffffffae00, length=0x40) at sha.cpp:1101
1101 STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
(gdb) disass
Dump of assembler code for function SHA256_SSE_SHA_HashBlocks(unsigned int*, unsigned int const*, unsigned long):
0x000000000068cf60 <+0>: sub $0x308,%rsp
0x000000000068cf67 <+7>: movdqu (%rdi),%xmm0
0x000000000068cf6b <+11>: movdqu 0x10(%rdi),%xmm1
...
0x000000000068cfe6 <+134>: paddd 0x670e2(%rip),%xmm0 # 0x6f40d0
0x000000000068cfee <+142>: sha256rnds2 %xmm0,0x2f0(%rsp),%xmm2
0x000000000068cff7 <+151>: pshufd $0xe,%xmm0,%xmm1
0x000000000068cffc <+156>: movdqa %xmm1,%xmm0
0x000000000068d000 <+160>: movaps %xmm2,0x2e0(%rsp)
0x000000000068d008 <+168>: sha256rnds2 %xmm0,0x2e0(%rsp),%xmm3
0x000000000068d011 <+177>: movdqu 0x10(%rsi),%xmm5
0x000000000068d016 <+182>: pshufb %xmm9,%xmm5
0x000000000068d01c <+188>: movaps %xmm3,0x2d0(%rsp)
0x000000000068d024 <+196>: movdqa %xmm5,%xmm0
0x000000000068d028 <+200>: paddd 0x670b0(%rip),%xmm0 # 0x6f40e0
0x000000000068d030 <+208>: sha256rnds2 %xmm0,0x2d0(%rsp),%xmm2
...