与 C 代码相比,NEON 内在函数(intrinsic)代码没有提升性能
Neon intrinsic code not boosting performance compared to C code
我有一段简单的 C 代码,它把两个不同指针所指向的 'num' 个值逐个相减,并把结果写到第三个指针。我尝试用 NEON 内在函数重写同样的代码来提高性能,但没有看到代码执行时间有任何减少。我正在使用 ARM Cortex-A9 处理器。
下面是我的 C 代码:
/*
 * code_c - plain C reference: out[i] = in1[i] - in2[i] for i in [0, num).
 *
 * Each difference is truncated to 8 bits when stored, so it wraps modulo 256.
 * Elements are processed in ascending order, one byte at a time.
 * Always returns 0.
 */
int code_c(uint8_t *in1, uint8_t *in2, uint8_t *out, uint32_t num)
{
    const uint8_t *lhs = in1;
    const uint8_t *rhs = in2;
    uint8_t *dst = out;
    uint32_t remaining;

    for (remaining = num; remaining != 0; --remaining) {
        *dst++ = *lhs++ - *rhs++;
    }
    return 0;
}
对应的 NEON 内在函数代码如下:
#include <stdint.h>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/*
 * code_neon - element-wise byte subtraction: y[i] = in1[i] - in2[i].
 *
 * Each difference wraps modulo 256 (vsub_u8 is the non-saturating subtract).
 * When the compiler targets NEON, full groups of 8 bytes are handled with
 * 64-bit vectors. The remaining bytes (num % 8 of them, or all of them on a
 * target without NEON) are handled by a scalar loop, so every one of the
 * `num` outputs is written even when num is not a multiple of 8.
 *
 * in1, in2: input arrays holding at least num bytes each.
 * y:        output array of at least num bytes; the __restrict qualifiers
 *           promise that it does not overlap in1 or in2.
 * num:      number of elements to process (0 is allowed).
 *
 * Always returns 0.
 */
int code_neon(uint8_t * __restrict in1, uint8_t * __restrict in2, uint8_t * __restrict y, uint32_t num)
{
    uint32_t i;
    uint32_t tail = num;   /* bytes left for the scalar loop */

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    uint8x8_t s1, s2;
    uint8x8_t out;

    /* Vector part: 8 bytes per iteration. */
    for (i = num / 8; i != 0; i--) {
        s1 = vld1_u8(in1);
        s2 = vld1_u8(in2);
        out = vsub_u8(s1, s2);
        vst1_u8(y, out);
        in1 += 8; in2 += 8; y += 8;
        /* Hint only: asks the core to start fetching upcoming input. */
        __builtin_prefetch(in1 + 8);
        __builtin_prefetch(in2 + 8);
    }
    tail = num % 8;
#endif

    /* Scalar part: the pointers already point past the vectorized bytes. */
    for (i = 0; i < tail; i++) {
        y[i] = (uint8_t)(in1[i] - in2[i]);
    }
    return 0;
}
这里出了什么问题?
为 Neon 生成的汇编代码:
00000000 <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)>:
0: e92d4008 push {r3, lr}
4: e52de004 push {lr} ; (str lr, [sp, #-4]!)
8: ebfffffe bl 0 <__gnu_mcount_nc>
8: R_ARM_CALL __gnu_mcount_nc
c: e1b031a3 lsrs r3, r3, #3
10: 0a00000d beq 4c <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x4c>
14: e280e008 add lr, r0, #8
18: e281c008 add ip, r1, #8
1c: f460070f vld1.8 {d16}, [r0]
20: e2533001 subs r3, r3, #1
24: e1a0000e mov r0, lr
28: e28ee008 add lr, lr, #8
2c: f461170f vld1.8 {d17}, [r1]
30: e1a0100c mov r1, ip
34: e28cc008 add ip, ip, #8
38: f5def000 pld [lr]
3c: f34008a1 vsub.i8 d16, d16, d17
40: f5dcf000 pld [ip]
44: f442070d vst1.8 {d16}, [r2]!
48: 1afffff3 bne 1c <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1c>
4c: e3a00000 mov r0, #0
50: e8bd8008 pop {r3, pc}
C 的汇编代码:
00000000 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)>:
0: e92d43f8 push {r3, r4, r5, r6, r7, r8, r9, lr}
4: e52de004 push {lr} ; (str lr, [sp, #-4]!)
8: ebfffffe bl 0 <__gnu_mcount_nc>
8: R_ARM_CALL __gnu_mcount_nc
c: e3530000 cmp r3, #0
10: 0a0000f1 beq 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
14: e282c010 add ip, r2, #16
18: e280e010 add lr, r0, #16
1c: e152000e cmp r2, lr
20: 3150000c cmpcc r0, ip
24: e2814010 add r4, r1, #16
28: 23a0e001 movcs lr, #1
2c: 33a0e000 movcc lr, #0
30: e1520004 cmp r2, r4
34: 3151000c cmpcc r1, ip
38: 23a0c001 movcs ip, #1
3c: 33a0c000 movcc ip, #0
40: e00cc00e and ip, ip, lr
44: e3530013 cmp r3, #19
48: 93a0c000 movls ip, #0
4c: 820cc001 andhi ip, ip, #1
50: e35c0000 cmp ip, #0
54: 0a0000e2 beq 3e4 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3e4>
58: e200c007 and ip, r0, #7
5c: e26cc000 rsb ip, ip, #0
60: e20cc00f and ip, ip, #15
64: e15c0003 cmp ip, r3
68: 21a0c003 movcs ip, r3
6c: e35c0000 cmp ip, #0
70: 0a000059 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
74: e5d0e000 ldrb lr, [r0]
78: e35c0001 cmp ip, #1
7c: e5d14000 ldrb r4, [r1]
80: e064e00e rsb lr, r4, lr
84: e5c2e000 strb lr, [r2]
88: 0a000053 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
8c: e5d0e001 ldrb lr, [r0, #1]
90: e35c0002 cmp ip, #2
94: e5d14001 ldrb r4, [r1, #1]
98: e064e00e rsb lr, r4, lr
9c: e5c2e001 strb lr, [r2, #1]
a0: 0a00004d beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
a4: e5d0e002 ldrb lr, [r0, #2]
a8: e35c0003 cmp ip, #3
ac: e5d14002 ldrb r4, [r1, #2]
b0: e064e00e rsb lr, r4, lr
b4: e5c2e002 strb lr, [r2, #2]
b8: 0a000047 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
bc: e5d0e003 ldrb lr, [r0, #3]
c0: e35c0004 cmp ip, #4
c4: e5d14003 ldrb r4, [r1, #3]
c8: e064e00e rsb lr, r4, lr
cc: e5c2e003 strb lr, [r2, #3]
d0: 0a000041 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
d4: e5d0e004 ldrb lr, [r0, #4]
d8: e35c0005 cmp ip, #5
dc: e5d14004 ldrb r4, [r1, #4]
e0: e064e00e rsb lr, r4, lr
e4: e5c2e004 strb lr, [r2, #4]
e8: 0a00003b beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
ec: e5d0e005 ldrb lr, [r0, #5]
f0: e35c0006 cmp ip, #6
f4: e5d14005 ldrb r4, [r1, #5]
f8: e064e00e rsb lr, r4, lr
fc: e5c2e005 strb lr, [r2, #5]
100: 0a000035 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
104: e5d0e006 ldrb lr, [r0, #6]
108: e35c0007 cmp ip, #7
10c: e5d14006 ldrb r4, [r1, #6]
110: e064e00e rsb lr, r4, lr
114: e5c2e006 strb lr, [r2, #6]
118: 0a0000be beq 418 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x418>
11c: e5d0e007 ldrb lr, [r0, #7]
120: e35c0008 cmp ip, #8
124: e5d14007 ldrb r4, [r1, #7]
128: e064e00e rsb lr, r4, lr
12c: e5c2e007 strb lr, [r2, #7]
130: 0a000029 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
134: e5d0e008 ldrb lr, [r0, #8]
138: e35c0009 cmp ip, #9
13c: e5d14008 ldrb r4, [r1, #8]
140: e064e00e rsb lr, r4, lr
144: e5c2e008 strb lr, [r2, #8]
148: 0a000023 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
14c: e5d0e009 ldrb lr, [r0, #9]
150: e35c000a cmp ip, #10
154: e5d14009 ldrb r4, [r1, #9]
158: e064e00e rsb lr, r4, lr
15c: e5c2e009 strb lr, [r2, #9]
160: 0a00001d beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
164: e5d0e00a ldrb lr, [r0, #10]
168: e35c000b cmp ip, #11
16c: e5d1400a ldrb r4, [r1, #10]
170: e064e00e rsb lr, r4, lr
174: e5c2e00a strb lr, [r2, #10]
178: 0a000017 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
17c: e5d0e00b ldrb lr, [r0, #11]
180: e35c000c cmp ip, #12
184: e5d1400b ldrb r4, [r1, #11]
188: e064e00e rsb lr, r4, lr
18c: e5c2e00b strb lr, [r2, #11]
190: 0a000011 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
194: e5d0e00c ldrb lr, [r0, #12]
198: e35c000d cmp ip, #13
19c: e5d1400c ldrb r4, [r1, #12]
1a0: e064e00e rsb lr, r4, lr
1a4: e5c2e00c strb lr, [r2, #12]
1a8: 0a00000b beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
1ac: e5d0e00d ldrb lr, [r0, #13]
1b0: e35c000f cmp ip, #15
1b4: e5d1400d ldrb r4, [r1, #13]
1b8: e064e00e rsb lr, r4, lr
1bc: e5c2e00d strb lr, [r2, #13]
1c0: 1a000092 bne 410 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x410>
1c4: e5d0400e ldrb r4, [r0, #14]
1c8: e1a0e00c mov lr, ip
1cc: e5d1500e ldrb r5, [r1, #14]
1d0: e0654004 rsb r4, r5, r4
1d4: e5c2400e strb r4, [r2, #14]
1d8: ea000000 b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
1dc: e1a0e00c mov lr, ip
1e0: e06c6003 rsb r6, ip, r3
1e4: e2435001 sub r5, r3, #1
1e8: e2464010 sub r4, r6, #16
1ec: e06c5005 rsb r5, ip, r5
1f0: e1a04224 lsr r4, r4, #4
1f4: e355000e cmp r5, #14
1f8: e2844001 add r4, r4, #1
1fc: e1a05204 lsl r5, r4, #4
200: 9a000010 bls 248 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x248>
204: e080900c add r9, r0, ip
208: e081800c add r8, r1, ip
20c: e3a07000 mov r7, #0
210: e082c00c add ip, r2, ip
214: f4690adf vld1.64 {d16-d17}, [r9 :64]
218: e2877001 add r7, r7, #1
21c: e1570004 cmp r7, r4
220: e2899010 add r9, r9, #16
224: f4682a0f vld1.8 {d18-d19}, [r8]
228: e2888010 add r8, r8, #16
22c: f34008e2 vsub.i8 q8, q8, q9
230: f44c0a0f vst1.8 {d16-d17}, [ip]
234: e28cc010 add ip, ip, #16
238: 3afffff5 bcc 214 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x214>
23c: e1560005 cmp r6, r5
240: e08ee005 add lr, lr, r5
244: 0a000064 beq 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
248: e7d0c00e ldrb ip, [r0, lr]
24c: e28e4001 add r4, lr, #1
250: e7d1500e ldrb r5, [r1, lr]
254: e1530004 cmp r3, r4
258: e065c00c rsb ip, r5, ip
25c: e7c2c00e strb ip, [r2, lr]
260: 9a00005d bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
264: e7d05004 ldrb r5, [r0, r4]
268: e28ec002 add ip, lr, #2
26c: e7d16004 ldrb r6, [r1, r4]
270: e153000c cmp r3, ip
274: e0665005 rsb r5, r6, r5
278: e7c25004 strb r5, [r2, r4]
27c: 9a000056 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
280: e7d0500c ldrb r5, [r0, ip]
284: e28e4003 add r4, lr, #3
288: e7d1600c ldrb r6, [r1, ip]
28c: e1530004 cmp r3, r4
290: e0665005 rsb r5, r6, r5
294: e7c2500c strb r5, [r2, ip]
298: 9a00004f bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
29c: e7d05004 ldrb r5, [r0, r4]
2a0: e28ec004 add ip, lr, #4
2a4: e7d16004 ldrb r6, [r1, r4]
2a8: e153000c cmp r3, ip
2ac: e0665005 rsb r5, r6, r5
2b0: e7c25004 strb r5, [r2, r4]
2b4: 9a000048 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2b8: e7d0500c ldrb r5, [r0, ip]
2bc: e28e4005 add r4, lr, #5
2c0: e7d1600c ldrb r6, [r1, ip]
2c4: e1530004 cmp r3, r4
2c8: e0665005 rsb r5, r6, r5
2cc: e7c2500c strb r5, [r2, ip]
2d0: 9a000041 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2d4: e7d05004 ldrb r5, [r0, r4]
2d8: e28ec006 add ip, lr, #6
2dc: e7d16004 ldrb r6, [r1, r4]
2e0: e153000c cmp r3, ip
2e4: e0665005 rsb r5, r6, r5
2e8: e7c25004 strb r5, [r2, r4]
2ec: 9a00003a bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2f0: e7d0500c ldrb r5, [r0, ip]
2f4: e28e4007 add r4, lr, #7
2f8: e7d1600c ldrb r6, [r1, ip]
2fc: e1530004 cmp r3, r4
300: e0665005 rsb r5, r6, r5
304: e7c2500c strb r5, [r2, ip]
308: 9a000033 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
30c: e7d05004 ldrb r5, [r0, r4]
310: e28ec008 add ip, lr, #8
314: e7d16004 ldrb r6, [r1, r4]
318: e153000c cmp r3, ip
31c: e0665005 rsb r5, r6, r5
320: e7c25004 strb r5, [r2, r4]
324: 9a00002c bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
328: e7d0500c ldrb r5, [r0, ip]
32c: e28e4009 add r4, lr, #9
330: e7d1600c ldrb r6, [r1, ip]
334: e1530004 cmp r3, r4
338: e0665005 rsb r5, r6, r5
33c: e7c2500c strb r5, [r2, ip]
340: 9a000025 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
344: e7d05004 ldrb r5, [r0, r4]
348: e28ec00a add ip, lr, #10
34c: e7d16004 ldrb r6, [r1, r4]
350: e153000c cmp r3, ip
354: e0665005 rsb r5, r6, r5
358: e7c25004 strb r5, [r2, r4]
35c: 9a00001e bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
360: e7d0500c ldrb r5, [r0, ip]
364: e28e400b add r4, lr, #11
368: e7d1600c ldrb r6, [r1, ip]
36c: e1530004 cmp r3, r4
370: e0665005 rsb r5, r6, r5
374: e7c2500c strb r5, [r2, ip]
378: 9a000017 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
37c: e7d05004 ldrb r5, [r0, r4]
380: e28ec00c add ip, lr, #12
384: e7d16004 ldrb r6, [r1, r4]
388: e153000c cmp r3, ip
38c: e0665005 rsb r5, r6, r5
390: e7c25004 strb r5, [r2, r4]
394: 9a000010 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
398: e7d0500c ldrb r5, [r0, ip]
39c: e28e400d add r4, lr, #13
3a0: e7d1600c ldrb r6, [r1, ip]
3a4: e1530004 cmp r3, r4
3a8: e0665005 rsb r5, r6, r5
3ac: e7c2500c strb r5, [r2, ip]
3b0: 9a000009 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
3b4: e7d05004 ldrb r5, [r0, r4]
3b8: e28ec00e add ip, lr, #14
3bc: e7d1e004 ldrb lr, [r1, r4]
3c0: e153000c cmp r3, ip
3c4: e06e3005 rsb r3, lr, r5
3c8: e7c23004 strb r3, [r2, r4]
3cc: 87d0300c ldrbhi r3, [r0, ip]
3d0: 87d1100c ldrbhi r1, [r1, ip]
3d4: 80613003 rsbhi r3, r1, r3
3d8: 87c2300c strbhi r3, [r2, ip]
3dc: e3a00000 mov r0, #0
3e0: e8bd83f8 pop {r3, r4, r5, r6, r7, r8, r9, pc}
3e4: e2411001 sub r1, r1, #1
3e8: e0803003 add r3, r0, r3
3ec: e2422001 sub r2, r2, #1
3f0: e4d0c001 ldrb ip, [r0], #1
3f4: e5f1e001 ldrb lr, [r1, #1]!
3f8: e1500003 cmp r0, r3
3fc: e06ec00c rsb ip, lr, ip
400: e5e2c001 strb ip, [r2, #1]!
404: 1afffff9 bne 3f0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3f0>
408: e3a00000 mov r0, #0
40c: e8bd83f8 pop {r3, r4, r5, r6, r7, r8, r9, pc}
410: e3a0e00e mov lr, #14
414: eaffff71 b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
418: e3a0e007 mov lr, #7
41c: eaffff6f b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
我猜你的 C 编译器,更具体地说是它的 ARM 后端,知道一些关于你正在编译的 ARM 架构的事情。你的例子非常简单和普遍,可能正是针对这个问题进行了优化:)
手动优化代码最适用于奇怪且不常见的情况:在这些情况下编译器会直接放弃,只把代码 1 : 1 地直接翻译成不展开循环的线性代码,这样的代码当然可以通过手动优化获得很大的改进 :)
编译器写了这个(埋在很多设置代码中以处理边缘情况):
214: f4690adf vld1.64 {d16-d17}, [r9 :64]
218: e2877001 add r7, r7, #1
21c: e1570004 cmp r7, r4
220: e2899010 add r9, r9, #16
224: f4682a0f vld1.8 {d18-d19}, [r8]
228: e2888010 add r8, r8, #16
22c: f34008e2 vsub.i8 q8, q8, q9
230: f44c0a0f vst1.8 {d16-d17}, [ip]
234: e28cc010 add ip, ip, #16
238: 3afffff5 bcc 214 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x214>
我的 NEON 知识已经很生疏,我不打算在这里逐行解读(我会弄错;希望有人能提供更完整的答案),但这段代码是一次加载 16 个字节到两个 128 位寄存器中,并行地对全部 16 个字节做减法,然后把它们一起写回目标。所以这正是您想要的矢量化。您的 NEON 代码可能比编译器生成的代码稍快一些,至少部分原因是您没有检查 n 不能被 8 整除的边缘情况,而编译器会检查。
在大多数情况下,内在函数不会有帮助。如果你想打败编译器,你将需要自己处理整个管道,而内在函数的功能还不够强大。您需要能够选择您的寄存器,决定何时读取和写入内存,并非常仔细地管理您的数据布局,甚至开始击败编译器(因为它已经完成了所有这些)。
为什么即使写的内容基本相同,编译器生成的并行代码通常也比手写的要好?那么,您是如何处理流水线停顿(stall)的?汇编指令并不是串行执行的;其中许多是并行执行的。通常当您执行一条开销较大的指令后,可能要过几个时钟周期才能读取其结果。如果您提前去读,处理器就必须停下来等待。为了避免这个问题,您经常要以非常奇怪的顺序编写汇编,例如 "start computation, load next data, write result of computation."(开始计算,加载下一批数据,写出计算结果)。这用内在函数真的很难做到。
对于您在@yeoman 的回答中的一些评论:
- 执行时间不是取决于生成的汇编指令的数量吗?
绝对不是。执行时间取决于执行的汇编指令的数量以及这些指令是什么以及它们的执行顺序。非常非常经常(几乎总是)更快的代码是汇编中更长的代码。 (当然不能保证反过来……)最著名的例子是循环展开。连续 3 次剪切和粘贴操作将比计数为 3 的循环更快。单独避免分支将是巨大的。因此编译器在预先知道迭代次数时会自动展开小循环。
- 与 C 上的单个操作相比,Neon 上应该有 8 个并行操作。
应该有 8 个并行操作,编译器会生成它。但是您的代码没有;它一次做一个。
仅仅使用 NEON 并不会神奇地让它运行得更快;编译器已经使用了 NEON。
对于稍微不同的问题(在 iOS 中讨论 Accelerate 框架),但仍然解决相同的基本问题,请参阅 Introduction to Fast Bezier。
还要重申@yeoman 的观点:如果一些非常简单和机械的更改可以使您的 C 代码运行得更快,编译器会为您完成(并且确实如此)。
我有一段简单的 C 代码,它把两个不同指针所指向的 'num' 个值逐个相减,并把结果写到第三个指针。我尝试用 NEON 内在函数重写同样的代码来提高性能,但没有看到代码执行时间有任何减少。我正在使用 ARM Cortex-A9 处理器。
下面是我的 C 代码:
/*
 * code_c - plain C reference: out[i] = in1[i] - in2[i] for i in [0, num).
 *
 * Each difference is truncated to 8 bits when stored, so it wraps modulo 256.
 * Elements are processed in ascending order, one byte at a time.
 * Always returns 0.
 */
int code_c(uint8_t *in1, uint8_t *in2, uint8_t *out, uint32_t num)
{
    const uint8_t *lhs = in1;
    const uint8_t *rhs = in2;
    uint8_t *dst = out;
    uint32_t remaining;

    for (remaining = num; remaining != 0; --remaining) {
        *dst++ = *lhs++ - *rhs++;
    }
    return 0;
}
对应的 NEON 内在函数代码如下:
#include <stdint.h>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/*
 * code_neon - element-wise byte subtraction: y[i] = in1[i] - in2[i].
 *
 * Each difference wraps modulo 256 (vsub_u8 is the non-saturating subtract).
 * When the compiler targets NEON, full groups of 8 bytes are handled with
 * 64-bit vectors. The remaining bytes (num % 8 of them, or all of them on a
 * target without NEON) are handled by a scalar loop, so every one of the
 * `num` outputs is written even when num is not a multiple of 8.
 *
 * in1, in2: input arrays holding at least num bytes each.
 * y:        output array of at least num bytes; the __restrict qualifiers
 *           promise that it does not overlap in1 or in2.
 * num:      number of elements to process (0 is allowed).
 *
 * Always returns 0.
 */
int code_neon(uint8_t * __restrict in1, uint8_t * __restrict in2, uint8_t * __restrict y, uint32_t num)
{
    uint32_t i;
    uint32_t tail = num;   /* bytes left for the scalar loop */

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    uint8x8_t s1, s2;
    uint8x8_t out;

    /* Vector part: 8 bytes per iteration. */
    for (i = num / 8; i != 0; i--) {
        s1 = vld1_u8(in1);
        s2 = vld1_u8(in2);
        out = vsub_u8(s1, s2);
        vst1_u8(y, out);
        in1 += 8; in2 += 8; y += 8;
        /* Hint only: asks the core to start fetching upcoming input. */
        __builtin_prefetch(in1 + 8);
        __builtin_prefetch(in2 + 8);
    }
    tail = num % 8;
#endif

    /* Scalar part: the pointers already point past the vectorized bytes. */
    for (i = 0; i < tail; i++) {
        y[i] = (uint8_t)(in1[i] - in2[i]);
    }
    return 0;
}
这里出了什么问题?
为 Neon 生成的汇编代码:
00000000 <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)>:
0: e92d4008 push {r3, lr}
4: e52de004 push {lr} ; (str lr, [sp, #-4]!)
8: ebfffffe bl 0 <__gnu_mcount_nc>
8: R_ARM_CALL __gnu_mcount_nc
c: e1b031a3 lsrs r3, r3, #3
10: 0a00000d beq 4c <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x4c>
14: e280e008 add lr, r0, #8
18: e281c008 add ip, r1, #8
1c: f460070f vld1.8 {d16}, [r0]
20: e2533001 subs r3, r3, #1
24: e1a0000e mov r0, lr
28: e28ee008 add lr, lr, #8
2c: f461170f vld1.8 {d17}, [r1]
30: e1a0100c mov r1, ip
34: e28cc008 add ip, ip, #8
38: f5def000 pld [lr]
3c: f34008a1 vsub.i8 d16, d16, d17
40: f5dcf000 pld [ip]
44: f442070d vst1.8 {d16}, [r2]!
48: 1afffff3 bne 1c <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1c>
4c: e3a00000 mov r0, #0
50: e8bd8008 pop {r3, pc}
C 的汇编代码:
00000000 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)>:
0: e92d43f8 push {r3, r4, r5, r6, r7, r8, r9, lr}
4: e52de004 push {lr} ; (str lr, [sp, #-4]!)
8: ebfffffe bl 0 <__gnu_mcount_nc>
8: R_ARM_CALL __gnu_mcount_nc
c: e3530000 cmp r3, #0
10: 0a0000f1 beq 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
14: e282c010 add ip, r2, #16
18: e280e010 add lr, r0, #16
1c: e152000e cmp r2, lr
20: 3150000c cmpcc r0, ip
24: e2814010 add r4, r1, #16
28: 23a0e001 movcs lr, #1
2c: 33a0e000 movcc lr, #0
30: e1520004 cmp r2, r4
34: 3151000c cmpcc r1, ip
38: 23a0c001 movcs ip, #1
3c: 33a0c000 movcc ip, #0
40: e00cc00e and ip, ip, lr
44: e3530013 cmp r3, #19
48: 93a0c000 movls ip, #0
4c: 820cc001 andhi ip, ip, #1
50: e35c0000 cmp ip, #0
54: 0a0000e2 beq 3e4 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3e4>
58: e200c007 and ip, r0, #7
5c: e26cc000 rsb ip, ip, #0
60: e20cc00f and ip, ip, #15
64: e15c0003 cmp ip, r3
68: 21a0c003 movcs ip, r3
6c: e35c0000 cmp ip, #0
70: 0a000059 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
74: e5d0e000 ldrb lr, [r0]
78: e35c0001 cmp ip, #1
7c: e5d14000 ldrb r4, [r1]
80: e064e00e rsb lr, r4, lr
84: e5c2e000 strb lr, [r2]
88: 0a000053 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
8c: e5d0e001 ldrb lr, [r0, #1]
90: e35c0002 cmp ip, #2
94: e5d14001 ldrb r4, [r1, #1]
98: e064e00e rsb lr, r4, lr
9c: e5c2e001 strb lr, [r2, #1]
a0: 0a00004d beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
a4: e5d0e002 ldrb lr, [r0, #2]
a8: e35c0003 cmp ip, #3
ac: e5d14002 ldrb r4, [r1, #2]
b0: e064e00e rsb lr, r4, lr
b4: e5c2e002 strb lr, [r2, #2]
b8: 0a000047 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
bc: e5d0e003 ldrb lr, [r0, #3]
c0: e35c0004 cmp ip, #4
c4: e5d14003 ldrb r4, [r1, #3]
c8: e064e00e rsb lr, r4, lr
cc: e5c2e003 strb lr, [r2, #3]
d0: 0a000041 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
d4: e5d0e004 ldrb lr, [r0, #4]
d8: e35c0005 cmp ip, #5
dc: e5d14004 ldrb r4, [r1, #4]
e0: e064e00e rsb lr, r4, lr
e4: e5c2e004 strb lr, [r2, #4]
e8: 0a00003b beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
ec: e5d0e005 ldrb lr, [r0, #5]
f0: e35c0006 cmp ip, #6
f4: e5d14005 ldrb r4, [r1, #5]
f8: e064e00e rsb lr, r4, lr
fc: e5c2e005 strb lr, [r2, #5]
100: 0a000035 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
104: e5d0e006 ldrb lr, [r0, #6]
108: e35c0007 cmp ip, #7
10c: e5d14006 ldrb r4, [r1, #6]
110: e064e00e rsb lr, r4, lr
114: e5c2e006 strb lr, [r2, #6]
118: 0a0000be beq 418 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x418>
11c: e5d0e007 ldrb lr, [r0, #7]
120: e35c0008 cmp ip, #8
124: e5d14007 ldrb r4, [r1, #7]
128: e064e00e rsb lr, r4, lr
12c: e5c2e007 strb lr, [r2, #7]
130: 0a000029 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
134: e5d0e008 ldrb lr, [r0, #8]
138: e35c0009 cmp ip, #9
13c: e5d14008 ldrb r4, [r1, #8]
140: e064e00e rsb lr, r4, lr
144: e5c2e008 strb lr, [r2, #8]
148: 0a000023 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
14c: e5d0e009 ldrb lr, [r0, #9]
150: e35c000a cmp ip, #10
154: e5d14009 ldrb r4, [r1, #9]
158: e064e00e rsb lr, r4, lr
15c: e5c2e009 strb lr, [r2, #9]
160: 0a00001d beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
164: e5d0e00a ldrb lr, [r0, #10]
168: e35c000b cmp ip, #11
16c: e5d1400a ldrb r4, [r1, #10]
170: e064e00e rsb lr, r4, lr
174: e5c2e00a strb lr, [r2, #10]
178: 0a000017 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
17c: e5d0e00b ldrb lr, [r0, #11]
180: e35c000c cmp ip, #12
184: e5d1400b ldrb r4, [r1, #11]
188: e064e00e rsb lr, r4, lr
18c: e5c2e00b strb lr, [r2, #11]
190: 0a000011 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
194: e5d0e00c ldrb lr, [r0, #12]
198: e35c000d cmp ip, #13
19c: e5d1400c ldrb r4, [r1, #12]
1a0: e064e00e rsb lr, r4, lr
1a4: e5c2e00c strb lr, [r2, #12]
1a8: 0a00000b beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
1ac: e5d0e00d ldrb lr, [r0, #13]
1b0: e35c000f cmp ip, #15
1b4: e5d1400d ldrb r4, [r1, #13]
1b8: e064e00e rsb lr, r4, lr
1bc: e5c2e00d strb lr, [r2, #13]
1c0: 1a000092 bne 410 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x410>
1c4: e5d0400e ldrb r4, [r0, #14]
1c8: e1a0e00c mov lr, ip
1cc: e5d1500e ldrb r5, [r1, #14]
1d0: e0654004 rsb r4, r5, r4
1d4: e5c2400e strb r4, [r2, #14]
1d8: ea000000 b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
1dc: e1a0e00c mov lr, ip
1e0: e06c6003 rsb r6, ip, r3
1e4: e2435001 sub r5, r3, #1
1e8: e2464010 sub r4, r6, #16
1ec: e06c5005 rsb r5, ip, r5
1f0: e1a04224 lsr r4, r4, #4
1f4: e355000e cmp r5, #14
1f8: e2844001 add r4, r4, #1
1fc: e1a05204 lsl r5, r4, #4
200: 9a000010 bls 248 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x248>
204: e080900c add r9, r0, ip
208: e081800c add r8, r1, ip
20c: e3a07000 mov r7, #0
210: e082c00c add ip, r2, ip
214: f4690adf vld1.64 {d16-d17}, [r9 :64]
218: e2877001 add r7, r7, #1
21c: e1570004 cmp r7, r4
220: e2899010 add r9, r9, #16
224: f4682a0f vld1.8 {d18-d19}, [r8]
228: e2888010 add r8, r8, #16
22c: f34008e2 vsub.i8 q8, q8, q9
230: f44c0a0f vst1.8 {d16-d17}, [ip]
234: e28cc010 add ip, ip, #16
238: 3afffff5 bcc 214 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x214>
23c: e1560005 cmp r6, r5
240: e08ee005 add lr, lr, r5
244: 0a000064 beq 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
248: e7d0c00e ldrb ip, [r0, lr]
24c: e28e4001 add r4, lr, #1
250: e7d1500e ldrb r5, [r1, lr]
254: e1530004 cmp r3, r4
258: e065c00c rsb ip, r5, ip
25c: e7c2c00e strb ip, [r2, lr]
260: 9a00005d bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
264: e7d05004 ldrb r5, [r0, r4]
268: e28ec002 add ip, lr, #2
26c: e7d16004 ldrb r6, [r1, r4]
270: e153000c cmp r3, ip
274: e0665005 rsb r5, r6, r5
278: e7c25004 strb r5, [r2, r4]
27c: 9a000056 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
280: e7d0500c ldrb r5, [r0, ip]
284: e28e4003 add r4, lr, #3
288: e7d1600c ldrb r6, [r1, ip]
28c: e1530004 cmp r3, r4
290: e0665005 rsb r5, r6, r5
294: e7c2500c strb r5, [r2, ip]
298: 9a00004f bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
29c: e7d05004 ldrb r5, [r0, r4]
2a0: e28ec004 add ip, lr, #4
2a4: e7d16004 ldrb r6, [r1, r4]
2a8: e153000c cmp r3, ip
2ac: e0665005 rsb r5, r6, r5
2b0: e7c25004 strb r5, [r2, r4]
2b4: 9a000048 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2b8: e7d0500c ldrb r5, [r0, ip]
2bc: e28e4005 add r4, lr, #5
2c0: e7d1600c ldrb r6, [r1, ip]
2c4: e1530004 cmp r3, r4
2c8: e0665005 rsb r5, r6, r5
2cc: e7c2500c strb r5, [r2, ip]
2d0: 9a000041 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2d4: e7d05004 ldrb r5, [r0, r4]
2d8: e28ec006 add ip, lr, #6
2dc: e7d16004 ldrb r6, [r1, r4]
2e0: e153000c cmp r3, ip
2e4: e0665005 rsb r5, r6, r5
2e8: e7c25004 strb r5, [r2, r4]
2ec: 9a00003a bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2f0: e7d0500c ldrb r5, [r0, ip]
2f4: e28e4007 add r4, lr, #7
2f8: e7d1600c ldrb r6, [r1, ip]
2fc: e1530004 cmp r3, r4
300: e0665005 rsb r5, r6, r5
304: e7c2500c strb r5, [r2, ip]
308: 9a000033 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
30c: e7d05004 ldrb r5, [r0, r4]
310: e28ec008 add ip, lr, #8
314: e7d16004 ldrb r6, [r1, r4]
318: e153000c cmp r3, ip
31c: e0665005 rsb r5, r6, r5
320: e7c25004 strb r5, [r2, r4]
324: 9a00002c bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
328: e7d0500c ldrb r5, [r0, ip]
32c: e28e4009 add r4, lr, #9
330: e7d1600c ldrb r6, [r1, ip]
334: e1530004 cmp r3, r4
338: e0665005 rsb r5, r6, r5
33c: e7c2500c strb r5, [r2, ip]
340: 9a000025 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
344: e7d05004 ldrb r5, [r0, r4]
348: e28ec00a add ip, lr, #10
34c: e7d16004 ldrb r6, [r1, r4]
350: e153000c cmp r3, ip
354: e0665005 rsb r5, r6, r5
358: e7c25004 strb r5, [r2, r4]
35c: 9a00001e bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
360: e7d0500c ldrb r5, [r0, ip]
364: e28e400b add r4, lr, #11
368: e7d1600c ldrb r6, [r1, ip]
36c: e1530004 cmp r3, r4
370: e0665005 rsb r5, r6, r5
374: e7c2500c strb r5, [r2, ip]
378: 9a000017 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
37c: e7d05004 ldrb r5, [r0, r4]
380: e28ec00c add ip, lr, #12
384: e7d16004 ldrb r6, [r1, r4]
388: e153000c cmp r3, ip
38c: e0665005 rsb r5, r6, r5
390: e7c25004 strb r5, [r2, r4]
394: 9a000010 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
398: e7d0500c ldrb r5, [r0, ip]
39c: e28e400d add r4, lr, #13
3a0: e7d1600c ldrb r6, [r1, ip]
3a4: e1530004 cmp r3, r4
3a8: e0665005 rsb r5, r6, r5
3ac: e7c2500c strb r5, [r2, ip]
3b0: 9a000009 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
3b4: e7d05004 ldrb r5, [r0, r4]
3b8: e28ec00e add ip, lr, #14
3bc: e7d1e004 ldrb lr, [r1, r4]
3c0: e153000c cmp r3, ip
3c4: e06e3005 rsb r3, lr, r5
3c8: e7c23004 strb r3, [r2, r4]
3cc: 87d0300c ldrbhi r3, [r0, ip]
3d0: 87d1100c ldrbhi r1, [r1, ip]
3d4: 80613003 rsbhi r3, r1, r3
3d8: 87c2300c strbhi r3, [r2, ip]
3dc: e3a00000 mov r0, #0
3e0: e8bd83f8 pop {r3, r4, r5, r6, r7, r8, r9, pc}
3e4: e2411001 sub r1, r1, #1
3e8: e0803003 add r3, r0, r3
3ec: e2422001 sub r2, r2, #1
3f0: e4d0c001 ldrb ip, [r0], #1
3f4: e5f1e001 ldrb lr, [r1, #1]!
3f8: e1500003 cmp r0, r3
3fc: e06ec00c rsb ip, lr, ip
400: e5e2c001 strb ip, [r2, #1]!
404: 1afffff9 bne 3f0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3f0>
408: e3a00000 mov r0, #0
40c: e8bd83f8 pop {r3, r4, r5, r6, r7, r8, r9, pc}
410: e3a0e00e mov lr, #14
414: eaffff71 b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
418: e3a0e007 mov lr, #7
41c: eaffff6f b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
我猜你的 C 编译器,更具体地说是它的 ARM 后端,知道一些关于你正在编译的 ARM 架构的事情。你的例子非常简单和普遍,可能正是针对这个问题进行了优化:)
手动优化代码最适用于奇怪且不常见的情况:在这些情况下编译器会直接放弃,只把代码 1 : 1 地直接翻译成不展开循环的线性代码,这样的代码当然可以通过手动优化获得很大的改进 :)
编译器写了这个(埋在很多设置代码中以处理边缘情况):
214: f4690adf vld1.64 {d16-d17}, [r9 :64]
218: e2877001 add r7, r7, #1
21c: e1570004 cmp r7, r4
220: e2899010 add r9, r9, #16
224: f4682a0f vld1.8 {d18-d19}, [r8]
228: e2888010 add r8, r8, #16
22c: f34008e2 vsub.i8 q8, q8, q9
230: f44c0a0f vst1.8 {d16-d17}, [ip]
234: e28cc010 add ip, ip, #16
238: 3afffff5 bcc 214 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x214>
我的 NEON 知识已经很生疏,我不打算在这里逐行解读(我会弄错;希望有人能提供更完整的答案),但这段代码是一次加载 16 个字节到两个 128 位寄存器中,并行地对全部 16 个字节做减法,然后把它们一起写回目标。所以这正是您想要的矢量化。您的 NEON 代码可能比编译器生成的代码稍快一些,至少部分原因是您没有检查 n 不能被 8 整除的边缘情况,而编译器会检查。
在大多数情况下,内在函数不会有帮助。如果你想打败编译器,你将需要自己处理整个管道,而内在函数的功能还不够强大。您需要能够选择您的寄存器,决定何时读取和写入内存,并非常仔细地管理您的数据布局,甚至开始击败编译器(因为它已经完成了所有这些)。
为什么即使写的内容基本相同,编译器生成的并行代码通常也比手写的要好?那么,您是如何处理流水线停顿(stall)的?汇编指令并不是串行执行的;其中许多是并行执行的。通常当您执行一条开销较大的指令后,可能要过几个时钟周期才能读取其结果。如果您提前去读,处理器就必须停下来等待。为了避免这个问题,您经常要以非常奇怪的顺序编写汇编,例如 "start computation, load next data, write result of computation."(开始计算,加载下一批数据,写出计算结果)。这用内在函数真的很难做到。
对于您在@yeoman 的回答中的一些评论:
- 执行时间不是取决于生成的汇编指令的数量吗?
绝对不是。执行时间取决于执行的汇编指令的数量以及这些指令是什么以及它们的执行顺序。非常非常经常(几乎总是)更快的代码是汇编中更长的代码。 (当然不能保证反过来……)最著名的例子是循环展开。连续 3 次剪切和粘贴操作将比计数为 3 的循环更快。单独避免分支将是巨大的。因此编译器在预先知道迭代次数时会自动展开小循环。
- 与 C 上的单个操作相比,Neon 上应该有 8 个并行操作。
应该有 8 个并行操作,编译器会生成它。但是您的代码没有;它一次做一个。
仅仅使用 NEON 并不会神奇地让它运行得更快;编译器已经使用了 NEON。
对于稍微不同的问题(在 iOS 中讨论 Accelerate 框架),但仍然解决相同的基本问题,请参阅 Introduction to Fast Bezier。
还要重申@yeoman 的观点:如果一些非常简单和机械的更改可以使您的 C 代码运行得更快,编译器会为您完成(并且确实如此)。