NASM - 使用标签作为数组偏移量
NASM - Using Labels as Array offsets
我正在尝试用汇编语言编写一个小程序,它接受三个 char 数组作为输入,计算前两个数组中对应元素的平均值,并将结果存储在第三个数组中,如下所示。
; ---------------------------------------------------------------------
; Code exactly as posted in the question (NASM syntax; the question
; states a Linux x86_64 machine). Only comments have been added.
; ---------------------------------------------------------------------

; prologue: push rbp, set rbp = rsp, then push rbx and r12-r15.
%macro prologue 0
push rbp
mov rbp,rsp
push rbx
push r12
push r13
push r14
push r15
%endmacro
; epilogue: pop r15..r12 and rbx (reverse of prologue), then leave + ret.
%macro epilogue 0
pop r15
pop r14
pop r13
pop r12
pop rbx
leave
ret
%endmacro
segment .data
; Per-element step. NOTE(review): declared as ONE byte (db), but every use
; below is `add r64, [offset]`, which loads 8 bytes starting at this label.
offset db 1
segment .bss
; Spill slots for the four incoming argument registers.
a1 resq 1
a2 resq 1
avg resq 1
; NOTE(review): 4 bytes reserved (resd), but `mov [avgL], rcx` stores 8.
avgL resd 1
segment .text
global avgArray
; avgArray
; Registers as consumed by the code below:
;   rdi = first source byte array   (spilled to a1, reloaded into rsi)
;   rsi = second source byte array  (spilled to a2, reloaded into r9)
;   rdx = destination byte array    (spilled to avg, reloaded into rdi)
;   rcx = element count             (spilled to avgL)
; For each element: dest = (src1 + src2) >> 1, computed in ax.
avgArray:
prologue
; Spill all four arguments to the static slots, then reload them.
mov [a1], rdi
mov [a2], rsi
mov [avg], rdx
mov [avgL], rcx
mov rsi, [a1]
mov r9, [a2]
mov rdi, [avg]
; rcx = rsi + [avgL]: one-past-the-end pointer for the first source array.
mov rcx, rsi
add rcx, [avgL] ; 8-byte load from the 4-byte avgL reservation
; Clear rdx/rax so the 16-bit add below sees zero upper bytes on the first
; pass; rbx is cleared too but is not used anywhere else in this listing.
xor rdx, rdx
xor rax, rax
xor rbx, rbx
; The bound is tested after the body, so the body runs at least once.
avgArray_loop:
mov al, [rsi]
mov dl, [r9]
; 16-bit sum keeps the carry out of bit 7; the shift then halves it.
add ax, dx
shr ax, 1
mov [rdi], al
; Advance all three pointers by the qword read from [offset].
add rsi, [offset]
add r9, [offset]
add rdi, [offset]
; Unsigned compare of the running source pointer against the end pointer.
cmp rsi, rcx
jb avgArray_loop
epilogue
将 [offset] 替换为 1 时效果非常好。但是,当使用 [offset] 确定下一个数组元素时,它似乎不会将其值添加到 rsi、rdi 和 r9。
我已经使用 gdb 检查过了:rsi 中保存的地址在执行 add rsi, [offset] 之后仍然不变。
有人能告诉我为什么使用 [offset] 不起作用,而直接加一个简单的 1 却可以吗?
顺便说一句:这是一台 Linux x86_64 机器。
所以我找到了解决该问题的方法。
avgL 和 offset 在内存中紧挨着存放。把 rcx 的值存入 avgL 时,也覆盖了 offset 的值。将 avgL 声明为 QWORD 而不是 DWORD,可以防止 mov 覆盖 offset 的数据。
新的数据和 bss 段看起来像这样
segment .data
; Pointer step in bytes. Declared as a qword because it is consumed with
; `add r64, [offset]`, which is an 8-byte load; a 1-byte `db` would let the
; add pick up the 7 bytes that happen to follow the label.
offset dq 1
segment .bss
; Spill slots for the four 64-bit argument registers (rdi, rsi, rdx, rcx).
a1 resq 1
a2 resq 1
avg resq 1
; qword: written with `mov [avgL], rcx` and read back with a 64-bit add.
avgL resq 1
亲自调试问题,干得不错。因为我已经开始看代码了,我会给你一些效率/风格评论作为补充评论:
%macro prologue 0
push rbp
mov rbp,rsp ; Optional: this frame-pointer setup and the matching LEAVE can both be dropped.
; Stack frames were useful before debuggers could keep track of things without them, and as a
; convenience so locals stay at a fixed offset from the base pointer even while you push/pop.
; With the SysV ABI you can use the red zone for locals without touching RSP at all,
; as long as you don't push/pop or call anything.
push rbx
push r12
push r13
push r14
push r15
%endmacro
%macro epilogue 0
pop r15
pop r14
pop r13
pop r12
pop rbx
leave
ret
%endmacro
segment .data
offset db 1 ; NOTE: a single byte, yet it is read below with 64-bit adds.
segment .bss ; These should really be locals on the stack (or in regs!), not globals.
a1 resq 1
a2 resq 1
avg resq 1
avgL resd 1 ; NOTE: 4 bytes reserved, but `mov [avgL], rcx` below stores 8.
segment .text
; A comment with the C prototype and a short description is a good idea for every function.
global avgArray
avgArray:
prologue
mov [a1], rdi ; Why go through memory? There are 16 registers for a reason.
mov [a2], rsi ; Shuffling values into the registers you want them in
mov [avg], rdx ; is best done with reg-reg moves.
mov [avgL], rcx ; A comment at the top of a block documenting what lives in which
; register is usually all the bookkeeping you need.
mov rsi, [a1]
mov r9, [a2]
mov rdi, [avg]
mov rcx, rsi
add rcx, [avgL] ; This mov/add pair could be a single `lea rcx, [rsi+rcx]`:
; rcx still holds the length argument until the mov just above overwrites it.
xor rdx, rdx
xor rax, rax
xor rbx, rbx
avgArray_loop: ; A local label (leading '.') works here.
; NASM scopes a .label to the preceding non-local label, so each function can reuse names like .loop.
mov al, [rsi] ; Writing al merges into the old rax: a data dependency on the previous iteration,
mov dl, [r9] ; because the CPU doesn't "know" that shr ax, 1 always leaves ah zero in this algorithm.
add ax, dx ; Avoid 16-bit ALU ops where possible: they need an operand-size prefix and write a
; partial register. (8-bit ops use distinct opcodes rather than a prefix.)
shr ax, 1 ; Better: load with movzx/movsx and do the arithmetic in 32-bit registers.
mov [rdi], al
add rsi, [offset] ; These are 64-bit adds, so each one also reads the 7 bytes after the 1 set with db.
add r9, [offset]
add rdi, [offset]
cmp rsi, rcx
jb avgArray_loop
epilogue
您有大量可用的寄存器,为什么要把循环增量保存在内存中?我猜它只是在调试/试验过程中变成这样的。
此外,单寄存器寻址模式只有在用作 ALU 指令的内存操作数时才更高效。当你有很多指针时(除非你正在展开循环),只需递增一个计数器并使用 base+index*scale 寻址即可,尤其是当你用 mov 加载它们的时候。
这是我的做法(对 Intel SnB 及更高版本进行性能分析):
标量
; no storage needed
segment .text
GLOBAL avgArray
;-----------------------------------------------------------------------
; void avgArray(uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i]) >> 1   for 0 <= i < len   (truncating average)
; If you can choose your prototype, order the args so they already arrive
; in the registers you want them in.
; ABI:   SysV AMD64, leaf function, no stack or static storage used
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len
; Clobb: rax, rcx, rdx, rsi, rdi, r8, flags (all caller-saved)
;-----------------------------------------------------------------------
avgArray:
        ; mov   [rsp-8], rcx            ; (red zone) if len ever had to be spilled
        test    rcx, rcx
        jz      .done                   ; len == 0: the loop below is do-while, so guard it

        ; Point every base at one-past-the-end and index with a negative
        ; counter: [base + rcx] walks forward while rcx counts up to zero.
        ; Counting down to zero would also avoid the cmp, but HW prefetchers
        ; have more stream slots for forward patterns than for reverse ones.
        add     rdi, rcx
        add     rsi, rcx
        add     rdx, rcx
        neg     rcx                     ; now [rdi+rcx] is the start of dest

        ALIGN 16
.loop:
        ; For signed char use movsx here and sar instead of shr.
        movzx   eax, byte [rsi+rcx]     ; zero-extending load: no dependency on old rax
        movzx   r8d, byte [rdx+rcx]     ; r8 is caller-saved, so no push/pop of rbx is needed
        add     eax, r8d                ; 9-bit sum held in a 32-bit register
        shr     eax, 1
        mov     [rdi+rcx], al
        inc     rcx                     ; no cmp needed: flags come straight from the counter
        jnz     .loop                   ; inc/jcc can macro-fuse into one uop

.done:
        ; Nothing to pop: only caller-saved registers were used.
        ret
在 Intel 上,这个循环是 7 个 uop(store 占 2 个 uop:store-address 和 store-data,无法微融合),因此在每周期最多发射 4 个 uop 的 CPU 上,它将以每字节 2 个周期的速度执行。movzx(目标为 32 位或 64 位寄存器时)无论如何都只是 1 个 uop,因为它不需要端口 0/1/5 上的 ALU uop,也就谈不上微融合与否(它是纯加载,而不是“加载并修改”)。
7 个 uop 需要分成两组发射、每组最多 4 个,因此循环的每次迭代可以在 2 个周期内发射完。没有其他瓶颈会妨碍执行单元跟上这个速度,所以它应该每 2 个周期运行一次迭代。
向量
有一条向量指令正好可以完成这件事:PAVGB 对打包的无符号字节求平均(内部使用 9 位临时值以避免溢出)。注意 PAVGB 计算的是 (a+b+1)>>1,即向上舍入,而您的 add/shr 是向下截断,因此当 a+b 为奇数时结果会相差 1。
; no storage needed
segment .text
GLOBAL avgArray
;-----------------------------------------------------------------------
; void avgArray(uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i] + 1) >> 1   for 0 <= i < len
;   (PAVGB semantics: unsigned bytes, rounds up; the scalar tail matches it)
; ABI:   SysV AMD64, leaf function, needs SSE2 (baseline on x86-64)
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len
; Clobb: rax, rcx, rdx, rsi, rdi, r8, xmm0, xmm1, flags
; No alignment is required of any of the three arrays.
;-----------------------------------------------------------------------
avgArray:
        mov     rax, rcx                ; keep len for the tail
        and     rcx, -16                ; rcx = bytes covered by whole 16-byte vectors
        jz      .tail                   ; fewer than 16 bytes: scalar only

        ; Same trick as the scalar version: bases point past the vector
        ; region and rcx counts from -bytes up to zero.
        add     rdi, rcx
        add     rsi, rcx
        add     rdx, rcx
        neg     rcx

        ALIGN 16
.vloop:
        movdqu  xmm0, [rsi+rcx]
        movdqu  xmm1, [rdx+rcx]         ; separate unaligned load: pavgb with a memory
        pavgb   xmm0, xmm1              ;   operand would fault unless 16-byte aligned
        movdqu  [rdi+rcx], xmm0
        add     rcx, 16
        jnz     .vloop                  ; add/jcc macro-fuse

.tail:
        ; rdi/rsi/rdx now point at the first byte not handled by the vector loop.
        and     eax, 15                 ; rax = leftover bytes (0..15)
        jz      .done
        add     rdi, rax
        add     rsi, rax
        add     rdx, rax
        neg     rax
.tail_loop:
        movzx   ecx, byte [rsi+rax]
        movzx   r8d, byte [rdx+rax]
        lea     ecx, [rcx+r8+1]         ; +1 so the tail rounds exactly like pavgb
        shr     ecx, 1
        mov     [rdi+rax], cl
        inc     rax
        jnz     .tail_loop

.done:
        ret
正确设置循环退出条件比较棘手,因为如果下一个 16 字节会超出数组末尾,就必须结束向量循环。最好的处理办法大概是在把 rcx 加到指针之前,先将 rcx 减去 15 左右。
所以,每次迭代 6 个 uop / 2 个周期,但每次迭代处理 16 个字节。理想的做法是展开循环,使循环体的 uop 数是 4 的倍数,这样就不会在循环末尾因为某个周期发射不满 4 个 uop 而损失发射速率。每个周期 2 次加载 / 1 次存储才是我们的瓶颈,因为 PAVGB 的吞吐量是每周期 2 条。
16B / 周期在 Haswell 和更高版本上应该不难。对于使用 ymm 寄存器的 AVX2,您将获得 32B/周期。 (SnB/IvB 每个周期只能做两次内存操作,最多一次存储,除非你使用 256b loads/stores)。无论如何,在这一点上,您已经从矢量化中获得了 16 倍的巨大加速,通常这已经足够好了。我只是喜欢通过计算 uops 和展开来调整理论最大吞吐量。 :)
如果您打算展开循环,那么递增指针而不只是递增索引就是值得的。(这样是两次使用 [rdx] 加上一次 add,而不是两次使用 [rdx+rcx]。)
无论哪种方式,清理循环设置并将所有内容保存在寄存器中可以节省大量的指令字节和短数组的开销。
我正在尝试用汇编语言编写一个小程序,它接受三个 char 数组作为输入,计算前两个数组中对应元素的平均值,并将结果存储在第三个数组中,如下所示。
; ---------------------------------------------------------------------
; Code exactly as posted in the question (NASM syntax; the question
; states a Linux x86_64 machine). Only comments have been added.
; ---------------------------------------------------------------------

; prologue: push rbp, set rbp = rsp, then push rbx and r12-r15.
%macro prologue 0
push rbp
mov rbp,rsp
push rbx
push r12
push r13
push r14
push r15
%endmacro
; epilogue: pop r15..r12 and rbx (reverse of prologue), then leave + ret.
%macro epilogue 0
pop r15
pop r14
pop r13
pop r12
pop rbx
leave
ret
%endmacro
segment .data
; Per-element step. NOTE(review): declared as ONE byte (db), but every use
; below is `add r64, [offset]`, which loads 8 bytes starting at this label.
offset db 1
segment .bss
; Spill slots for the four incoming argument registers.
a1 resq 1
a2 resq 1
avg resq 1
; NOTE(review): 4 bytes reserved (resd), but `mov [avgL], rcx` stores 8.
avgL resd 1
segment .text
global avgArray
; avgArray
; Registers as consumed by the code below:
;   rdi = first source byte array   (spilled to a1, reloaded into rsi)
;   rsi = second source byte array  (spilled to a2, reloaded into r9)
;   rdx = destination byte array    (spilled to avg, reloaded into rdi)
;   rcx = element count             (spilled to avgL)
; For each element: dest = (src1 + src2) >> 1, computed in ax.
avgArray:
prologue
; Spill all four arguments to the static slots, then reload them.
mov [a1], rdi
mov [a2], rsi
mov [avg], rdx
mov [avgL], rcx
mov rsi, [a1]
mov r9, [a2]
mov rdi, [avg]
; rcx = rsi + [avgL]: one-past-the-end pointer for the first source array.
mov rcx, rsi
add rcx, [avgL] ; 8-byte load from the 4-byte avgL reservation
; Clear rdx/rax so the 16-bit add below sees zero upper bytes on the first
; pass; rbx is cleared too but is not used anywhere else in this listing.
xor rdx, rdx
xor rax, rax
xor rbx, rbx
; The bound is tested after the body, so the body runs at least once.
avgArray_loop:
mov al, [rsi]
mov dl, [r9]
; 16-bit sum keeps the carry out of bit 7; the shift then halves it.
add ax, dx
shr ax, 1
mov [rdi], al
; Advance all three pointers by the qword read from [offset].
add rsi, [offset]
add r9, [offset]
add rdi, [offset]
; Unsigned compare of the running source pointer against the end pointer.
cmp rsi, rcx
jb avgArray_loop
epilogue
将 [offset] 替换为 1 时效果非常好。但是,当使用 [offset] 确定下一个数组元素时,它似乎不会将其值添加到 rsi、rdi 和 r9。
我已经使用 gdb 检查过了:rsi 中保存的地址在执行 add rsi, [offset] 之后仍然不变。
有人能告诉我为什么使用 [offset] 不起作用,而直接加一个简单的 1 却可以吗?
顺便说一句:这是一台 Linux x86_64 机器。
所以我找到了解决该问题的方法。
avgL 和 offset 在内存中紧挨着存放。把 rcx 的值存入 avgL 时,也覆盖了 offset 的值。将 avgL 声明为 QWORD 而不是 DWORD,可以防止 mov 覆盖 offset 的数据。
新的数据和 bss 段看起来像这样
segment .data
; Pointer step in bytes. Declared as a qword because it is consumed with
; `add r64, [offset]`, which is an 8-byte load; a 1-byte `db` would let the
; add pick up the 7 bytes that happen to follow the label.
offset dq 1
segment .bss
; Spill slots for the four 64-bit argument registers (rdi, rsi, rdx, rcx).
a1 resq 1
a2 resq 1
avg resq 1
; qword: written with `mov [avgL], rcx` and read back with a 64-bit add.
avgL resq 1
亲自调试问题,干得不错。因为我已经开始看代码了,我会给你一些效率/风格评论作为补充评论:
%macro prologue 0
push rbp
mov rbp,rsp ; Optional: this frame-pointer setup and the matching LEAVE can both be dropped.
; Stack frames were useful before debuggers could keep track of things without them, and as a
; convenience so locals stay at a fixed offset from the base pointer even while you push/pop.
; With the SysV ABI you can use the red zone for locals without touching RSP at all,
; as long as you don't push/pop or call anything.
push rbx
push r12
push r13
push r14
push r15
%endmacro
%macro epilogue 0
pop r15
pop r14
pop r13
pop r12
pop rbx
leave
ret
%endmacro
segment .data
offset db 1 ; NOTE: a single byte, yet it is read below with 64-bit adds.
segment .bss ; These should really be locals on the stack (or in regs!), not globals.
a1 resq 1
a2 resq 1
avg resq 1
avgL resd 1 ; NOTE: 4 bytes reserved, but `mov [avgL], rcx` below stores 8.
segment .text
; A comment with the C prototype and a short description is a good idea for every function.
global avgArray
avgArray:
prologue
mov [a1], rdi ; Why go through memory? There are 16 registers for a reason.
mov [a2], rsi ; Shuffling values into the registers you want them in
mov [avg], rdx ; is best done with reg-reg moves.
mov [avgL], rcx ; A comment at the top of a block documenting what lives in which
; register is usually all the bookkeeping you need.
mov rsi, [a1]
mov r9, [a2]
mov rdi, [avg]
mov rcx, rsi
add rcx, [avgL] ; This mov/add pair could be a single `lea rcx, [rsi+rcx]`:
; rcx still holds the length argument until the mov just above overwrites it.
xor rdx, rdx
xor rax, rax
xor rbx, rbx
avgArray_loop: ; A local label (leading '.') works here.
; NASM scopes a .label to the preceding non-local label, so each function can reuse names like .loop.
mov al, [rsi] ; Writing al merges into the old rax: a data dependency on the previous iteration,
mov dl, [r9] ; because the CPU doesn't "know" that shr ax, 1 always leaves ah zero in this algorithm.
add ax, dx ; Avoid 16-bit ALU ops where possible: they need an operand-size prefix and write a
; partial register. (8-bit ops use distinct opcodes rather than a prefix.)
shr ax, 1 ; Better: load with movzx/movsx and do the arithmetic in 32-bit registers.
mov [rdi], al
add rsi, [offset] ; These are 64-bit adds, so each one also reads the 7 bytes after the 1 set with db.
add r9, [offset]
add rdi, [offset]
cmp rsi, rcx
jb avgArray_loop
epilogue
您有大量可用的寄存器,为什么要把循环增量保存在内存中?我猜它只是在调试/试验过程中变成这样的。
此外,单寄存器寻址模式只有在用作 ALU 指令的内存操作数时才更高效。当你有很多指针时(除非你正在展开循环),只需递增一个计数器并使用 base+index*scale 寻址即可,尤其是当你用 mov 加载它们的时候。
这是我的做法(对 Intel SnB 及更高版本进行性能分析):
标量
; no storage needed
segment .text
GLOBAL avgArray
;-----------------------------------------------------------------------
; void avgArray(uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i]) >> 1   for 0 <= i < len   (truncating average)
; If you can choose your prototype, order the args so they already arrive
; in the registers you want them in.
; ABI:   SysV AMD64, leaf function, no stack or static storage used
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len
; Clobb: rax, rcx, rdx, rsi, rdi, r8, flags (all caller-saved)
;-----------------------------------------------------------------------
avgArray:
        ; mov   [rsp-8], rcx            ; (red zone) if len ever had to be spilled
        test    rcx, rcx
        jz      .done                   ; len == 0: the loop below is do-while, so guard it

        ; Point every base at one-past-the-end and index with a negative
        ; counter: [base + rcx] walks forward while rcx counts up to zero.
        ; Counting down to zero would also avoid the cmp, but HW prefetchers
        ; have more stream slots for forward patterns than for reverse ones.
        add     rdi, rcx
        add     rsi, rcx
        add     rdx, rcx
        neg     rcx                     ; now [rdi+rcx] is the start of dest

        ALIGN 16
.loop:
        ; For signed char use movsx here and sar instead of shr.
        movzx   eax, byte [rsi+rcx]     ; zero-extending load: no dependency on old rax
        movzx   r8d, byte [rdx+rcx]     ; r8 is caller-saved, so no push/pop of rbx is needed
        add     eax, r8d                ; 9-bit sum held in a 32-bit register
        shr     eax, 1
        mov     [rdi+rcx], al
        inc     rcx                     ; no cmp needed: flags come straight from the counter
        jnz     .loop                   ; inc/jcc can macro-fuse into one uop

.done:
        ; Nothing to pop: only caller-saved registers were used.
        ret
在 Intel 上,这个循环是 7 个 uop(store 占 2 个 uop:store-address 和 store-data,无法微融合),因此在每周期最多发射 4 个 uop 的 CPU 上,它将以每字节 2 个周期的速度执行。movzx(目标为 32 位或 64 位寄存器时)无论如何都只是 1 个 uop,因为它不需要端口 0/1/5 上的 ALU uop,也就谈不上微融合与否(它是纯加载,而不是“加载并修改”)。
7 个 uop 需要分成两组发射、每组最多 4 个,因此循环的每次迭代可以在 2 个周期内发射完。没有其他瓶颈会妨碍执行单元跟上这个速度,所以它应该每 2 个周期运行一次迭代。
向量
有一条向量指令正好可以完成这件事:PAVGB 对打包的无符号字节求平均(内部使用 9 位临时值以避免溢出)。注意 PAVGB 计算的是 (a+b+1)>>1,即向上舍入,而您的 add/shr 是向下截断,因此当 a+b 为奇数时结果会相差 1。
; no storage needed
segment .text
GLOBAL avgArray
;-----------------------------------------------------------------------
; void avgArray(uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i] + 1) >> 1   for 0 <= i < len
;   (PAVGB semantics: unsigned bytes, rounds up; the scalar tail matches it)
; ABI:   SysV AMD64, leaf function, needs SSE2 (baseline on x86-64)
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len
; Clobb: rax, rcx, rdx, rsi, rdi, r8, xmm0, xmm1, flags
; No alignment is required of any of the three arrays.
;-----------------------------------------------------------------------
avgArray:
        mov     rax, rcx                ; keep len for the tail
        and     rcx, -16                ; rcx = bytes covered by whole 16-byte vectors
        jz      .tail                   ; fewer than 16 bytes: scalar only

        ; Same trick as the scalar version: bases point past the vector
        ; region and rcx counts from -bytes up to zero.
        add     rdi, rcx
        add     rsi, rcx
        add     rdx, rcx
        neg     rcx

        ALIGN 16
.vloop:
        movdqu  xmm0, [rsi+rcx]
        movdqu  xmm1, [rdx+rcx]         ; separate unaligned load: pavgb with a memory
        pavgb   xmm0, xmm1              ;   operand would fault unless 16-byte aligned
        movdqu  [rdi+rcx], xmm0
        add     rcx, 16
        jnz     .vloop                  ; add/jcc macro-fuse

.tail:
        ; rdi/rsi/rdx now point at the first byte not handled by the vector loop.
        and     eax, 15                 ; rax = leftover bytes (0..15)
        jz      .done
        add     rdi, rax
        add     rsi, rax
        add     rdx, rax
        neg     rax
.tail_loop:
        movzx   ecx, byte [rsi+rax]
        movzx   r8d, byte [rdx+rax]
        lea     ecx, [rcx+r8+1]         ; +1 so the tail rounds exactly like pavgb
        shr     ecx, 1
        mov     [rdi+rax], cl
        inc     rax
        jnz     .tail_loop

.done:
        ret
正确设置循环退出条件比较棘手,因为如果下一个 16 字节会超出数组末尾,就必须结束向量循环。最好的处理办法大概是在把 rcx 加到指针之前,先将 rcx 减去 15 左右。
所以,每次迭代 6 个 uop / 2 个周期,但每次迭代处理 16 个字节。理想的做法是展开循环,使循环体的 uop 数是 4 的倍数,这样就不会在循环末尾因为某个周期发射不满 4 个 uop 而损失发射速率。每个周期 2 次加载 / 1 次存储才是我们的瓶颈,因为 PAVGB 的吞吐量是每周期 2 条。
16B / 周期在 Haswell 和更高版本上应该不难。对于使用 ymm 寄存器的 AVX2,您将获得 32B/周期。 (SnB/IvB 每个周期只能做两次内存操作,最多一次存储,除非你使用 256b loads/stores)。无论如何,在这一点上,您已经从矢量化中获得了 16 倍的巨大加速,通常这已经足够好了。我只是喜欢通过计算 uops 和展开来调整理论最大吞吐量。 :)
如果您打算展开循环,那么递增指针而不只是递增索引就是值得的。(这样是两次使用 [rdx] 加上一次 add,而不是两次使用 [rdx+rcx]。)
无论哪种方式,清理循环设置并将所有内容保存在寄存器中可以节省大量的指令字节和短数组的开销。