NASM

Question

我正在尝试用汇编编写一个小程序，它接收三个 char 数组作为输入，计算前两个数组中对应元素的平均值，并将结果存储到第三个数组中，如下所示。

; prologue (no macro parameters): function-entry boilerplate.
; Saves rbp, makes rbp the frame pointer, then pushes rbx and r12-r15
; (the callee-saved integer registers of the SysV AMD64 ABI).
; The matching `epilogue` macro pops them in the reverse order.
%macro prologue 0
    push    rbp
    mov     rbp,rsp
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
%endmacro
; epilogue (no macro parameters): function-exit boilerplate.
; Pops r15, r14, r13, r12, rbx (reverse of the push order used by `prologue`),
; then tears down the rbp frame and returns to the caller.
%macro epilogue 0
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    leave                   ; = mov rsp, rbp / pop rbp
    ret
%endmacro

; Static storage used by avgArray: the per-iteration pointer step and
; spill slots for its four register arguments.
segment .data
    offset  db  1           ; pointer step; NOTE(review): only 1 byte is defined, but avgArray reads it with 64-bit adds
segment .bss
    a1      resq    1       ; 8 bytes: receives rdi in avgArray
    a2      resq    1       ; 8 bytes: receives rsi in avgArray
    avg     resq    1       ; 8 bytes: receives rdx in avgArray
    avgL    resd    1       ; 4 bytes reserved; NOTE(review): avgArray stores the full 64-bit rcx here
; avgArray: for each index i, store (first[i] + second[i]) >> 1 into third[i].
; Presumably called as avgArray(a1, a2, avg, len) with unsigned char arrays;
; that prototype is inferred from the register use below -- TODO confirm.
; In:    rdi = first input array, rsi = second input array,
;        rdx = output array, rcx = element count
; Uses:  rax, rdx, rcx, rsi, rdi, r9, flags; rbx is zeroed (and restored by epilogue)
; Note:  the loop body runs before the first bounds check, so one element is
;        always processed.
segment .text
    global  avgArray 
avgArray:
    prologue

    ; Spill the four arguments to the static slots.
    mov [a1], rdi
    mov [a2], rsi
    mov [avg], rdx
    mov [avgL], rcx     ; 8-byte store into the avgL slot

    ; Reload them into the registers the loop uses.
    mov rsi, [a1]       ; rsi = first input pointer
    mov r9, [a2]        ; r9  = second input pointer
    mov rdi, [avg]      ; rdi = output pointer

    mov rcx, rsi
    add rcx, [avgL]    ; array length
                       ; rcx = rsi + length = one past the end of the first input

    xor rdx, rdx        ; clear so dx holds only the byte loaded into dl
    xor rax, rax        ; clear so ax holds only the byte loaded into al
    xor rbx, rbx        ; rbx is not used again in this function
avgArray_loop:
    mov al, [rsi]       ; al = first[i]
    mov dl, [r9]        ; dl = second[i]
    add ax, dx          ; 16-bit sum, so the carry out of 8 bits is kept
    shr ax, 1           ; halve the sum (truncating)
    mov [rdi], al       ; third[i] = low byte of the result

    ; Advance all three pointers by the qword read from `offset`.
    add rsi, [offset]
    add r9, [offset]
    add rdi, [offset]

    cmp rsi, rcx
    jb  avgArray_loop   ; unsigned compare: loop while rsi is below the end pointer
    epilogue

将 [offset] 替换为 1 时效果非常好。但是，当使用 [offset] 确定下一个数组元素时，它似乎不会将其值添加到 rsi、rdi 和 r9。我已经使用 gdb 检查了它。 rsi中保存的地址在调用add rsi, [offset]后还是一样的。

有人能告诉我为什么使用 [offset] 不起作用但添加一个简单的 1 可以吗？

顺便说一句：我使用的是 Linux x86_64 机器。

Answer 1

所以我找到了解决该问题的方法。

avgL 和 offset 在内存中是紧挨着存放的。把 64 位的 rcx 写入只保留了 4 个字节（DWORD）的 avgL 时，多写出的字节也覆盖了 offset 的值。将 avgL 声明为 QWORD 而不是 DWORD，可以防止这条 mov 覆盖 offset 的数据。

新的数据和 bss 段看起来像这样

; Static storage for avgArray: the pointer step and spill slots for its
; four register arguments.
; Every item is a full qword because avgArray only touches them with 64-bit
; instructions (`mov [avgL], rcx`, `add rsi, [offset]`, ...). A narrower
; definition makes those accesses spill into, or read from, neighbouring bytes.
segment .data
    offset  dq  1           ; pointer step in bytes; dq so `add r64, [offset]` adds exactly 1
segment .bss
    a1      resq    1       ; first input pointer  (from rdi)
    a2      resq    1       ; second input pointer (from rsi)
    avg     resq    1       ; output pointer       (from rdx)
    avgL    resq    1       ; element count        (from rcx, stored as 64 bits)

Answer 2

亲自调试问题，干得不错。因为我已经开始看代码了，我会给你一些效率/风格评论作为补充评论：

%macro prologue 0
    push    rbp
    mov     rbp,rsp   ; You can drop this and the LEAVE (then restore rbp with a plain pop instead).
; Stack frames were useful before debuggers could keep track of things without
; them, and as a convenience so that local variables were always at the same
; offset from the base pointer, even while you were pushing/popping on the stack.
; With the SysV ABI you can use the red zone for locals without touching RSP
; at all, as long as you don't push/pop or call anything.
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
%endmacro
; Restores the registers saved by `prologue` in reverse order, then returns.
%macro epilogue 0
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    leave                   ; = mov rsp, rbp / pop rbp
    ret
%endmacro

segment .data
    offset  db  1           ; one byte defined; the code below reads it with 64-bit adds
segment .bss    ; These should really be locals on the stack (or in regs!), not globals.
    a1      resq    1       ; spill slot for rdi
    a2      resq    1       ; spill slot for rsi
    avg     resq    1       ; spill slot for rdx
    avgL    resd    1       ; spill slot for rcx; only 4 bytes reserved, but written as 8

segment .text
; A comment with the C function prototype and a short description is usually
; a good idea for every function.
    global  avgArray
avgArray:
    prologue

    mov [a1], rdi     ; What is this silliness?  You have 16 registers for a reason.
    mov [a2], rsi     ; Shuffling values into the registers you want them in
    mov [avg], rdx    ; is best done with reg-reg moves.
    mov [avgL], rcx   ; I like to put a comment at the top of a block of code
                      ; documenting what goes in which register.

    mov rsi, [a1]
    mov r9, [a2]
    mov rdi, [avg]

    mov rcx, rsi
    add rcx, [avgL]    ; These two instructions could be one: lea rcx, [rsi+rcx]
              ;  (the length is still in rcx anyway, as a function arg).

    xor rdx, rdx
    xor rax, rax
    xor rbx, rbx
avgArray_loop:   ; You can use a local label here, starting with a dot (e.g. .loop).
 ; You don't need a different name for each loop: NASM local labels are scoped
 ; to the preceding non-local label, so every function can have its own .loop.
    mov al, [rsi]        ; There's a data dependency on the old value of ax,
    mov dl, [r9]         ; since the CPU doesn't "know" that shr ax, 1 always leaves ah zeroed in this algorithm.

    add ax, dx           ; Avoid ALU ops on 16-bit regs whenever possible (8-bit is fine:
                         ; those have their own opcodes instead of a prefix), to avoid decode stalls on Intel.
    shr ax, 1            ; Better to use 32-bit regs (load with movsx/movzx).
    mov [rdi], al

    add rsi, [offset]    ; These are 64-bit adds, so you're also reading the 7 bytes after the 1 you set with db.
    add r9, [offset]
    add rdi, [offset]

    cmp rsi, rcx
    jb  avgArray_loop
    epilogue

您有大量可用的寄存器，为什么要把循环增量放在内存里？我希望这只是调试/尝试各种办法时遗留下来的写法。

此外，单寄存器寻址模式只有在用作 ALU 指令的内存操作数时才更高效（1-reg addressing modes are only more efficient when used as mem operands for ALU ops）。当你有很多指针时（除非你正在展开循环），只需递增一个计数器并使用 base+index*scale 寻址即可，尤其是当你用 mov 加载它们的时候。

这是我的做法（对 Intel SnB 及更高版本进行性能分析）：

标量

; no storage needed
segment .text
GLOBAL  avgArray
;-----------------------------------------------------------------------
; void avgArray (uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i]) >> 1   for 0 <= i < len   (truncating average)
; If you can choose your prototype, do it so args go where you want them anyway.
; ABI:   SysV AMD64
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len (0 is allowed)
; Out:   nothing
; Clobb: rax, rcx, rdx, rsi, rdi, r8, flags (caller-saved registers only)
;-----------------------------------------------------------------------
avgArray:
    ; mov    [rsp-8], rcx    ; if I wanted to spill  len  to memory (red zone)

    ; Point all three pointers one past the end of their arrays and turn the
    ; length into a negative index: [rdi+rcx] is then the start of dest, and
    ; rcx counts upwards towards zero.
    ; We could also have just counted down towards zero, but HW memory
    ; prefetchers have more stream slots for forward patterns than reverse.
    add    rdi, rcx
    add    rsi, rcx
    add    rdx, rcx
    neg    rcx               ; rcx = -len; ZF is set when len == 0
    jz     .done             ; empty arrays: nothing to read or write

ALIGN 16
.loop:
    ;  use movsx for signed char
    movzx  eax, byte [rsi+rcx]     ; zero-extending load: also a dependency-breaker
    movzx  r8d, byte [rdx+rcx]     ; Using r8d to save push/pop of rbx.
           ; On pre-Nehalem, where insn decode can be a bottleneck even in tight
           ; loops, using ebx or ebp would save a REX prefix (1 insn byte).
    add    eax, r8d          ; 9-bit sum held in a 32-bit register: no overflow
    shr    eax, 1
    mov    [rdi+rcx], al

    inc    rcx     ; No cmp needed: this is the point of counting up towards zero
    jnz    .loop   ; inc/jcc can macro-fuse into one uop

.done:
    ; nothing to pop, we only used caller-saved regs.
    ret

在 Intel 上，这个循环是 7 个 uop（store 占 2 个 uop：store-address 和 store-data，在这里无法微融合），所以在每周期最多发射 4 个 uop 的 CPU 上，它将以每字节 2 个周期的速度运行。movzx（目标为 32 位或 64 位寄存器）无论如何都只是 1 个 uop，因为它没有需要与加载微融合的端口 0/1/5 ALU uop（它是纯加载，而不是“读取-修改”）。

7 个 uop 需要分成 2 组（每组最多 4 个 uop）发射，因此循环每次迭代需要 2 个周期来发射。没有其他瓶颈会妨碍执行单元跟上这个速度，所以它应该每 2 个周期完成一次迭代。

向量

有一条向量指令正好可以完成这件事：PAVGB 对打包的无符号字节求平均（内部使用 9 位临时值以避免溢出，这一点与您的 add/shr 相同）。注意 PAVGB 的结果是向上取整的，即 (a + b + 1) >> 1，而 add/shr 是向下取整。

; no storage needed
segment .text
GLOBAL  avgArray
;-----------------------------------------------------------------------
; void avgArray (uint8_t *avg, const uint8_t *a1, const uint8_t *a2, size_t len)
;   avg[i] = (a1[i] + a2[i] + 1) >> 1   for 0 <= i < len
;   (PAVGB rounds up; the scalar tail uses the same rounding.)
; ABI:   SysV AMD64, needs SSE2
; In:    rdi = avg, rsi = a1, rdx = a2, rcx = len
;        (len may be 0 or not a multiple of 16; no alignment is required)
; Out:   nothing
; Clobb: rax, rcx, rdx, rsi, rdi, r8, xmm0, xmm1, flags
;-----------------------------------------------------------------------
avgArray:
    ; Bias everything by 15 so the vector loop can still end on a plain add/jl:
    ; each pointer = (one past the end) - 15, and rcx = 15 - len.
    ; Then rcx < 0  <=>  at least 16 bytes remain, starting at [ptr+rcx].
    lea     rdi, [rdi+rcx-15]
    lea     rsi, [rsi+rcx-15]
    lea     rdx, [rdx+rcx-15]
    neg     rcx
    add     rcx, 15
    jge     .tail                ; len < 16: skip the vector loop entirely

ALIGN 16
.loop:
    ; Unaligned loads for both sources: PAVGB with a memory operand would
    ; fault unless that address were 16-byte aligned.
    movdqu    xmm0, [rsi+rcx]    ; 1 uop
    movdqu    xmm1, [rdx+rcx]    ; 1 uop
    pavgb     xmm0, xmm1         ; 1 uop
    movdqu    [rdi+rcx], xmm0    ; 2 uops: no micro-fusion

    add    rcx, 16
    jl     .loop          ; 1 macro-fused uop add/branch

.tail:
    ; Here rcx is in [0, 15]. Remove the bias: rcx = -(bytes remaining).
    ; The pointers stay biased, hence the +15 in the addresses below.
    sub     rcx, 15
    jz      .done                ; len was a multiple of 16 (or 0)
.tail_loop:
    movzx   eax, byte [rsi+rcx+15]
    movzx   r8d, byte [rdx+rcx+15]
    lea     eax, [rax+r8+1]      ; a + b + 1: same round-up as PAVGB
    shr     eax, 1
    mov     [rdi+rcx+15], al
    inc     rcx
    jnz     .tail_loop
.done:
    ret

正确设置循环退出条件比较棘手，因为如果下一个 16 字节会超出数组末尾，就必须结束向量循环。最好的办法大概是：在把 rcx 加到各个指针之前，先将 rcx 减去 15 左右。

所以，每次迭代 6 个 uop / 2 个周期，但每次迭代处理 16 个字节。最好进行展开，使循环的 uop 数是 4 的倍数，这样就不会因为循环末尾那个周期发射不足 4 个 uop 而损失发射（issue）速率。到这一步，每周期 2 次加载 / 1 次存储就成了瓶颈，因为 PAVGB 的吞吐量是每周期 2 条。

16B / 周期在 Haswell 和更高版本上应该不难。对于使用 ymm 寄存器的 AVX2，您将获得 32B/周期。（SnB/IvB 每个周期只能做两次内存操作，最多一次存储，除非你使用 256b loads/stores）。无论如何，在这一点上，您已经从矢量化中获得了 16 倍的巨大加速，通常这已经足够好了。我只是喜欢通过计算 uops 和展开来调整理论最大吞吐量。 :)

如果您打算展开循环，那么值得改为递增指针，而不只是递增一个索引。（这样是两次使用 [rdx] 加上一条 add，而不是两次使用 [rdx+rcx]。）

无论哪种方式，清理循环设置并将所有内容保存在寄存器中可以节省大量的指令字节和短数组的开销。

NASM - 使用标签作为数组偏移量

NASM - Using Labels as Array offsets

arrays

assembly

x86-64

标量

向量