通过堆栈的 32 位扩展乘法

Question

这是我一直用来实现两个 32 位数字的扩展乘法的代码。有没有办法通过制作子程序并通过参数传递使用堆栈来实现类似的逻辑？使用 MUL 指令还是不使用它？有人可以帮忙吗？

[org 0x0100]
jmp start

multiplicand: dd 123122,0
multiplier:   dd 66341
result:       dd 0,0

start:
initialize:   mov cl,32 

              mov bl,1
checkbit:     test bl,[multiplier]
              jz skip

multiply:     mov ax, [multiplicand]
              add [result],ax
              mov ax, [multiplicand+2]
              adc [result+2], ax
              mov ax, [multiplicand+4]
              adc [result+4], ax
              mov ax, [multiplicand+6] 
              adc [result+6], ax      

skip:         shl bl,1               
              shr word [multiplier+2],1 
              rcr word [multiplier],1 

              shl word [multiplicand],1 
              rcl word [multiplicand+2],1 
              rcl word [multiplicand+4],1 
              rcl word [multiplicand+6],1 
              dec cl
              jnz checkbit

              mov ax, 0x4c00
              int 0x21

Answer 1

[org 0x0100]
jmp start

multiplicand: dd 123122
multiplier:   dd 66341
result:       dd 0

start:
    push word [multiplier+2]
    push word [multiplier]
    push word [multiplicand+2]
    push word [multiplicand]
    call multiply
    add sp, 8            ; free arguments
    mov [result], ax     ; expect result in dx:ax
    mov [result+2], dx

    mov ax, 0x4c00
    int 0x21

multiply:
    push bp
    mov bp, sp

    mov ax, [bp+4]
    mul word [bp+8]      ; xl * yl
    mov cx, [bp+4]
    imul cx, [bp+10]     ; xl * yh
    add dx, cx
    mov cx, [bp+6]
    imul cx, [bp+8]      ; xh * yl
    add dx, cx

    mov sp, bp
    pop bp
    ret

不清楚是否需要 64 位结果，上面的代码生成 32 位。

64 位版本可能如下所示：

[org 0x0100]
jmp start

multiplicand: dd 123122
multiplier:   dd 66341
result:       dd 0, 0

start:
    push word [multiplier+2]
    push word [multiplier]
    push word [multiplicand+2]
    push word [multiplicand]
    push result           ; pointer for result
    call multiply
    add sp, 10            ; free arguments

    mov ax, 0x4c00
    int 0x21

multiply:
    push bp
    mov bp, sp
    push bx

    mov bx, [bp+4]       ; result

    mov ax, [bp+6]
    mul word [bp+10]     ; xl * yl
    mov [bx], ax         ; r0
    mov [bx+2], dx       ; r1

    mov ax, [bp+6]
    mul word [bp+12]     ; xl * yh
    add [bx+2], ax       ; r1
    adc dx, 0
    mov [bx+4], dx       ; r2

    mov ax, [bp+8]
    mul word [bp+10]     ; xh * yl
    add [bx+2], ax
    adc [bx+4], dx       ; carry into the highest limb is possible here
    mov dx, 0            ; inefficient but doesn't affect FLAGS
    adc dx, 0            ; setc dl
    mov [bx+6], dx       ; r3

    mov ax, [bp+8]
    mul word [bp+12]     ; xh * yh
    add [bx+4], ax       ; r2
    adc [bx+6], dx       ; r3

    mov ax, bx           ; return result
    pop bx
    mov sp, bp
    pop bp
    ret

（更有效的方法可能是在添加之前将最后两次乘法的结果都保存在寄存器中，这样我们就可以避免存储然后执行内存目标 adc。）

免责声明：我刚刚向后移植了通常的 32 位约定，其中一个额外的隐藏参数用于指向调用者为结果保留的位置，该指针也被返回。此代码有效，但不知道 16 位编译器是否真的使用此约定。

Answer 2

我想，您的问题是缺少 SP 的算术函数，例如[sp + 4]。您可以改用 BP。在您自己的汇编函数中，您可以自由地传递参数和结果。我将向您展示一种通过堆栈传递参数并将结果放在堆栈上的方法：

BITS 16
ORG 0x0100

jmp start

multiplicand: dd 123122,0                   ; 0102 0x0001E0F2 -> 0x00000000
                                            ; 0106 0x00000000 -> 0x0001E0F2
multiplier:   dd 66341                      ; 010A 0x00010325 -> 0x00000000
result:       dd 0,0                        ; 010E 0x00000000 -> 0x0023B1F6
                                            ; 0112 0x00000000 -> 0x00000000

start:
            push word [multiplicand + 6]    ; bp + 22
            push word [multiplicand + 4]    ; bp + 20
            push word [multiplicand + 2]    ; bp + 18
            push word [multiplicand + 0]    ; bp + 16

            push word [multiplier + 2]      ; bp + 14
            push word [multiplier + 0]      ; bp + 12

            push word [result + 6]          ; bp + 10
            push word [result + 4]          ; bp + 8
            push word [result + 2]          ; bp + 6
            push word [result + 0]          ; bp + 4

            call sub_mul

            pop word [result + 0]           ; Pop stack into `result`
            pop word [result + 2]
            pop word [result + 4]
            pop word [result + 6]
            add sp, 12                      ; Clean up the rest of the stack                        ;

            mov ax, 0x4c00
            int 0x21

sub_mul:
            push bp                         ; Prolog
            mov bp, sp

initialize:   mov cl,32

              mov bl,1
checkbit:     test bl,[bp + 12]
              jz skip

multiply:     mov ax, [bp + 16]
              add [bp + 4],ax
              mov ax, [bp + 18]
              adc [bp + 6], ax
              mov ax, [bp + 20]
              adc [bp + 8], ax
              mov ax, [bp + 22]
              adc [bp + 10], ax

skip:         shl bl,1
              shr word [bp + 14],1
              rcr word [bp + 12],1

              shl word [bp + 16],1
              rcl word [bp + 18],1
              rcl word [bp + 20],1
              rcl word [bp + 22],1
              dec cl
              jnz checkbit

        leave                           ; Epilog
        ret

Answer 3

正如 Jester 指出的那样，您可以使用 4x mul 指令执行 32x32 => 64 位乘法，并使用适当的 add/adc 将部分乘积添加到 64 位结果中。（如果你没有对现有的非零 64 位值进行乘法累加，那么在进位不能传播那么远的情况下，可以优化掉一些 adc [result+6], 0。）

您需要所有 4 种产品的一半组合，
low * low、low * high /high * low 叉积和 high * high。（如果你只想要结果的低 32 位，宽度与输入相同，你不需要 high * high 部分：那将完全在高半部分。）

（在更现代的 x86-64 上，请参阅 https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf 以了解使用 64 位块的相同内容，以及如何使用 mulx（保持 CF 不变）以及 ADOX/ADCX 这对于具有超过 2 个输入块的较大产品变得有用。例如，512 位 x 512 位使用 64 位块，也就是四肢。白皮书有一些有用的图表，说明部分产品如何与重叠对齐。）

Jester 的答案针对可读性进行了优化，在每次乘法后将部分乘积存储/添加到内存中的结果中。

这是为了将它们保存在寄存器中，并避免一些冗余加载。（尽管其中一些可能以额外的机器代码大小为代价，这在原始 8086 上，尤其是 8088 上是昂贵的，除非在缓慢的 mul 指令的阴影下预取它。）我还进行了实验使用 stosw 存储结果并递增输出指针 (ES:DI)。我假设一个平面/小内存模型，其中 DS=ES。另一种选择是将结果存储在堆栈上的输入参数上，如果您将存储延迟到每个输入的最后一次加载之后。

如果不需要 save/restore，使用更多的寄存器会更有吸引力。我决定让 BX 被这个版本的函数销毁，并取一个已经在 DI 中的结果指针。（注册 arg）。在纯汇编语言中，使用自定义调用约定是有意义的，尤其是对于相对较小的辅助函数；显然，可以将更多 push/pop 添加到 save/restore 此代码使用的更多寄存器。

将第 3 次 mul 的 add/adc 工作延迟到最终 mul 之后似乎是一个很好的优化。

优化版本（63 字节：8088 / 8086 速度取决于大小）

我还尝试在 mul 之后安排快速指令（没有内存访问），以清空将在非常慢的 mul 期间填满的预取缓冲区。（假设我们正在针对实际 8088 进行调整，其中总成本 ~= 总字节数 stored/loaded，包括代码提取，除了像 mul 这样需要额外长时间的慢速指令，并给代码提取一个机会来填充4 字节缓冲区。)

;;; inputs:  result pointer in ES/DS:DI (unmodified on return), DS=ES assumed.
;;;          dword X and Y on the stack.
;;;          (For ease of comparison / adaptation, I left an unused word below them, where the result pointer was passed before)
      ;;to only ever use ES:DI, use an ES prefix on the [DI] addr modes, or just avoid STOSW and use (DS:) [DI+0..6] addressing.
;;; outputs: qword [ES:DI] = X * Y
;;; clobbers: AX, CX, DX,  BX.
;;;           SI saved/restored, DI restored.
;;; precondition: DF=0  (cld) assumed as part of the calling convention
;%define bp ebp
;%define di edi
;%define sp esp

multiply_v2:
    push  bp
    mov   bp, sp
    push  si
;%define bp ebp+2      ; saved-EBP is wider than saved-BP

    ;mov   di, [bp+4]       ; result pointer.  Instead, caller passes it

    mov   ax, [bp+6]       ; xl
    mov   si, ax           ; xl
    mov   cx, [bp+10]      ; yl

    MUL   cx               ;; xl * yl
    mov   bx, dx           ; r1   ; eventually store to [di-2 + 2]
    stosw                  ; r0;  ; mov  [di], ax  /  add di, 2

    mov   ax, [bp+8]       ; xh
    MUL   cx               ;; xh * yl
    xor   cx, cx
    add   ax, bx
    stosw                       ; r1 in mem, BX free, DI pointing at result+4 = r2
    adc   cx, dx                ; r2 in CX   (adc dx,0 / mov cx,dx)
    ; carry into r3 is impossible here: 0xffff^2 has a high half of 0xfffe

    mov   ax, [bp+12]      ; yh
    xchg  si, ax             ; save yh; we're done with xl.  (xchg-with-AX saves a byte vs. mov: worth it on 8088)
    MUL   si               ;; xl * yh
    mov   bx, dx             ; save xlyh_hi (r2)
    xchg  si, ax             ; save xlyh_lo (r1), get yh into AX for next mul

    MUL   word [bp+8]      ;; xh * yh  => r3:r2
    add   bx, cx           ; r2: combine partial products from r2:r1 cross-multiplies
    adc   dx, 0            ; r3: which can carry-out, into a real r3 instead of needing to materialize a 0 / 1

    add   [di-4 + 2], si   ; r1  at result+2, from 2nd cross-multiply
    adc   ax, bx           ; r2, *can* carry farther
    adc   dx, 0            ; r3

    stosw                  ; r2
    mov   [di], dx         ; r3 at [result+6] = [di-6 + 6]

    pop   si
    sub   di, 6            ; 3 bytes.  Still pays for itself in size savings of earlier stosw instructions vs. mov [di+4], on 8086/8088

;%define bp ebp
    ; mov sp, bp   ; redundant
    pop   bp
    ret

注释掉的 %define 行来自 Linux 下的 32 位模式测试，使用以下 C 调用程序。（它有效，包括一些极端情况，如 0xFFFFFFFF 平方）。 arg space 未使用的低 2 字节（DI 寄存器 arg 的“home space”？）在 32 位模式下由 return 地址的顶部填充，但 push ebp 也是 4 个字节，而不是 2 个字节，因此必须更改偏移量。

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

__attribute__((regparm(1)))
extern void multest_32b(uint64_t *result, uint32_t x, uint32_t y);

int main(int argc, char **argv) {
    unsigned x = 0x12345, y = 0xffbcde98;
    if (argc>=3) {
        x = strtoll(argv[1], NULL, 0);
        y = strtoll(argv[2], NULL, 0);
    }
    uint64_t test;
    multest_32b(&test, x,y);
    printf ("%#x * %#x = %#llx.  (test ^ control = %#llx)\n", x, y, test, test ^ (x * (uint64_t)y)  );
}

当它们匹配时，64 位 XOR 结果当然是 0，GCC 知道如何用一条 mul 指令进行 32x32 => 64 位乘法。

我使用了一个包装函数来适应调用约定的差异。我希望我可以在不复制堆栈参数的情况下逃脱，所以我将 XMM0 用于 save/restore EDI，并且当我意识到这行不通时没有改变它。我也让它破坏了调用者的 BX，但似乎 GCC 生成的代码不依赖于 EBX，即使它是 i386 System V 调用约定中的调用保留寄存器。我可能刚刚完成 mov edi, eax / jmp 因为这个主要可能也不使用 EDI。

;; wrapper function adapter for 32-bit C callers
global multest_32b              ; __attribute__((regparm(1)))
multest_32b:
    movd  xmm0, edi
    mov   edi, eax
    push  dword [esp+8]    ; bleh, can't tail-call if we want to restore (E)DI
    push  dword [esp+8]
    call  multiply_v2
    movd  edi, xmm0
    add   esp, 8
    ret

$ nasm -felf32 mul-ext.asm && gcc -m32 mul-test.c mul-ext.o
$ ./a.out
$ ./a.out 0x7feffeff 0xffeffeff 
0x7feffeff * 0xffeffeff = 0x7fe7ff7ea0210201.  (test ^ control = 0)
$ ./a.out 0xffffffff 0xffffffff 
0xffffffff * 0xffffffff = 0xfffffffe00000001.  (test ^ control = 0)

$ ./a.out 
0x12345 * 0xffbcde98 = 0x122f89eeec6f8.  (test ^ control = 0)
$ while ./a.out $(( $RANDOM * $RANDOM )) $(( $RANDOM * $RANDOM )) ;do :;done | fgrep -v 'control = 0)'
 ... runs until you ^C with no output, i.e. no error

64x64 => 128 位，在 32 位模式下，使用一些 SSE2

我想看看如果用 SSE2 pmuludq 做两个部分产品会是什么样子会很有趣。此版本未经测试。 GCC 在 64 位目标上有一个 unsigned __int128，因此相同的测试策略可以与从寄存器参数到堆栈参数的包装函数一起使用。

版本 1 使用标量进行叉积，另外两个使用 SIMD，使用向量存储/标量重新加载而不是一堆混洗，因为我们想用标量接触 4 个元素中的 3 个，并且该商店的低双字已完成。

未测试，i386 System V 调用约定。（纯堆栈参数、EAX、ECX、EDX 和 XMM regs 调用被破坏）。

global multiply_64x64_128_SSE2
multiply_64x64_128_SSE2:             ; (uint64_t x, uint64_t y,  uint128 *result)  stack args
    push  ebx
%define ARGS esp+4
    mov   eax,  [ARGS + 0]    ; Xl
    MUL   dword [ARGS + 12]   ;; Xl * Yh       ; first cross product

     ;; assume aligned args, like Linux's version of i386 System V
    pshufd  xmm0, [ARGS + 0], 0b11_01_10_00    ; [ Yh, Xh, Yl, Xl ]
    pshufd  xmm1, xmm0, 0b1111_0101            ; [ Yh, Yh, Yl, Yl ]
    pmuludq xmm0, xmm1                         ; [   Xh*Yh,   Xl*Yl ]  hi * hi and low x low 64-bit halves where they should be in result.

    mov   ebx, eax              ; r1
    mov   ecx, edx              ; r2
    mov   eax, [ARGS + 4]       ; Xh
    MUL   dword [ARGS + 12]     ;; Xh * Yh    r2:r1
    add   eax, ebx              ; r1
    adc   edx, ecx              ; r2
    mov   ecx, 0                      ; can't xor-zero to avoid partial-register stalls unless we had another free register. setc / movzx is better on P6-family, false dep otherwise.
                                      ; or mov ecx, [ebx+12] / adc ecx,0   now instead of memory-source adc later: good on SnB-family, except store-forwarding neds to be ready sooner.
    setc  cl                    ; r3 carry-out from r2 materialized without reloading SIMD result yet

    mov   ebx, [ARGS+16]        ; uint128 *result
    movdqu  [ebx], xmm0         ; vector store, then accumulate cross products into it.

    add   [ebx+4], eax          ; r1
    adc   edx, [ebx+8]          ; r2
     mov   [ebx+8], edx             ; memory-destination ADC is inefficient on Intel
    adc   ecx, [ebx+12]         ; r3
     mov   [ebx+12], ecx
    pop   ebx
    ret

备用 SSE2 版本：对包含 Yl 的两种产品使用 SIMD。与早期版本相比需要额外的空闲寄存器，并在关键路径中更早地重新加载 SIMD 存储。但是确实节省了 1 个标量负载 uop。

global multiply_64x64_128_SSE2_v2
multiply_64x64_128_SSE2_v2:             ; (uint64_t x, uint64_t y,  uint128 *result)  stack args
    push  ebx
    push  edi
%define ARGS esp+8
    mov   eax,  [ARGS + 12]    ; Yh
    mov   edi,  eax
    MUL   dword [ARGS + 0]     ;; Xl * Yh     r2:r1 cross product

     ;; assume aligned args, like Linux's version of i386 System V
    movdqu  xmm0, [ARGS + 0]                   ; [ Yh, Yl, Xh, Xl ]
        ;  pshufd  xmm0, [ARGS + 0], 0b11'01'10'00    ; [ Yh, Xh, Yl, Xl ]
    pshufd  xmm1, xmm0, 0b11_01_00_10          ; [ Yh, Xh, Xl, Yl ]      ; do both partial products involving Yl, using only 1 shuffle
    pmuludq xmm0, xmm1                         ; [   Xh*Yl(r2:r1),   Xl*Yl (r1:r0)]   ; low dword fully finished, and one cross product out of place

    mov   ebx, eax              ; r1
    mov   eax, [ARGS + 4]               ; Xh.  mul [mem] micro-fuses on Intel SnB-family, so this is equal to mov eax,edi / mul [mem] only needing 1 free reg.  But we need more later.
    mov   ecx, edx              ; r2
    MUL   edi                   ;; Xh * Yh    r3:r2

     mov   edi, [ARGS+16]        ; uint128 *result
     movdqu  [edi], xmm0         ; vector store, then accumulate partial products into it.

    add   ebx, [edi+8]          ; r1   (from SIMD cross product)
    adc   eax, [edi+12]         ; r2
    adc   edx, 0

    add   [edi+4], ebx          ; r1   (from SIMD low * low)
    adc   eax, ecx              ; r2
     mov   [edi+8], eax             ; memory-destination ADC is inefficient on Intel
    adc   edx, 0                ; r3
     mov   [edi+12], edx

    pop   ebx
    ret

adc 在 Broadwell（或）之前在 Intel 上是 2 微指令，所以我试图在它之后安排其他指令以获得更好的解码器吞吐量。但是任何一个版本都可以以牺牲可读性为代价进行更积极的调整（将相关操作分散更多，而不是将它们分组）。

它们也没有真正针对 P6 系列进行调整，其中部分寄存器停顿是一个问题。

MULX 对于有两个显式输出而不是隐式 EDX:EAX 真的很好，就像不触及 FLAGS 一样。将输入保存在同一个寄存器中并将几个不同的东西乘以它到不同的输出寄存器中将节省大量 mov 指令。我真的明白他们为什么那样设计它，而不是让输出之一成为隐式操作数。

通过堆栈的 32 位扩展乘法

32-bit extended multiplication via stack

x86

assembly

nasm

multiplication

bigint

优化版本（63 字节：8088 / 8086 速度取决于大小）

64x64 => 128 位，在 32 位模式下，使用一些 SSE2