在 x86 上对 32 位块实现类似学校的划分

Question

假设我有两个大数字（定义如下），并且我想通过后退对他们实施分裂到 x86 可用算法

0008768376 - 1653656387 - 0437673667 - 0123767614 - 1039873878 - 2231712290 / 0038768167 - 3276287672 - 1665265628

C=A/B

这些数字存储为 32 位无符号整数的向量。第一个，A，是 6-unsigned-int 向量，B 是 3-unsigned-int 长向量 [我将此字段中的每一个命名为广义 'digit' 或 'field' 靠我自己]

生成的 C 将是一些 3-unsigned-int 向量但要计算它我需要退回到一些可用的 x86（32 位模式，虽然我也听说过 x64 也是，但这是次要的）算术

告诉我怎么算最少第一，最重要 C 结果向量的字段..

怎么做？

Answer 1

GMP's docs include a section on the algorithms it uses，包括除法。在 GMP 术语中，每个 32b 块都称为 "limbs".

64 位 CPUs 非常适合扩展精度数学，因为它们一次处理两倍的数量，每次操作的时间大致相同。如果你想自己实现扩展精度，而不是使用 LGPLed GMP library，我建议使用 x86-64 asm 并回退到 C。（除非你想将它用作你想要发布的东西的一部分仅作为旧版 32 位二进制文件。）

不过除法是这个规则的例外：在 Intel 和 AMD 最近的 x86 设计中，128b / 64b = 64b 的除法比 64b / 32b = 32b 的除法具有更差的延迟和更低的吞吐量。请参阅 http://agner.org/optimize/，在说明表中查找 div。 idiv 是有符号除法，div 是无符号除法。）相比之下，ADC（带进位加法）具有每周期 1 个吞吐量，与 64 位 MUL（和 IMUL).

实际上，至少在 Intel Sandybridge 系列上，MUL/IMUL 64 位 * 64 位 = 128 位比 IMUL/MUL 32 位 * 32 位 = 64 位快。 mul 32bit 是 3 微指令，每 2 个周期一个吞吐量，而 mul 64 位是 2 微指令，每个周期一个吞吐量。 mul 的单操作数形式执行 rdx:rax = rax * src。我想需要一个额外的周期来 split/shuffle 乘法器的 64 位结果进入 edx:eax，它被设置为产生 128b 结果的两个 64 位一半。

在 AMD Bulldozer 系列 CPU 上，32 位 mul 比 64 位 mul 快。我猜他们没有全宽乘法器硬件。

（对于编译器的正常使用，c = a*b 通常对所有变量具有相同的宽度，因此可以丢弃结果的上半部分。x86 具有 dest *= src 双操作数形式imul（但不是 mul），这与最快的单操作数形式的速度相同。所以不要担心使用 longs，它们会在正常代码中乘以更快.)

这适用于 CPU 是运行 32 位或 64 位代码，除了 32 位代码不能进行 64 位操作。

你问的是 div，我跑题了。 x86的div指令做rdx:rax/src，输出rax=商，rdx=余数。或者 edx:eax ... 对于 32 位版本。

Answer 2

这是来自 wikipedia 的任意大小操作数的无符号长除法伪代码：

if D == 0 then
    error(DivisionByZeroException) end
Q := 0                 -- initialize quotient and remainder to zero
R := 0                     
for i = n-1...0 do     -- where n is number of bits in N
  R := R << 1          -- left-shift R by 1 bit
  R(0) := N(i)         -- set the least-significant bit of R equal to bit i of the numerator
  if R >= D then
    R := R - D
    Q(i) := 1
  end
end

这也可以扩展为包括有符号除法（将结果四舍五入为零，而不是负无穷大）：

if D == 0 then
    error(DivisionByZeroException) end
Q := 0                 -- initialize quotient and remainder to zero
R := 0  
SaveN := N             -- save numerator
SaveD := D             -- save denominator
if N < 0 then N = -N   -- invert numerator if negative
if D < 0 then D = -D   -- invert denominator if negative
for i = n-1...0 do     -- where n is number of bits in N
  R := R << 1          -- left-shift R by 1 bit
  R(0) := N(i)         -- set the least-significant bit of R equal to bit i of the numerator
  if R >= D then
    R := R - D
    Q(i) := 1
  end
end
if SaveN < 0 then
    R = -R             -- numerator was negative, negative remainder
if (SaveN < 0 and SaveD >= 0) or (SaveN >= 0 and SaveD < 0) then
    Q = -Q             -- differing signs of inputs, result is negative

这是一个相对简单、未优化、未经测试的 x86 ASM 实现（NASM 语法）应该很容易理解：

        ; function div_192_96
        ; parameters:
        ;                 24 bytes: numerator, high words are stored after low words
        ;                 24 bytes: denominator, high words are stored after low words (only low 12 bytes are used)
        ;                  4 bytes: address to store 12 byte remainder in (must not be NULL)
        ;                  4 bytes: address to store 12 byte quotient in (must not be NULL)
        ; return value:   none
        ; error checking: none

        GLOBAL  div_192_96
div_192_96:
        pushl   ebp             ; set up stack frame
        movl    ebp, esp
        pushl   0               ; high word of remainder
        pushl   0               ; middle word of remainder
        pushl   0               ; low word of remainder
        pushl   0               ; high word of quotient
        pushl   0               ; middle word of quotient
        pushl   0               ; low word of quotient
        movl    ecx, 96
.div_loop:
        jecxz   .div_loop_done
        decl    ecx
        ; remainder = remainder << 1
        movl    eax, [ebp-8]    ; load middle word of remainder
        shld    [ebp-4], eax, 1 ; shift high word of remainder left by 1
        movl    eax, [ebp-12]   ; load low word of remainder
        shld    [ebp-8], eax, 1 ; shift middle word of remainder left by 1
        shll    [ebp-12], 1     ; shift low word of remainder left by 1
        ; quotient = quotient << 1
        movl    eax, [ebp-20]   ; load middle word of remainder
        shld    [ebp-16], eax, 1; shift high word of remainder left by 1
        movl    eax, [ebp-24]   ; load low word of remainder
        shld    [ebp-20], eax, 1; shift middle word of remainder left by 1
        shll    [ebp-24], 1     ; shift low word of remainder left by 1
        ; remainder(0) = numerator(127)
        movl    eax, [ebp+28]   ; load high word of numerator
        shrl    eax, 31         ; get top bit in bit 0
        orl     [ebp-12], eax   ; OR into low word of remainder
        ; numerator = numerator << 1
        movl    eax, [ebp+24]   ; load 5th word of numerator
        shld    [ebp+28], eax, 1; shift 6th word of numerator left by 1
        movl    eax, [ebp+20]   ; load 4th word of numerator
        shld    [ebp+24], eax, 1; shift 5th word of numerator left by 1
        movl    eax, [ebp+16]   ; load 3rd word of numerator
        shld    [ebp+20], eax, 1; shift 4th word of numerator left by 1
        movl    eax, [ebp+12]   ; load 2nd word of numerator
        shld    [ebp+16], eax, 1; shift 3rd word of numerator left by 1
        movl    eax, [ebp+8]    ; load 1st word of numerator
        shld    [ebp+12], eax, 1; shift 2nd word of numerator left by 1
        shll    [ebp+8], 1      ; shift 1st word of numerator left by 1
        ; if (remainder >= denominator)
        movl    eax, [ebp+40]   ; compare high word of denominator
        cmpl    eax, [ebp-4]    ; with high word of remainder
        jb      .div_loop
        ja      .div_subtract
        movl    eax, [ebp+36]   ; compare middle word of denominator
        cmpl    eax, [ebp-8]    ; with middle word of remainder
        jb      .div_loop
        ja      .div_subtract
        movl    eax, [ebp+32]   ; compare low word of denominator
        cmpl    eax, [ebp-12]   ; with low word of remainder
        jb      .div_loop
.div_subtract:
        ; remainder = remainder - denominator
        movl    eax, [ebp+32]   ; load low word of denominator
        subl    [ebp-12], eax   ; and subtract from low word of remainder
        movl    eax, [ebp+36]   ; load middle word of denominator
        sbbl    [ebp-8], eax    ; and subtract from middle word of remainder (with borrow)
        movl    eax, [ebp+40]   ; load high word of denominator
        sbbl    [ebp-4], eax    ; and subtract from high word of remainder (with borrow)
        ; quotient(0) = 1
        orl     [ebp-24], 1     ; OR 1 into low word of quotient
        jmp     .div_loop
.div_loop_done:
        movl    eax, [ebp+56]   ; load remainder storage pointer
        movl    edx, [ebp-12]   ; load low word of remainder
        movl    [eax+0], edx    ; store low word of remainder
        movl    edx, [ebp-8]    ; load middle word of remainder
        movl    [eax+4], edx    ; store middle word of remainder
        movl    edx, [ebp-4]    ; load high word of remainder
        movl    [eax+8], edx    ; store high word of remainder
        movl    eax, [ebp+60]   ; load quotient storage pointer
        movl    edx, [ebp-24]   ; load low word of quotient
        movl    [eax+0], edx    ; store low word of quotient
        movl    edx, [ebp-20]   ; load middle word of quotient
        movl    [eax+4], edx    ; store middle word of quotient
        movl    edx, [ebp-16]   ; load high word of quotient
        movl    [eax+8], edx    ; store high word of quotient
        addl    esp, 24
        popl    ebp
        ret

请注意，这只是给你一个大概的想法，并没有经过测试。顺便说一句，在程序集中计算两个大小相等的数字的商可能比尝试解决溢出问题（在上面的代码中完全未处理）更容易。

在 x86 上对 32 位块实现类似学校的划分

implementing school-like division on 32bit chunks on x86

c

algorithm

x86

division

integer-arithmetic