在 x86 上对 32 位块实现类似学校的划分
implementing school-like division on 32bit chunks on x86
假设我有两个大数字(定义如下),并且
我想通过后退对他们实施分裂
到 x86 可用算法
0008768376 - 1653656387 - 0437673667 - 0123767614 - 1039873878 - 2231712290
/
0038768167 - 3276287672 - 1665265628
C=A/B
这些数字存储为 32 位无符号整数的向量。
第一个,A,是 6-unsigned-int 向量,B 是 3-unsigned-int
长向量 [我将此字段中的每一个命名为广义 'digit' 或 'field'
靠我自己]
生成的 C 将是一些 3-unsigned-int 向量
但要计算它我需要退回到一些可用的
x86(32 位模式,虽然我也听说过 x64
也是,但这是次要的)算术
告诉我怎么算最少第一,最重要
C 结果向量的字段..
怎么做?
GMP's docs include a section on the algorithms it uses,包括除法。在 GMP 术语中,每个 32b 块都称为 "limbs".
64 位 CPUs 非常适合扩展精度数学,因为它们一次处理两倍的数量,每次操作的时间大致相同。如果你想自己实现扩展精度,而不是使用 LGPLed GMP library,我建议使用 x86-64 asm 并回退到 C。(除非你想将它用作你想要发布的东西的一部分仅作为旧版 32 位二进制文件。)
不过除法是这个规则的例外:在 Intel 和 AMD 最近的 x86 设计中,128b / 64b = 64b 的除法比 64b / 32b = 32b 的除法具有更差的延迟和更低的吞吐量。请参阅 http://agner.org/optimize/,在说明表中查找 div
。 idiv
是有符号除法,div
是无符号除法。)相比之下,ADC
(带进位加法)具有每周期 1 个吞吐量,与 64 位 MUL
(和 IMUL
).
实际上,至少在 Intel Sandybridge 系列上,MUL/IMUL
64 位 * 64 位 = 128 位比 IMUL/MUL
32 位 * 32 位 = 64 位快。 mul 32bit 是 3 微指令,每 2 个周期一个吞吐量,而 mul 64 位是 2 微指令,每个周期一个吞吐量。 mul
的单操作数形式执行 rdx:rax = rax * src
。我想需要一个额外的周期来 split/shuffle 乘法器的 64 位结果进入 edx:eax,它被设置为产生 128b 结果的两个 64 位一半。
在 AMD Bulldozer 系列 CPU 上,32 位 mul 比 64 位 mul 快。我猜他们没有全宽乘法器硬件。
(对于编译器的正常使用,c = a*b
通常对所有变量具有相同的宽度,因此可以丢弃结果的上半部分。x86 具有 dest *= src
双操作数形式imul
(但不是 mul
),这与最快的单操作数形式的速度相同。所以不要担心使用 long
s,它们会在正常代码中乘以更快.)
这适用于 CPU 是 运行 32 位或 64 位代码,除了 32 位代码不能进行 64 位操作。
你问的是 div
,我跑题了。 x86的div
指令做rdx:rax/src,输出rax=商,rdx=余数。或者 edx:eax ... 对于 32 位版本。
这是来自 wikipedia 的任意大小操作数的无符号长除法伪代码:
if D == 0 then
error(DivisionByZeroException) end
Q := 0 -- initialize quotient and remainder to zero
R := 0
for i = n-1...0 do -- where n is number of bits in N
R := R << 1 -- left-shift R by 1 bit
R(0) := N(i) -- set the least-significant bit of R equal to bit i of the numerator
if R >= D then
R := R - D
Q(i) := 1
end
end
这也可以扩展为包括有符号除法(将结果四舍五入为零,而不是负无穷大):
if D == 0 then
error(DivisionByZeroException) end
Q := 0 -- initialize quotient and remainder to zero
R := 0
SaveN := N -- save numerator
SaveD := D -- save denominator
if N < 0 then N = -N -- invert numerator if negative
if D < 0 then D = -D -- invert denominator if negative
for i = n-1...0 do -- where n is number of bits in N
R := R << 1 -- left-shift R by 1 bit
R(0) := N(i) -- set the least-significant bit of R equal to bit i of the numerator
if R >= D then
R := R - D
Q(i) := 1
end
end
if SaveN < 0 then
R = -R -- numerator was negative, negative remainder
if (SaveN < 0 and SaveD >= 0) or (SaveN >= 0 and SaveD < 0) then
Q = -Q -- differing signs of inputs, result is negative
这是一个相对简单、未优化、未经测试的 x86 ASM 实现(NASM 语法)应该很容易理解:
; function div_192_96
; parameters:
; 24 bytes: numerator, high words are stored after low words
; 24 bytes: denominator, high words are stored after low words (only low 12 bytes are used)
; 4 bytes: address to store 12 byte remainder in (must not be NULL)
; 4 bytes: address to store 12 byte quotient in (must not be NULL)
; return value: none
; error checking: none
GLOBAL div_192_96
div_192_96:
pushl ebp ; set up stack frame
movl ebp, esp
pushl 0 ; high word of remainder
pushl 0 ; middle word of remainder
pushl 0 ; low word of remainder
pushl 0 ; high word of quotient
pushl 0 ; middle word of quotient
pushl 0 ; low word of quotient
movl ecx, 96
.div_loop:
jecxz .div_loop_done
decl ecx
; remainder = remainder << 1
movl eax, [ebp-8] ; load middle word of remainder
shld [ebp-4], eax, 1 ; shift high word of remainder left by 1
movl eax, [ebp-12] ; load low word of remainder
shld [ebp-8], eax, 1 ; shift middle word of remainder left by 1
shll [ebp-12], 1 ; shift low word of remainder left by 1
; quotient = quotient << 1
movl eax, [ebp-20] ; load middle word of remainder
shld [ebp-16], eax, 1; shift high word of remainder left by 1
movl eax, [ebp-24] ; load low word of remainder
shld [ebp-20], eax, 1; shift middle word of remainder left by 1
shll [ebp-24], 1 ; shift low word of remainder left by 1
; remainder(0) = numerator(127)
movl eax, [ebp+28] ; load high word of numerator
shrl eax, 31 ; get top bit in bit 0
orl [ebp-12], eax ; OR into low word of remainder
; numerator = numerator << 1
movl eax, [ebp+24] ; load 5th word of numerator
shld [ebp+28], eax, 1; shift 6th word of numerator left by 1
movl eax, [ebp+20] ; load 4th word of numerator
shld [ebp+24], eax, 1; shift 5th word of numerator left by 1
movl eax, [ebp+16] ; load 3rd word of numerator
shld [ebp+20], eax, 1; shift 4th word of numerator left by 1
movl eax, [ebp+12] ; load 2nd word of numerator
shld [ebp+16], eax, 1; shift 3rd word of numerator left by 1
movl eax, [ebp+8] ; load 1st word of numerator
shld [ebp+12], eax, 1; shift 2nd word of numerator left by 1
shll [ebp+8], 1 ; shift 1st word of numerator left by 1
; if (remainder >= denominator)
movl eax, [ebp+40] ; compare high word of denominator
cmpl eax, [ebp-4] ; with high word of remainder
jb .div_loop
ja .div_subtract
movl eax, [ebp+36] ; compare middle word of denominator
cmpl eax, [ebp-8] ; with middle word of remainder
jb .div_loop
ja .div_subtract
movl eax, [ebp+32] ; compare low word of denominator
cmpl eax, [ebp-12] ; with low word of remainder
jb .div_loop
.div_subtract:
; remainder = remainder - denominator
movl eax, [ebp+32] ; load low word of denominator
subl [ebp-12], eax ; and subtract from low word of remainder
movl eax, [ebp+36] ; load middle word of denominator
sbbl [ebp-8], eax ; and subtract from middle word of remainder (with borrow)
movl eax, [ebp+40] ; load high word of denominator
sbbl [ebp-4], eax ; and subtract from high word of remainder (with borrow)
; quotient(0) = 1
orl [ebp-24], 1 ; OR 1 into low word of quotient
jmp .div_loop
.div_loop_done:
movl eax, [ebp+56] ; load remainder storage pointer
movl edx, [ebp-12] ; load low word of remainder
movl [eax+0], edx ; store low word of remainder
movl edx, [ebp-8] ; load middle word of remainder
movl [eax+4], edx ; store middle word of remainder
movl edx, [ebp-4] ; load high word of remainder
movl [eax+8], edx ; store high word of remainder
movl eax, [ebp+60] ; load quotient storage pointer
movl edx, [ebp-24] ; load low word of quotient
movl [eax+0], edx ; store low word of quotient
movl edx, [ebp-20] ; load middle word of quotient
movl [eax+4], edx ; store middle word of quotient
movl edx, [ebp-16] ; load high word of quotient
movl [eax+8], edx ; store high word of quotient
addl esp, 24
popl ebp
ret
请注意,这只是给你一个大概的想法,并没有经过测试。顺便说一句,在程序集中计算两个大小相等的数字的商可能比尝试解决溢出问题(在上面的代码中完全未处理)更容易。
假设我有两个大数字(定义如下),并且 我想通过后退对他们实施分裂 到 x86 可用算法
0008768376 - 1653656387 - 0437673667 - 0123767614 - 1039873878 - 2231712290 / 0038768167 - 3276287672 - 1665265628
C=A/B
这些数字存储为 32 位无符号整数的向量。 第一个,A,是 6-unsigned-int 向量,B 是 3-unsigned-int 长向量 [我将此字段中的每一个命名为广义 'digit' 或 'field' 靠我自己]
生成的 C 将是一些 3-unsigned-int 向量 但要计算它我需要退回到一些可用的 x86(32 位模式,虽然我也听说过 x64 也是,但这是次要的)算术
告诉我怎么算最少第一,最重要 C 结果向量的字段..
怎么做?
GMP's docs include a section on the algorithms it uses,包括除法。在 GMP 术语中,每个 32b 块都称为 "limbs".
64 位 CPUs 非常适合扩展精度数学,因为它们一次处理两倍的数量,每次操作的时间大致相同。如果你想自己实现扩展精度,而不是使用 LGPLed GMP library,我建议使用 x86-64 asm 并回退到 C。(除非你想将它用作你想要发布的东西的一部分仅作为旧版 32 位二进制文件。)
不过除法是这个规则的例外:在 Intel 和 AMD 最近的 x86 设计中,128b / 64b = 64b 的除法比 64b / 32b = 32b 的除法具有更差的延迟和更低的吞吐量。请参阅 http://agner.org/optimize/,在说明表中查找 div
。 idiv
是有符号除法,div
是无符号除法。)相比之下,ADC
(带进位加法)具有每周期 1 个吞吐量,与 64 位 MUL
(和 IMUL
).
实际上,至少在 Intel Sandybridge 系列上,MUL/IMUL
64 位 * 64 位 = 128 位比 IMUL/MUL
32 位 * 32 位 = 64 位快。 mul 32bit 是 3 微指令,每 2 个周期一个吞吐量,而 mul 64 位是 2 微指令,每个周期一个吞吐量。 mul
的单操作数形式执行 rdx:rax = rax * src
。我想需要一个额外的周期来 split/shuffle 乘法器的 64 位结果进入 edx:eax,它被设置为产生 128b 结果的两个 64 位一半。
在 AMD Bulldozer 系列 CPU 上,32 位 mul 比 64 位 mul 快。我猜他们没有全宽乘法器硬件。
(对于编译器的正常使用,c = a*b
通常对所有变量具有相同的宽度,因此可以丢弃结果的上半部分。x86 具有 dest *= src
双操作数形式imul
(但不是 mul
),这与最快的单操作数形式的速度相同。所以不要担心使用 long
s,它们会在正常代码中乘以更快.)
这适用于 CPU 是 运行 32 位或 64 位代码,除了 32 位代码不能进行 64 位操作。
你问的是 div
,我跑题了。 x86的div
指令做rdx:rax/src,输出rax=商,rdx=余数。或者 edx:eax ... 对于 32 位版本。
这是来自 wikipedia 的任意大小操作数的无符号长除法伪代码:
if D == 0 then
error(DivisionByZeroException) end
Q := 0 -- initialize quotient and remainder to zero
R := 0
for i = n-1...0 do -- where n is number of bits in N
R := R << 1 -- left-shift R by 1 bit
R(0) := N(i) -- set the least-significant bit of R equal to bit i of the numerator
if R >= D then
R := R - D
Q(i) := 1
end
end
这也可以扩展为包括有符号除法(将结果四舍五入为零,而不是负无穷大):
if D == 0 then
error(DivisionByZeroException) end
Q := 0 -- initialize quotient and remainder to zero
R := 0
SaveN := N -- save numerator
SaveD := D -- save denominator
if N < 0 then N = -N -- invert numerator if negative
if D < 0 then D = -D -- invert denominator if negative
for i = n-1...0 do -- where n is number of bits in N
R := R << 1 -- left-shift R by 1 bit
R(0) := N(i) -- set the least-significant bit of R equal to bit i of the numerator
if R >= D then
R := R - D
Q(i) := 1
end
end
if SaveN < 0 then
R = -R -- numerator was negative, negative remainder
if (SaveN < 0 and SaveD >= 0) or (SaveN >= 0 and SaveD < 0) then
Q = -Q -- differing signs of inputs, result is negative
这是一个相对简单、未优化、未经测试的 x86 ASM 实现(NASM 语法)应该很容易理解:
; function div_192_96
; parameters:
; 24 bytes: numerator, high words are stored after low words
; 24 bytes: denominator, high words are stored after low words (only low 12 bytes are used)
; 4 bytes: address to store 12 byte remainder in (must not be NULL)
; 4 bytes: address to store 12 byte quotient in (must not be NULL)
; return value: none
; error checking: none
GLOBAL div_192_96
div_192_96:
pushl ebp ; set up stack frame
movl ebp, esp
pushl 0 ; high word of remainder
pushl 0 ; middle word of remainder
pushl 0 ; low word of remainder
pushl 0 ; high word of quotient
pushl 0 ; middle word of quotient
pushl 0 ; low word of quotient
movl ecx, 96
.div_loop:
jecxz .div_loop_done
decl ecx
; remainder = remainder << 1
movl eax, [ebp-8] ; load middle word of remainder
shld [ebp-4], eax, 1 ; shift high word of remainder left by 1
movl eax, [ebp-12] ; load low word of remainder
shld [ebp-8], eax, 1 ; shift middle word of remainder left by 1
shll [ebp-12], 1 ; shift low word of remainder left by 1
; quotient = quotient << 1
movl eax, [ebp-20] ; load middle word of remainder
shld [ebp-16], eax, 1; shift high word of remainder left by 1
movl eax, [ebp-24] ; load low word of remainder
shld [ebp-20], eax, 1; shift middle word of remainder left by 1
shll [ebp-24], 1 ; shift low word of remainder left by 1
; remainder(0) = numerator(127)
movl eax, [ebp+28] ; load high word of numerator
shrl eax, 31 ; get top bit in bit 0
orl [ebp-12], eax ; OR into low word of remainder
; numerator = numerator << 1
movl eax, [ebp+24] ; load 5th word of numerator
shld [ebp+28], eax, 1; shift 6th word of numerator left by 1
movl eax, [ebp+20] ; load 4th word of numerator
shld [ebp+24], eax, 1; shift 5th word of numerator left by 1
movl eax, [ebp+16] ; load 3rd word of numerator
shld [ebp+20], eax, 1; shift 4th word of numerator left by 1
movl eax, [ebp+12] ; load 2nd word of numerator
shld [ebp+16], eax, 1; shift 3rd word of numerator left by 1
movl eax, [ebp+8] ; load 1st word of numerator
shld [ebp+12], eax, 1; shift 2nd word of numerator left by 1
shll [ebp+8], 1 ; shift 1st word of numerator left by 1
; if (remainder >= denominator)
movl eax, [ebp+40] ; compare high word of denominator
cmpl eax, [ebp-4] ; with high word of remainder
jb .div_loop
ja .div_subtract
movl eax, [ebp+36] ; compare middle word of denominator
cmpl eax, [ebp-8] ; with middle word of remainder
jb .div_loop
ja .div_subtract
movl eax, [ebp+32] ; compare low word of denominator
cmpl eax, [ebp-12] ; with low word of remainder
jb .div_loop
.div_subtract:
; remainder = remainder - denominator
movl eax, [ebp+32] ; load low word of denominator
subl [ebp-12], eax ; and subtract from low word of remainder
movl eax, [ebp+36] ; load middle word of denominator
sbbl [ebp-8], eax ; and subtract from middle word of remainder (with borrow)
movl eax, [ebp+40] ; load high word of denominator
sbbl [ebp-4], eax ; and subtract from high word of remainder (with borrow)
; quotient(0) = 1
orl [ebp-24], 1 ; OR 1 into low word of quotient
jmp .div_loop
.div_loop_done:
movl eax, [ebp+56] ; load remainder storage pointer
movl edx, [ebp-12] ; load low word of remainder
movl [eax+0], edx ; store low word of remainder
movl edx, [ebp-8] ; load middle word of remainder
movl [eax+4], edx ; store middle word of remainder
movl edx, [ebp-4] ; load high word of remainder
movl [eax+8], edx ; store high word of remainder
movl eax, [ebp+60] ; load quotient storage pointer
movl edx, [ebp-24] ; load low word of quotient
movl [eax+0], edx ; store low word of quotient
movl edx, [ebp-20] ; load middle word of quotient
movl [eax+4], edx ; store middle word of quotient
movl edx, [ebp-16] ; load high word of quotient
movl [eax+8], edx ; store high word of quotient
addl esp, 24
popl ebp
ret
请注意,这只是给你一个大概的想法,并没有经过测试。顺便说一句,在程序集中计算两个大小相等的数字的商可能比尝试解决溢出问题(在上面的代码中完全未处理)更容易。