使用 SSE 模拟标准 Math.pow 函数

Using SSE to mimic the standard Math.pow function

我正在学习如何使用 SSE,于是决定写一段简单的代码来计算 n^d,并把它实现为一个由 C 程序调用的函数。

这是我的 NASM 代码:

section .data

; NUL-terminated printf format string: three %d conversions (base, exponent,
; result) and no trailing newline.
resmsg:     db      '%d^%d = %d', 0

; No uninitialized data is declared in this section.
section .bss

section .text

; printf is not defined in this file; the symbol is resolved at link time.
extern printf


; ------------------------------------------------------------
; main - intended to print "n^d = <n to the power d>" through printf.
;
; Called from a C program. Six dword arguments are read from the stack at
; [ebp+8] .. [ebp+28]; only n and d take part in the computation, the other
; four are loaded into registers and never used afterwards.
;
; Locals:  [ebp-8] = saved copy of d, [ebp-4] = scratch slot for the result.
; Uses:    eax, ebx, ecx, edx, esi, edi, xmm0, xmm1.
;
; NOTE(review): this listing is kept exactly as posted; the NOTE(review)
; comments below mark the places where it does not do what was intended.
; ------------------------------------------------------------

global main

; Offsets of the stack arguments relative to ebp (after the prologue below).
T       equ     8
n       equ     12
d       equ     16
m       equ     20
Sid     equ     24
Sn      equ     28

main:
    ; ------------------------------------------------------------
    ; Function enter sequence
    ; ------------------------------------------------------------
    push    ebp             ; save the caller's frame pointer
    mov     ebp, esp        ; establish this function's frame
    sub     esp, 8          ; reserve two dword locals: [ebp-4] and [ebp-8]
    push    ebx             ; saved at [ebp-12]
    push    esi             ; saved at [ebp-16]
    push    edi             ; saved at [ebp-20]

    ; ------------------------------------------------------------
    ; Copy the stack arguments into registers
    ; ------------------------------------------------------------
    mov     eax, [ebp+T]        ; T   (unused; eax is overwritten before printf)
    mov     ebx, [ebp+n]        ; n   (kept in ebx and later pushed as the printed base)
    mov     ecx, [ebp+d]        ; d   (used as the loop counter)
    mov     edx, [ebp+m]        ; m   (unused)
    mov     esi, [ebp+Sid]      ; Sid (unused)
    mov     edi, [ebp+Sn]       ; Sn  (unused)
    mov     [ebp-8], ecx        ; keep a copy of d, because the loop below destroys ecx

    ;
    ; Power loop: xmm0 is multiplied by xmm1 once per iteration, ecx times.
    ;
    ; NOTE(review): movss copies the 32 bits at [ebp+n] unchanged and treats
    ; them as a single-precision float. If the caller passes n as a C int
    ; (the %d in resmsg suggests so - confirm), the integer is not converted:
    ; 4 becomes the denormal 0x00000004 (about 5.6e-45), and its products
    ; underflow to 0.0. cvtsi2ss performs an actual int -> float conversion.
    ;
    movss   xmm0, [ebp+n]   ; accumulator, starts as the base
    movss   xmm1, [ebp+n]   ; multiplier, a second copy of the base

    ; NOTE(review): xmm0 already holds n^1 on entry and is multiplied d more
    ; times, so the loop yields n^(d+1). With d = 0 the dec wraps ecx to
    ; 0xFFFFFFFF and the loop runs 2^32 times.
loop:   mulss   xmm0, xmm1      ; accumulator *= base (scalar single precision)
        dec     ecx             ; counter--
        cmp     ecx,0           ; redundant: dec has already set ZF from the new ecx value
        jnz     loop            ; repeat until the counter reaches zero

    ;
    ; Move the result from xmm0 to eax through the scratch local.
    ; NOTE(review): this copies the IEEE-754 bit pattern of the float; no
    ; float -> int conversion happens here (cvtss2si would convert).
    ;
    movss   [ebp-4], xmm0       ; spill the single-precision result
    mov     eax, [ebp-4]        ; eax = raw float bits

    ;
    ; Print using C's printf: arguments are pushed right to left.
    ;
    push    eax                 ; result (printed with %d)
    mov     ecx, [ebp-8]        ; reload the original d, since ecx was the loop counter
    push    ecx                 ; exponent
    push    ebx                 ; base
    push    resmsg              ; address of the format string
    call    printf              ; printf call
    ; NOTE(review): the four pushes above occupy 16 bytes. Adding 24 also
    ; skips the saved edi and esi slots, so the pops below load edi from the
    ; saved-ebx slot, esi from [ebp-8] and ebx from [ebp-4].
    add     esp, 24             ; drop printf's arguments (plus 8 bytes too many, see note)

    ; ------------------------------------------------------------
    ; Function exit sequence
    ; ------------------------------------------------------------

    pop edi                     ; intended to restore the saved registers (see note above)
    pop esi
    pop ebx
    mov esp, ebp                ; release the locals; esp is correct again from here on
    pop ebp                     ; restore the caller's frame pointer
    ret                         ; get back to C program

现在,我希望打印出来

4^2 = 16

但是,相反,我得到了

4^2 = 0

我花了整个下午的时间来解决这个问题,我找不到解决办法,你有什么提示吗?

编辑:

由于怀疑是格式问题,我尝试用下面的代码来转换数据:

movss   [ebp-4], xmm0       ; spill the low 32 bits of xmm0 (a single-precision float) to the local slot
fld     dword [ebp-4]       ; push that float onto the x87 register stack; nothing in this snippet reads it back
mov     eax, dword [ebp-4]  ; eax still receives the raw 32-bit float bit pattern from memory

而不是

movss   [ebp-4], xmm0       ; store the single-precision result in the local slot
mov     eax, [ebp-4]        ; reload the same 32 bits into eax, with no float -> int conversion

但我得到了相同的结果。

MOVSS 移动的是单精度浮点数(32 位)。我假设 n 是一个整数,所以不能用 MOVSS 把它加载到 XMM 寄存器中,请改用 CVTSI2SS。另外,printf 无法处理单精度浮点数:在 C 中,float 实参会被编译器转换为 double 后再传入。因此这里用 CVTSS2SI 把结果转换回整数会更方便。所以代码应该是这样的:

...
    ;
    ; pow: start from 1.0 and multiply by n exactly d times.
    ;
    ; In:   ecx      = d (loaded from [ebp+d] earlier in the function)
    ;       [ebp+n]  = n as a 32-bit signed integer
    ;       ebx      = n, [ebp-8] = saved copy of d (both set up earlier)
    ; Out:  eax      = n^d converted back to a 32-bit integer
    ; Uses: eax, ecx, xmm0, xmm1, flags
    ;
    ; Starting the accumulator at n^0 instead of n^1 keeps the counter from
    ; wrapping below zero, so d = 0 and d = 1 terminate correctly.
    ; Negative exponents are not supported: for d <= 0 the result stays 1.
    ; Single precision represents integers exactly only up to 2^24, so
    ; larger results are rounded.
    ;

    cvtsi2ss xmm1, dword [ebp+n]    ; xmm1 = (float)n, the multiplier
    mov     eax, 1                  ; eax is free here: it is overwritten by the result below
    cvtsi2ss xmm0, eax              ; xmm0 = 1.0f, i.e. n^0
    test    ecx, ecx
    jle     .pow_done               ; d <= 0: no multiplications to perform

.pow_loop:
    mulss   xmm0, xmm1              ; accumulator *= n
    sub     ecx, 1                  ; one multiplication fewer to go; sets ZF at zero
    jnz     .pow_loop

.pow_done:
    cvtss2si eax, xmm0              ; convert the float result to an integer for %d

    ;
    ; Print using C's printf: arguments are pushed right to left.
    ;
    push    eax                 ; result
    mov     ecx, [ebp-8]        ; reload the original d, since ecx was the loop counter
    push    ecx                 ; exponent
    push    ebx                 ; base
    push    resmsg              ; address of the format string
    call    printf              ; printf call
    add     esp, 16             ; drop only the four dwords pushed for printf
...