删除边界检查的 F# 可破坏数组迭代?

F# Breakable Array Iteration With Bounds Checking Elided?

我有兴趣在高性能应用程序中试用 F#。我不想在迭代期间检查大数组的边界,并且缺少 break/return 语句令人担忧。

这是一个人为的例子,在找到一个值时会中断,但是有人可以告诉我边界检查是否也被省略了吗?

let innerExists (item: Char) (items: Char array): bool = 
    let mutable state = false
    let mutable i = 0
    while not state && i < items.Length do
        state <- item = items.[i]
        i <- i + 1

    state

let exists (input: Char array)(illegalChars: Char array): bool = 
    let mutable state = false
    let mutable i = 0
    while not state && i < input.Length do
        state <- innerExists input.[i] illegalChars
        i <- i + 1

    state

exists [|'A'..'z'|] [|'.';',';';'|]

相关反汇编如下:

    while not state && i < input.Length do
000007FE6EB4237A  cmp         dword ptr [rbp-14h],0  
000007FE6EB4237E  jne         000007FE6EB42383  
000007FE6EB42380  nop  
000007FE6EB42381  jmp         000007FE6EB42386  
000007FE6EB42383  nop  
000007FE6EB42384  jmp         000007FE6EB423A9  
000007FE6EB42386  nop  
000007FE6EB42387  mov         r8d,dword ptr [rbp-18h]  
000007FE6EB4238B  mov         rdx,qword ptr [rbp+18h]  
000007FE6EB4238F  cmp         r8d,dword ptr [rdx+8]  
000007FE6EB42393  setl        r8b  
000007FE6EB42397  movzx       r8d,r8b  
000007FE6EB4239B  mov         dword ptr [rbp-24h],r8d  
000007FE6EB4239F  mov         r8d,dword ptr [rbp-24h]  
000007FE6EB423A3  mov         dword ptr [rbp-1Ch],r8d  
000007FE6EB423A7  jmp         000007FE6EB423B1  
000007FE6EB423A9  nop  
000007FE6EB423AA  xor         r8d,r8d  
000007FE6EB423AD  mov         dword ptr [rbp-1Ch],r8d  
000007FE6EB423B1  cmp         dword ptr [rbp-1Ch],0  
000007FE6EB423B5  je          000007FE6EB42409  
            state <- innerExists input.[i] illegalChars
000007FE6EB423B7  mov         r8d,dword ptr [rbp-18h]  
000007FE6EB423BB  mov         rdx,qword ptr [rbp+18h]  
000007FE6EB423BF  cmp         r8,qword ptr [rdx+8]  
000007FE6EB423C3  jb          000007FE6EB423CA  
000007FE6EB423C5  call        000007FECD796850  
000007FE6EB423CA  lea         rdx,[rdx+r8*2+10h]  
000007FE6EB423CF  movzx       r8d,word ptr [rdx]  
000007FE6EB423D3  mov         rdx,qword ptr [rbp+10h]  
000007FE6EB423D7  mov         rdx,qword ptr [rdx+8]  
000007FE6EB423DB  mov         r9,qword ptr [rbp+20h]  
000007FE6EB423DF  mov         rcx,7FE6EEE0640h  
000007FE6EB423E9  call        000007FE6EB41E40  
000007FE6EB423EE  mov         dword ptr [rbp-20h],eax  
000007FE6EB423F1  mov         eax,dword ptr [rbp-20h]  
000007FE6EB423F4  movzx       eax,al  
000007FE6EB423F7  mov         dword ptr [rbp-14h],eax  
            i <- i + 1
000007FE6EB423FA  mov         eax,dword ptr [rbp-18h]  

Array.contains and Array.exists 函数有什么问题?

let exists input illegalChars = 
    input |> Array.exists (fun c -> illegalChars |> Array.contains c)

JIT 编译器消除了绑定检查,因此 F# 与 C# 的工作方式相同。您可以像示例中那样期望消除代码,以及 for

for i = 0 to data.Lenght - 1 do
    ...

还有尾递归函数,编译成循环。

内置的Array.contains和Array.exists(source code)的编写是为了让JIT编译器可以消除边界检查。

其他人指出使用现有函数 FSharp.Core 来获得相同的结果,但我认为 OP 会询问是否在循环中省略了数组的边界检查(因为它是在循环条件中检查的)。

对于像上面这样的简单代码,抖动应该能够忽略检查。要看到这一点,检查汇编代码是正确的,但重要的是不要 运行 附加 VS 调试器,因为抖动不会优化代码。无法在调试器中显示正确值的原因。

首先让我们看看exists优化的x64:

; not state?
00007ff9`1cd37551 85c0            test    eax,eax
; if state is true then exit the loop
00007ff9`1cd37553 7521            jne     00007ff9`1cd37576
; i < input.Length?
00007ff9`1cd37555 395e08          cmp     dword ptr [rsi+8],ebx
; Seems overly complex but perhaps this is as good as it gets?
00007ff9`1cd37558 0f9fc1          setg    cl
00007ff9`1cd3755b 0fb6c9          movzx   ecx,cl
00007ff9`1cd3755e 85c9            test    ecx,ecx
; if we have reached end of the array then exit
00007ff9`1cd37560 7414            je      00007ff9`1cd37576
; mov i in ebx to rcx, unnecessary but moves like these are very cheap
00007ff9`1cd37562 4863cb          movsxd  rcx,ebx
; input.[i] (note we don't check the boundary again)
00007ff9`1cd37565 0fb74c4e10      movzx   ecx,word ptr [rsi+rcx*2+10h]
; move illegalChars pointer to rdx
00007ff9`1cd3756a 488bd7          mov     rdx,rdi
; call innerExists
00007ff9`1cd3756d e8ee9affff      call    00007ff9`1cd31060
; i <- i + 1
00007ff9`1cd37572 ffc3            inc     ebx
; Jump top of loop
00007ff9`1cd37574 ebdb            jmp     00007ff9`1cd37551
; We are done!
00007ff9`1cd37576

所以代码看起来有点过于复杂,但它似乎只检查数组条件一次。

现在让我们看看 innerExists 优化的 x64:

# let mutable state = false
00007ff9`1cd375a0 33c0            xor     eax,eax
# let mutable i = 0
00007ff9`1cd375a2 4533c0          xor     r8d,r8d
; not state?
00007ff9`1cd375a5 85c0            test    eax,eax
; if state is true then exit the loop
00007ff9`1cd375a7 752b            jne     00007ff9`1cd375d4
; i < items.Length
00007ff9`1cd375a9 44394208        cmp     dword ptr [rdx+8],r8d
; Seems overly complex but perhaps this is as good as it gets?
00007ff9`1cd375ad 410f9fc1        setg    r9b
00007ff9`1cd375b1 450fb6c9        movzx   r9d,r9b
00007ff9`1cd375b5 4585c9          test    r9d,r9d
; if we have reached end of the array then exit
00007ff9`1cd375b8 741a            je      00007ff9`1cd375d4
; mov i in r8d to rax, unnecessary but moves like these are very cheap
00007ff9`1cd375ba 4963c0          movsxd  rax,r8d
; items.[i] (note we don't check the boundary again)
00007ff9`1cd375bd 0fb7444210      movzx   eax,word ptr [rdx+rax*2+10h]
; mov item in cx to r9d, unnecessary but moves like these are very cheap
00007ff9`1cd375c2 440fb7c9        movzx   r9d,cx
; item = items.[i]?
00007ff9`1cd375c6 413bc1          cmp     eax,r9d
00007ff9`1cd375c9 0f94c0          sete    al
; state <- ?
00007ff9`1cd375cc 0fb6c0          movzx   eax,al
; i <- i + 1
00007ff9`1cd375cf 41ffc0          inc     r8d
; Jump top of loop
00007ff9`1cd375d2 ebd1            jmp     00007ff9`1cd375a5
; We are done!
00007ff9`1cd375d4 c3              ret

因此看起来过于复杂,但至少它看起来只检查数组条件一次。

所以最后,抖动似乎消除了数组边界检查,因为它可以证明这已经在循环条件中成功检查,我相信这是 OP 想知道的。

x64 代码看起来并不干净,但从我的实验来看,经过清理的 x64 代码并没有表现得那么好,我怀疑 CPU 供应商优化了 CPU 因为糟糕的代码抖动产生。

一个有趣的练习是用 C++ 编写一个等效的程序,然后 运行 通过 https://godbolt.org/,选择 x86-64 gcc (trunk)(gcc 现在似乎做得最好)并指定选项 -O3 -march=native 并查看生成的 x64 代码。

更新

https://godbolt.org/ 中重写的代码使我们能够看到由 c++ 编译器生成的汇编代码:

template<int N>
bool innerExists(char item, char const (&items)[N]) {
    for (auto i = 0; i < N; ++i) {
        if (item == items[i]) return true;
    }
    return false;
}

template<int N1, int N2>
bool exists(char const (&input)[N1], char const (&illegalCharacters)[N2]) {
    for (auto i = 0; i < N1; ++i) {
        if (innerExists(input[i], illegalCharacters)) return true;
    }
    return false;
}

char const separators[] = { '.', ',', ';' };
char const str[58] = {  };

bool test() {
  return exists(str, separators);
}

x86-64 gcc (trunk) 选项-O3 -march=native 生成以下代码

; Load the string to test into edx
mov edx, OFFSET FLAT:str+1
.L2:
; Have we reached the end?
cmp rdx, OFFSET FLAT:str+58
; If yes, then jump to the end
je .L7
; Load a character
movzx ecx, BYTE PTR [rdx]
; Comparing the 3 separators are encoded in the assembler
;  because the compiler detected the array is always the same
mov eax, ecx
and eax, -3
cmp al, 44
sete al
cmp cl, 59
sete cl
; increase outer i
inc rdx
; Did we find a match?
or al, cl
; If no then loop to .L2
je .L2
; We are done!
ret
.L7:
; No match found, clear result
xor eax, eax
; We are done!
ret

看起来不错,但我在上面的代码中缺少的是使用 AVX 一次测试多个字符。