F# NativePtr.stackalloc 比 C# stackalloc 慢 - 包含反编译代码

F# NativePtr.stackalloc slower than C# stackalloc - Decompiled Code Included

继续我的 F# 性能测试。有关更多背景信息,请参见此处:

现在我可以在 F# 中使用栈上分配的数组了。然而,出于某种原因,等效的 C# 代码大约快 50 倍。我在下面附上了 ILSpy 反编译的版本,看起来只有 1 行确实不同(在 stackAlloc 内)。




#nowarn "9"

open Microsoft.FSharp.NativeInterop
open System
open System.Diagnostics    
open System.Runtime.CompilerServices        

/// Allocates `x` bytes with `NativePtr.stackalloc` and discards the pointer.
/// Marked NoInlining so every call in the timing loop is a real call; the IL
/// listing for this function is likewise flagged `noinlining`.
[<MethodImpl(MethodImplOptions.NoInlining)>]
let stackAlloc x =
    // Bound to a mutable local so the allocation result is stored (stloc) rather than dropped.
    let mutable ints:nativeptr<byte> = NativePtr.stackalloc x
    // A function body may not end with a `let` binding; return unit explicitly.
    ()

/// Benchmark entry point: prints the arguments, times `reps` calls of
/// `stackAlloc size`, writes the elapsed milliseconds to the console and waits
/// for a key press. Returns 0 as the process exit code.
[<EntryPoint>]
let main argv = 
    printfn "%A" argv

    let size = 8192            
    let reps = 10000

    stackAlloc size // JIT: run once so compilation is not part of the timing
    // StartNew creates an already-running stopwatch; one created with the
    // constructor alone is never started and would always report 0 ms.
    let clock = Stopwatch.StartNew()
    for i = 1 to reps do            
        stackAlloc size
    clock.Stop()

    let elapsed = clock.Elapsed.TotalMilliseconds
    let description = "F# NativePtr.stackalloc"
    Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed)

    Console.ReadKey() |> ignore
    // An [<EntryPoint>] function must return the int exit code.
    0


using System;
using System.Diagnostics;

namespace CSharpLanguageFeatures
{
    /// <summary>
    /// Micro-benchmark that times repeated calls to a method performing a
    /// <c>stackalloc</c> of a fixed number of bytes.
    /// </summary>
    class CSharpStackArray
    {
        /// <summary>
        /// Warms up <see cref="stackAlloc"/>, then times <c>reps</c> calls and
        /// prints the elapsed milliseconds.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            int size = 8192;
            int reps = 10000;

            stackAlloc(size); // JIT
            // StartNew returns a running stopwatch; "new Stopwatch()" alone is
            // never started and would always report 0 ms.
            Stopwatch clock = Stopwatch.StartNew();
            for (int i = 0; i < reps; i++)
            {
                stackAlloc(size);
            }
            clock.Stop();

            string elapsed = clock.Elapsed.TotalMilliseconds.ToString("#,##0.####");
            string description = "C# stackalloc";
            Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed);
        }

        /// <summary>Allocates <paramref name="arraySize"/> bytes with <c>stackalloc</c>; the pointer is not used.</summary>
        /// <param name="arraySize">Number of bytes to allocate.</param>
        public unsafe static void stackAlloc(int arraySize)
        {
            byte* pArr = stackalloc byte[arraySize];
        }
    }
}


using Microsoft.FSharp.Core;
using System;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;

public static class FSharpStackArray
    public unsafe static void stackAlloc(int x)
        IntPtr ints = stackalloc byte[x * sizeof(byte)];

    public static int main(string[] argv)
        PrintfFormat<FSharpFunc<string[], Unit>, TextWriter, Unit, Unit> format = new PrintfFormat<FSharpFunc<string[], Unit>, TextWriter, Unit, Unit, string[]>("%A");
        PrintfModule.PrintFormatLineToTextWriter<FSharpFunc<string[], Unit>>(Console.Out, format).Invoke(argv);
        Stopwatch clock = new Stopwatch();
        for (int i = 1; i < 10001; i++)
        double elapsed = clock.Elapsed.TotalMilliseconds;
        Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", "F# NativePtr.stackalloc", 8192, 10000, elapsed);
        ConsoleKeyInfo consoleKeyInfo = Console.ReadKey();
        return 0;


using System;
using System.Diagnostics;

namespace CSharpLanguageFeatures
    internal class CSharpStackArray
        private static void Main(string[] args)
            int size = 8192;
            int reps = 10000;
            Stopwatch clock = new Stopwatch();
            for (int i = 0; i < reps; i++)
            string elapsed = clock.Elapsed.TotalMilliseconds.ToString("#,##0.####");
            string description = "C# stackalloc";
            Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", new object[]

        public unsafe static void stackAlloc(int arraySize)
            IntPtr arg_06_0 = stackalloc byte[checked(unchecked((UIntPtr)arraySize) * 1)];

F# 版本 IL - 字节分配

.method public static 
    void stackAlloc (
        int32 x
    ) cil managed noinlining 
    // Method begins at RVA 0x2050
    // Code size 13 (0xd)
    .maxstack 4
    .locals init (
        [0] native int ints

    IL_0000: nop
    IL_0001: ldarg.0
    IL_0002: sizeof [mscorlib]System.Byte
    IL_0008: mul
    IL_0009: localloc
    IL_000b: stloc.0
    IL_000c: ret
} // end of method FSharpStackArray::stackAlloc

C# 版本 IL - 字节分配

.method public hidebysig static 
    void stackAlloc (
        int32 arraySize
    ) cil managed 
    // Method begins at RVA 0x2094
    // Code size 8 (0x8)
    .maxstack 8

    IL_0000: ldarg.0
    IL_0001: conv.u
    IL_0002: ldc.i4.1
    IL_0003: mul.ovf.un
    IL_0004: localloc
    IL_0006: pop
    IL_0007: ret
} // end of method CSharpStackArray::stackAlloc   

已更新 F# IL - IntPtr 分配

.method public static 
    void stackAlloc (
        int32 x
    ) cil managed noinlining 
    // Method begins at RVA 0x2050
    // Code size 13 (0xd)
    .maxstack 4
    .locals init (
        [0] native int ints

    IL_0000: nop
    IL_0001: ldarg.0
    IL_0002: sizeof [mscorlib]System.IntPtr
    IL_0008: mul
    IL_0009: localloc
    IL_000b: stloc.0
    IL_000c: ret
} // end of method FSharpStackArray::stackAlloc

更新的 C# IL - IntPtr 分配

.method public hidebysig static 
    void stackAlloc (
        int32 arraySize
    ) cil managed 
    // Method begins at RVA 0x2415
    // Code size 13 (0xd)
    .maxstack 8

    IL_0000: ldarg.0
    IL_0001: conv.u
    IL_0002: sizeof [mscorlib]System.IntPtr
    IL_0008: mul.ovf.un
    IL_0009: localloc
    IL_000b: pop
    IL_000c: ret
} // end of method CSharpStackArray::stackAlloc


答案是:C# 编译器没有把指针存储到局部变量中,因为分配的内存从未被使用。此外,C# 版本没有 "sizeof" 指令,并且使用了不同的 "mul" 指令,这又给了 C# 一点小优势。

F# IL 代码 - 差异已注释

.method public static 
    void stackAlloc (
        int32 x
    ) cil managed noinlining 
    // Method begins at RVA 0x2050
    // Code size 13 (0xd)
    .maxstack 4
    .locals init ( //***** Not in C# Version *****//
        [0] native int ints

    IL_0000: nop
    IL_0001: ldarg.0
    IL_0002: sizeof [mscorlib]System.Byte //***** C# just uses "1" *****//
    IL_0008: mul //***** C# uses "mul.ovf.un" *****//
    IL_0009: localloc
    IL_000b: stloc.0 //***** Not in C# Version *****//
    IL_000c: ret
} // end of method FSharpStackArray::stackAlloc

C# IL 代码 - 差异已注释

.method public hidebysig static 
    void stackAlloc (
        int32 arraySize
    ) cil managed 
    // Method begins at RVA 0x2094
    // Code size 8 (0x8)
    .maxstack 8

    IL_0000: ldarg.0
    IL_0001: conv.u
    IL_0002: ldc.i4.1 //***** F# uses sizeof [mscorlib]System.Byte *****//
    IL_0003: mul.ovf.un //***** F# uses "mul" *****//
    IL_0004: localloc
    IL_0006: pop
    IL_0007: ret
} // end of method CSharpStackArray::stackAlloc  


  1. 编译器会执行大量优化。显然,不同语言中相同的高级代码可能会生成完全不同的机器指令。
  2. 在对 .NET 语言进行基准测试时,可以阅读中间语言 (IL) 代码来真正了解发生了什么。可以使用 ILSpy 来完成这项工作。
  3. 可以使用 ilasm.exe 修改并重新编译 IL 代码。
  4. C# 编译器在删除不必要的代码方面做得更好。在对分配的内存中的每个字节赋值之后,两者的性能就与最初预期的非常接近了。

最终 F# 代码

#nowarn "9"

open Microsoft.FSharp.NativeInterop
open System
open System.Diagnostics    
open System.Runtime.CompilerServices        

/// Allocates `x` bytes with `NativePtr.stackalloc` and writes `byte i` to every
/// index `i` in `0 .. x - 1`, so the allocated memory is actually used.
/// Marked NoInlining so every call in the timing loop is a real call; the IL
/// listing for this function is likewise flagged `noinlining`.
[<MethodImpl(MethodImplOptions.NoInlining)>]
let stackAlloc x =
    let mutable bytes:nativeptr<byte> = NativePtr.stackalloc x
    // Touch every byte; `byte i` wraps modulo 256 for indices above 255.
    for i = 0 to (x - 1) do
        NativePtr.set bytes i (byte i)

/// Benchmark entry point: prints the arguments, times `reps` calls of
/// `stackAlloc size`, writes the elapsed milliseconds to the console and waits
/// for a key press. Returns 0 as the process exit code.
[<EntryPoint>]
let main argv = 
    printfn "%A" argv

    let size = 8192            
    let reps = 10000

    stackAlloc size // JIT: run once so compilation is not part of the timing
    // StartNew creates an already-running stopwatch; one created with the
    // constructor alone is never started and would always report 0 ms.
    let clock = Stopwatch.StartNew()
    for i = 1 to reps do            
        stackAlloc size
    clock.Stop()

    let elapsed = clock.Elapsed.TotalMilliseconds
    let description = "F# NativePtr.stackalloc"
    Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed)

    Console.ReadKey() |> ignore
    // An [<EntryPoint>] function must return the int exit code.
    0

最终 C# 代码

using System;
using System.Diagnostics;

namespace CSharpStackArray
{
    /// <summary>
    /// Micro-benchmark that times repeated calls to a method which
    /// <c>stackalloc</c>s a buffer and writes to every byte of it.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Warms up <see cref="stackAlloc"/>, then times <c>reps</c> calls and
        /// prints the elapsed milliseconds.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            int size = 8192;
            int reps = 10000;

            stackAlloc(size); // JIT
            // StartNew returns a running stopwatch; "new Stopwatch()" alone is
            // never started and would always report 0 ms.
            Stopwatch clock = Stopwatch.StartNew();
            for (int i = 0; i < reps; i++)
            {
                stackAlloc(size);
            }
            clock.Stop();

            string elapsed = clock.Elapsed.TotalMilliseconds.ToString("#,##0.####");
            string description = "C# stackalloc";
            Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed);
        }

        /// <summary>
        /// Allocates <paramref name="arraySize"/> bytes with <c>stackalloc</c>
        /// and stores <c>(byte)i</c> at every index <c>i</c>.
        /// </summary>
        /// <param name="arraySize">Number of bytes to allocate and fill.</param>
        public unsafe static void stackAlloc(int arraySize)
        {
            byte* pArr = stackalloc byte[arraySize];
            // Write every byte so the allocation is actually used.
            for (int i = 0; i < arraySize; i++)
            {
                pArr[i] = (byte)i;
            }
        }
    }
}