为什么我的 MemoryManager 和我的 MemoryPool 实现这么慢?

Why are my MemoryManager and MemoryPool implementations so slow?

出于学习目的,我尝试自己实现了 MemoryManager 和 MemoryPool,并测试它们与标准实现相比性能如何。但尤其是我的 MemoryManager 太慢了。有人能告诉我这里发生了什么,或者说问题出在哪里吗?

我的内存池:

/// <summary>
/// A byte <see cref="MemoryPool{T}"/> that hands out freshly allocated arrays for requests
/// smaller than <c>POOL_USAGE_BORDER_BYTES</c> and rents from
/// <see cref="ArrayPool{T}.Shared"/> for everything at or above that size.
/// </summary>
internal abstract class ByteMemoryPool : MemoryPool<byte>
{
    // Requests below this many bytes are served by a plain allocation instead of the array pool.
    private const int POOL_USAGE_BORDER_BYTES = 85000;

    // Buffer size used when the caller passes -1, the "no size preference" default of Rent.
    private const int DEFAULT_BUFFER_BYTES = 4096;

    /// <summary>Largest size this pool advertises to callers.</summary>
    public override int MaxBufferSize => Int32.MaxValue;

    /// <summary>
    /// Process-wide instance. Typed as <see cref="Impl"/> so that callers get the
    /// struct-returning <see cref="Impl.Rent(int)"/> overload.
    /// </summary>
    public new static ByteMemoryPool.Impl Shared { get; } = new ByteMemoryPool.Impl();

    /// <summary>
    /// Rents a buffer of at least <paramref name="minBufferSize"/> bytes.
    /// </summary>
    /// <param name="minBufferSize">
    /// Minimum number of bytes required, or -1 to get a buffer of the default size.
    /// </param>
    /// <returns>An owner that must be disposed to release the buffer.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="minBufferSize"/> is negative and not -1.
    /// </exception>
    public override IMemoryOwner<byte> Rent(int minBufferSize = -1)
    {
        // The Rental struct is boxed here because the return type is an interface.
        return RentCore(minBufferSize);
    }

    /// <summary>Nothing to release: the pool itself holds no buffers.</summary>
    protected override void Dispose(bool disposing)
    {
    }

    private Rental RentCore(int minBufferSize)
    {
        return new Rental(minBufferSize);
    }

    /// <summary>Concrete pool type exposed through <see cref="Shared"/>.</summary>
    public sealed class Impl : ByteMemoryPool
    {
        /// <summary>
        /// Same as <see cref="ByteMemoryPool.Rent(int)"/> but returns the <see cref="Rental"/>
        /// struct directly instead of an interface reference.
        /// </summary>
        public new Rental Rent(int minBufferSize) => RentCore(minBufferSize);
    }

    /// <summary>
    /// Owner of one rented buffer. This is a mutable struct: dispose the same variable that
    /// received the rental, because copies do not observe each other's disposal.
    /// </summary>
    public struct Rental : IMemoryOwner<byte>
    {
        private byte[]? _array;
        private readonly bool _isPooled;

        /// <summary>Acquires a buffer of at least <paramref name="minBufferSize"/> bytes.</summary>
        /// <param name="minBufferSize">Minimum size in bytes, or -1 for the default size.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="minBufferSize"/> is negative and not -1.
        /// </exception>
        public Rental(int minBufferSize)
        {
            if (minBufferSize == -1)
            {
                // -1 is the default argument of MemoryPool<T>.Rent; without this mapping the
                // allocation below would be attempted with a negative length.
                minBufferSize = DEFAULT_BUFFER_BYTES;
            }
            else if (minBufferSize < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(minBufferSize));
            }

            _isPooled = minBufferSize >= POOL_USAGE_BORDER_BYTES;
            _array = _isPooled
                ? ArrayPool<byte>.Shared.Rent(minBufferSize)
                : new byte[minBufferSize];
        }

        /// <summary>
        /// The whole underlying array. For pooled buffers this may be longer than requested.
        /// </summary>
        /// <exception cref="ObjectDisposedException">The rental was already disposed.</exception>
        public Memory<byte> Memory
        {
            get
            {
                byte[] array = _array ?? throw new ObjectDisposedException(nameof(Rental));
                return new Memory<byte>(array);
            }
        }

        /// <summary>
        /// Returns a pooled array (cleared) to the shared pool; a directly allocated array is
        /// simply dropped. Calling this again on the same variable does nothing.
        /// </summary>
        public void Dispose()
        {
            byte[]? array = _array;
            _array = null;

            if (array != null && _isPooled)
            {
                ArrayPool<byte>.Shared.Return(array, clearArray: true);
            }
        }
    }
}

我的内存管理器:

 /// <summary>
 /// A <see cref="MemoryManager{T}"/> over a zero-initialised block of unmanaged memory obtained
 /// from <see cref="Marshal.AllocHGlobal(int)"/>. The block is released only by disposing the
 /// manager; there is no finalizer, so an undisposed instance leaks its buffer.
 /// </summary>
 internal sealed class NativeByteMemoryManager : MemoryManager<byte>
{
    private IntPtr _memoryPtr;
    private readonly int _length;

    /// <summary>Allocates and zeroes <paramref name="length"/> bytes of unmanaged memory.</summary>
    /// <param name="length">Size of the buffer in bytes; must not be negative.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative.</exception>
    public unsafe NativeByteMemoryManager(int length)
    {
        if (length < 0)
        {
            // A negative value would otherwise be reinterpreted as a huge uint by InitBlock below.
            throw new ArgumentOutOfRangeException(nameof(length));
        }

        _length = length;
        _memoryPtr = Marshal.AllocHGlobal(length);
        Unsafe.InitBlock((void*)_memoryPtr, 0, (uint)_length);
    }

    /// <summary>A <see cref="Memory{T}"/> covering the whole native buffer.</summary>
    public override Memory<byte> Memory => CreateMemory(_length);

    /// <summary>Returns a span over the whole native buffer.</summary>
    /// <exception cref="ObjectDisposedException">The manager was disposed.</exception>
    public override unsafe Span<byte> GetSpan()
    {
        ThrowIfDisposed();
        return new Span<byte>(_memoryPtr.ToPointer(), _length);
    }

    /// <summary>
    /// Returns a handle pointing at <paramref name="elementIndex"/> inside the buffer.
    /// </summary>
    /// <param name="elementIndex">Byte offset into the buffer, from 0 to the buffer length inclusive.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="elementIndex"/> lies outside the buffer.
    /// </exception>
    /// <exception cref="ObjectDisposedException">The manager was disposed.</exception>
    public override unsafe MemoryHandle Pin(int elementIndex = 0)
    {
        ThrowIfDisposed();

        // The unsigned comparison rejects negative indices too. An index equal to the length
        // is allowed so that an empty slice at the end of the buffer can still be pinned.
        if ((uint)elementIndex > (uint)_length)
        {
            throw new ArgumentOutOfRangeException(nameof(elementIndex));
        }

        void* pointer = (byte*)_memoryPtr + elementIndex;
        return new MemoryHandle(pointer, default, this);
    }

    /// <summary>
    /// Does nothing. Unpin runs every time a <see cref="MemoryHandle"/> created by
    /// <see cref="Pin"/> is disposed, so it must not release the buffer; only
    /// <see cref="Dispose(bool)"/> does.
    /// </summary>
    public override void Unpin()
    {
    }

    /// <summary>Frees the native buffer. Repeated calls do nothing.</summary>
    protected override void Dispose(bool disposing)
    {
        if (_memoryPtr == IntPtr.Zero)
        {
            return;
        }

        Marshal.FreeHGlobal(_memoryPtr);
        _memoryPtr = IntPtr.Zero;
    }

    // A zero pointer marks the buffer as already freed; see Dispose(bool).
    private void ThrowIfDisposed()
    {
        if (_memoryPtr == IntPtr.Zero)
        {
            throw new ObjectDisposedException(nameof(NativeByteMemoryManager));
        }
    }
}

基准:

    /// <summary>
    /// Benchmarks one rent/allocate-and-release cycle for the shared array pool (baseline),
    /// <c>ByteMemoryPool</c> and <c>NativeByteMemoryManager</c> at several buffer sizes.
    /// </summary>
    public class MemoryManagerBenchmark
{
    /// <summary>Requested buffer size in bytes for the current benchmark run.</summary>
    [Params(1000, 8000, 64000, 4000000)]
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Design", "CA1051:Do not declare visible instance fields", Justification = "<Pending>")]
    public int ArraySize;

    /// <summary>Baseline: rent from and return to <see cref="ArrayPool{T}.Shared"/>, clearing on return.</summary>
    [Benchmark(Baseline = true)]
    public int MemoryPoolDefault()
    {
        byte[] buffer = ArrayPool<byte>.Shared.Rent(ArraySize);
        int length = buffer.Length;
        ArrayPool<byte>.Shared.Return(buffer, clearArray: true);
        return length;
    }

    /// <summary>Rent from <c>ByteMemoryPool.Shared</c> and read the length of the rented memory.</summary>
    [Benchmark(Baseline = false)]
    public int MemoryPoolByte()
    {
        using (var rental = ByteMemoryPool.Shared.Rent(ArraySize))
        {
            return rental.Memory.Length;
        }
    }

    /// <summary>Create a <c>NativeByteMemoryManager</c>, read its memory length, then dispose it.</summary>
    [Benchmark(Baseline = false)]
    public int MemoryManager()
    {
        using (var manager = new NativeByteMemoryManager(ArraySize))
        {
            return manager.Memory.Length;
        }
    }
}

结果:

BenchmarkDotNet=v0.11.5, OS=Windows 10.0.17763.107 (1809/October2018Update/Redstone5)
Intel Core i7-2600 CPU 3.40GHz (Sandy Bridge), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.0.100
  [Host]     : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), 64bit RyuJIT
  Job-MXYBLG : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), 64bit RyuJIT

Force=False  IterationCount=15  LaunchCount=2  
WarmupCount=10  

            Method | ArraySize |           Mean |         Error |        StdDev |          Median | Kurtosis | Skewness | Ratio | RatioSD | Rank | Baseline |    Gen 0 |    Gen 1 |    Gen 2 | Allocated |
------------------ |---------- |---------------:|--------------:|--------------:|----------------:|---------:|---------:|------:|--------:|-----:|--------- |---------:|---------:|---------:|----------:|
     **MemoryManager** |      **1000** |       **187.3 ns** |     **28.466 ns** |     **41.726 ns** |       **168.24 ns** |    **1.210** |   **0.1370** |  **1.88** |    **0.45** |    **3** |       **No** |   **0.0076** |        **-** |        **-** |      **32 B** |
    **MemoryPoolByte** |      **1000** |       **123.1 ns** |      **6.342 ns** |      **9.492 ns** |       **121.44 ns** |    **1.730** |   **0.4267** |  **1.24** |    **0.14** |    **2** |       **No** |   **0.2447** |        **-** |        **-** |    **1024 B** |
 **MemoryPoolDefault** |      **1000** |       **100.2 ns** |      **5.226 ns** |      **7.821 ns** |        **97.50 ns** |    **2.284** |   **0.6929** |  **1.00** |    **0.00** |    **1** |      **Yes** |        **-** |        **-** |        **-** |         **-** |
                   |           |                |               |               |                 |          |          |       |         |      |          |          |          |          |           |
     **MemoryManager** |      **8000** |       **374.1 ns** |     **25.279 ns** |     **37.054 ns** |       **349.88 ns** |    **1.264** |   **0.2485** |  **1.54** |    **0.22** |    **2** |       **No** |   **0.0076** |        **-** |        **-** |      **32 B** |
    **MemoryPoolByte** |      **8000** |       **842.4 ns** |     **12.637 ns** |     **18.523 ns** |       **839.46 ns** |    **2.485** |   **0.7287** |  **3.46** |    **0.26** |    **3** |       **No** |   **1.9150** |        **-** |        **-** |    **8024 B** |
 **MemoryPoolDefault** |      **8000** |       **245.1 ns** |     **12.542 ns** |     **17.988 ns** |       **236.31 ns** |    **5.935** |   **1.9246** |  **1.00** |    **0.00** |    **1** |      **Yes** |        **-** |        **-** |        **-** |         **-** |
                   |           |                |               |               |                 |          |          |       |         |      |          |          |          |          |           |
     **MemoryManager** |     **64000** |     **2,311.8 ns** |     **87.763 ns** |    **131.359 ns** |     **2,266.83 ns** |    **2.146** |   **0.6641** |  **1.06** |    **0.06** |    **2** |       **No** |   **0.0076** |        **-** |        **-** |      **32 B** |
    **MemoryPoolByte** |     **64000** |     **5,351.5 ns** |     **82.720 ns** |    **118.634 ns** |     **5,298.23 ns** |    **4.749** |   **1.5884** |  **2.46** |    **0.14** |    **3** |       **No** |  **15.1443** |        **-** |        **-** |   **64024 B** |
 **MemoryPoolDefault** |     **64000** |     **2,187.6 ns** |     **83.603 ns** |    **125.133 ns** |     **2,102.50 ns** |    **2.154** |   **0.9189** |  **1.00** |    **0.00** |    **1** |      **Yes** |        **-** |        **-** |        **-** |         **-** |
                   |           |                |               |               |                 |          |          |       |         |      |          |          |          |          |           |
     **MemoryManager** |   **4000000** | **2,188,789.3 ns** | **65,843.021 ns** | **98,550.733 ns** | **2,165,661.52 ns** |    **4.130** |   **1.3955** | **10.78** |    **0.72** |    **2** |       **No** |        **-** |        **-** |        **-** |      **32 B** |
    **MemoryPoolByte** |   **4000000** |   **199,434.5 ns** |  **2,634.057 ns** |  **3,777.686 ns** |   **198,360.50 ns** |    **3.567** |   **0.9854** |  **0.98** |    **0.04** |    **1** |       **No** | **999.7559** | **999.7559** | **999.7559** |         **-** |
 **MemoryPoolDefault** |   **4000000** |   **203,299.8 ns** |  **3,986.979 ns** |  **5,967.523 ns** |   **201,993.74 ns** |    **3.340** |   **0.7295** |  **1.00** |    **0.00** |    **1** |      **Yes** | **999.7559** | **999.7559** | **999.7559** |         **-** |

基准测试工具只能告诉您运行给定代码需要多长时间,而性能分析器(profiler)可以告诉您为什么需要这么长时间。因此,您需要对代码进行性能分析才能找出答案。

我在我的博客上描述了使用 BenchmarkDotNet 和 PerfView 进行的示例性能调查,这对您来说可能是一个很好的起点:https://adamsitnik.com/Sample-Perf-Investigation/

如果 PerfView 不满足您的要求,您可以尝试其他分析器:https://github.com/dotnet/performance/blob/master/docs/profiling-workflow-dotnet-runtime.md