在 .NET 中用整数枚举填充 Span<int> 的最快方法?

Fatest way to populate Span<int> with an integer enumeration in .NET?

我正在寻找最快的 C#/.NET Core 方法,该方法能够用枚举 0、1、2、3、...填充 Span<int> 天真的 for 循环 - 请参阅下面 - 已经足够快了,但可能有更快的 SIMD 选项。

Span<int> buffer = ..; // snipped
for(var i = 0; i < buffer.Length; i++)
    buffer[i] = i;

如何使用 SIMD 加速这种缓冲区填充方法?

下面进行一些优化尝试。第一个 Default 是基本的 for 循环。第二个 Batch4 相同,但在单个循环迭代中初始化 4 个索引。第 4 和第 5 与第 2 相似,但迭代效果更佳。

第三个是使用System.Numerics.Vector<T>的实现。这种数据类型为 jit 所知,它用 SIMD 对应物代替算术运算。在我的机器上,它比默认实现快两倍。

此处的缺点是缓冲区大小必须是 4 的倍数。(Batch16 / Batch16 为 8/16)。如果不是,最后一行必须在主循环之外手动处理。

using System;
using System.Numerics;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;

namespace bench
{
    class Program
    {
        static void Main(string[] args)
        {
            var summary = BenchmarkRunner.Run<Sp>();
        }
    }

    [SimpleJob]
    [MemoryDiagnoser]
    //[DisassemblyDiagnoser(printAsm: true, printIL: true, printSource: true, printDiff: true)]
    public class Sp
    {
        private readonly int[] spanBack = new int[100000];
        private readonly Vector<int> baseV;
        private readonly Vector<int> accV;

        public Sp()
        {
            if (spanBack.Length % Vector<int>.Count != 0) throw new Exception("Invalid array size");

            if (Vector<int>.Count == 4)
            {
                baseV = new Vector<int>(new[] { 4, 4, 4, 4 });
                accV = new Vector<int>(new[] { 0, 1, 2, 3, });
            }
            else if (Vector<int>.Count == 8)
            {
                baseV = new Vector<int>(new[] { 8, 8, 8, 8, 8, 8, 8, 8 });
                accV = new Vector<int>(new[] { 0, 1, 2, 3, 4, 5, 6, 7 });
            }
            else
            {
                throw new Exception("Invalid vector size");
            }
        }
        [Benchmark(Baseline = true)]
        public int[] Default()
        {
            Span<int> buffer = spanBack.AsSpan();
            for (var i = 0; i < buffer.Length; i++)
                buffer[i] = i;

            return spanBack;
        }

        [Benchmark]
        public int[] Batch4()
        {
            Span<int> buffer = spanBack.AsSpan();
            for (var i = 0; i < buffer.Length; i = i + 4)
            {
                buffer[i + 0] = i + 0;
                buffer[i + 1] = i + 1;
                buffer[i + 2] = i + 2;
                buffer[i + 3] = i + 3;
            }

            return spanBack;
        }

        [Benchmark]
        public int[] BatchSimd()
        {
            int batchSize = Vector<int>.Count;
            var accV = this.accV;

            Span<int> buffer = spanBack.AsSpan();
            for (var i = 0; i < buffer.Length; i = i + batchSize)
            {
                var currentSlice = buffer.Slice(i, batchSize);

                var v = new Vector<int>(currentSlice);
                v = v + accV;
                accV = accV + baseV;

                v.CopyTo(currentSlice);
            }

            return spanBack;
        }

        [Benchmark]
        public int[] Batch8()
        {
            Span<int> buffer = spanBack.AsSpan();
            for (var i = 0; i < buffer.Length; i = i + 8)
            {
                buffer[i + 0] = i + 0;
                buffer[i + 1] = i + 1;
                buffer[i + 2] = i + 2;
                buffer[i + 3] = i + 3;
                buffer[i + 4] = i + 4;
                buffer[i + 5] = i + 5;
                buffer[i + 6] = i + 6;
                buffer[i + 7] = i + 7;
            }

            return spanBack;
        }

        [Benchmark]
        public int[] Batch16()
        {
            Span<int> buffer = spanBack.AsSpan();
            for (var i = 0; i < buffer.Length; i = i + 16)
            {
                buffer[i + 0] = i + 0;
                buffer[i + 1] = i + 1;
                buffer[i + 2] = i + 2;
                buffer[i + 3] = i + 3;
                buffer[i + 4] = i + 4;
                buffer[i + 5] = i + 5;
                buffer[i + 6] = i + 6;
                buffer[i + 7] = i + 7;
                buffer[i + 8] = i + 8;
                buffer[i + 9] = i + 9;
                buffer[i + 10] = i + 10;
                buffer[i + 11] = i + 11;
                buffer[i + 12] = i + 12;
                buffer[i + 13] = i + 13;
                buffer[i + 14] = i + 14;
                buffer[i + 15] = i + 15;
            }

            return spanBack;
        }
    }
}

Csproj:

<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>netcoreapp3.0</TargetFramework>
    <DebugType>pdbonly</DebugType>
    <DebugSymbols>true</DebugSymbols>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="BenchmarkDotNet" Version="0.12.0" />
  </ItemGroup>

</Project>

dotnet run -c Release的结果:

BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
Intel Core i7-2600K CPU 3.40GHz (Sandy Bridge), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.1.100-preview1-014459
  [Host]     : .NET Core 3.1.0 (CoreCLR 4.700.19.50403, CoreFX 4.700.19.50410), X64 RyuJIT
  DefaultJob : .NET Core 3.1.0 (CoreCLR 4.700.19.50403, CoreFX 4.700.19.50410), X64 RyuJIT
|     Method |     Mean |    Error |   StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------- |---------:|---------:|---------:|------:|------:|------:|------:|----------:|
|    Default | 45.55 us | 0.081 us | 0.067 us |  1.00 |     - |     - |     - |         - |
|     Batch4 | 34.23 us | 0.069 us | 0.065 us |  0.75 |     - |     - |     - |       1 B |
| Batch4Simd | 22.23 us | 0.054 us | 0.051 us |  0.49 |     - |     - |     - |         - |
|     Batch8 | 31.53 us | 0.160 us | 0.134 us |  0.69 |     - |     - |     - |         - |
|    Batch16 | 32.10 us | 0.197 us | 0.164 us |  0.70 |     - |     - |     - |         - |

编辑:@harold 的建议

[Benchmark]
public int[] BatchSimd_harold()
{
    int batchSize = Vector<int>.Count;
    var accV = this.accV;

    Span<int> buffer = spanBack.AsSpan();
    for (var i = 0; i < buffer.Length; i = i + batchSize)
    {
        var currentSlice = buffer.Slice(i, batchSize);

        accV.CopyTo(currentSlice);
        accV = accV + baseV;
    }

    return spanBack;
}

结果:

|           Method |     Mean |    Error |   StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |---------:|---------:|---------:|------:|------:|------:|------:|----------:|
|          Default | 46.08 us | 0.331 us | 0.310 us |  1.00 |     - |     - |     - |         - |
|        BatchSimd | 22.37 us | 0.150 us | 0.141 us |  0.49 |     - |     - |     - |         - |
| BatchSimd_harold | 18.72 us | 0.255 us | 0.239 us |  0.41 |     - |     - |     - |         - |

编辑 2:最近 cpu

BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
Intel Core i7-6820HQ CPU 2.70GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.0.100
  [Host]     : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
  DefaultJob : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
|           Method |     Mean |    Error |   StdDev |   Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |---------:|---------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
|          Default | 59.05 us | 1.169 us | 2.362 us | 59.01 us |  1.00 |    0.00 |     - |     - |     - |         - |
|           Batch4 | 44.39 us | 0.865 us | 0.722 us | 44.48 us |  0.76 |    0.03 |     - |     - |     - |         - |
|        BatchSimd | 15.37 us | 0.364 us | 1.049 us | 15.07 us |  0.26 |    0.02 |     - |     - |     - |         - |
| BatchSimd_harold | 11.77 us | 0.219 us | 0.205 us | 11.80 us |  0.20 |    0.01 |     - |     - |     - |         - |
|           Batch8 | 43.62 us | 0.871 us | 1.838 us | 43.46 us |  0.74 |    0.04 |     - |     - |     - |         - |
|          Batch16 | 42.53 us | 0.846 us | 2.317 us | 41.92 us |  0.73 |    0.05 |     - |     - |     - |         - |