在 .NET 中用整数枚举填充 Span<int> 的最快方法?
Fastest way to populate Span<int> with an integer enumeration in .NET?
我正在寻找最快的 C#/.NET Core 方法,该方法能够用枚举 0、1、2、3、...填充 Span<int>
朴素的 for 循环(见下方)已经足够快,但可能还有更快的 SIMD 方案。
// Baseline: store each element's own index into the span, one element per iteration.
Span<int> buffer = ..; // snipped
for(var i = 0; i < buffer.Length; i++)
buffer[i] = i;
如何使用 SIMD 加速这种缓冲区填充方法?
下面是一些优化尝试。第一个 Default 是基本的 for 循环。第二个 Batch4 与之相同,但在单次循环迭代中初始化 4 个索引。第四个(Batch8)和第五个(Batch16)与第二个类似,但每次迭代初始化更多的索引(分别为 8 个和 16 个)。
第三个是使用 System.Numerics.Vector<T> 的实现。JIT 能识别这种数据类型,并把它的算术运算替换为对应的 SIMD 指令。在我的机器上,它的速度约为默认实现的两倍。
此处的缺点是缓冲区大小必须是 4 的倍数(Batch8 / Batch16 则分别要求是 8 / 16 的倍数)。如果不是,末尾剩余的元素必须在主循环之外手动处理。
using System;
using System.Numerics;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
namespace bench
{
/// <summary>
/// Console entry point that runs the <see cref="Sp"/> benchmark suite.
/// </summary>
internal class Program
{
    /// <summary>Starts the benchmark run.</summary>
    /// <param name="args">Command-line arguments; not used.</param>
    private static void Main(string[] args)
    {
        // The summary returned by the runner is not used here, so it is not stored.
        BenchmarkRunner.Run<Sp>();
    }
}
/// <summary>
/// Benchmark suite comparing several ways of filling an <c>int</c> buffer with the
/// sequence 0, 1, 2, ... (each element receives its own index).
/// </summary>
[SimpleJob]
[MemoryDiagnoser]
//[DisassemblyDiagnoser(printAsm: true, printIL: true, printSource: true, printDiff: true)]
public class Sp
{
    // Largest manual unroll factor used by the scalar benchmarks (see Batch16).
    private const int MaxUnroll = 16;

    // Backing array shared by every benchmark; each benchmark overwrites it in place.
    private readonly int[] spanBack = new int[100000];

    // Per-iteration increment: every lane holds Vector<int>.Count.
    private readonly Vector<int> baseV;

    // Starting lane values 0, 1, ..., Vector<int>.Count - 1.
    private readonly Vector<int> accV;

    /// <summary>
    /// Validates the buffer size and builds the vector constants for whatever
    /// <see cref="Vector{T}.Count"/> the current hardware reports.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The buffer length is not a multiple of the vector width or of the largest unroll factor.
    /// </exception>
    public Sp()
    {
        int lanes = Vector<int>.Count;

        // None of the batched loops handle a tail, so the length must divide evenly
        // by the vector width and by every unroll factor (4 and 8 both divide 16).
        if (spanBack.Length % lanes != 0 || spanBack.Length % MaxUnroll != 0)
        {
            throw new InvalidOperationException("Invalid array size");
        }

        // Build the ramp 0..lanes-1 at run time instead of hard-coding widths 4 and 8,
        // so any vector width is supported.
        var ramp = new int[lanes];
        for (int lane = 0; lane < lanes; lane++)
        {
            ramp[lane] = lane;
        }

        accV = new Vector<int>(ramp);
        baseV = new Vector<int>(lanes); // broadcasts the width into every lane
    }

    /// <summary>Baseline: one element written per loop iteration.</summary>
    /// <returns>The shared backing array.</returns>
    [Benchmark(Baseline = true)]
    public int[] Default()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i++)
        {
            buffer[i] = i;
        }

        return spanBack;
    }

    /// <summary>Scalar loop manually unrolled to 4 elements per iteration.</summary>
    /// <returns>The shared backing array.</returns>
    [Benchmark]
    public int[] Batch4()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i = i + 4)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
        }

        return spanBack;
    }

    /// <summary>
    /// Vector variant that loads each slice, adds the running index vector to it and
    /// stores the sum back.
    /// </summary>
    /// <remarks>
    /// NOTE: the sum includes the slice's current contents, so the output equals
    /// 0, 1, 2, ... only when the buffer holds zeros on entry.
    /// </remarks>
    /// <returns>The shared backing array.</returns>
    [Benchmark]
    public int[] BatchSimd()
    {
        int batchSize = Vector<int>.Count;
        var accV = this.accV; // local copy; the readonly field itself is never modified
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i = i + batchSize)
        {
            var currentSlice = buffer.Slice(i, batchSize);
            var v = new Vector<int>(currentSlice);
            v = v + accV;
            accV = accV + baseV; // advance every lane by one vector width
            v.CopyTo(currentSlice);
        }

        return spanBack;
    }

    /// <summary>Scalar loop manually unrolled to 8 elements per iteration.</summary>
    /// <returns>The shared backing array.</returns>
    [Benchmark]
    public int[] Batch8()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i = i + 8)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
            buffer[i + 4] = i + 4;
            buffer[i + 5] = i + 5;
            buffer[i + 6] = i + 6;
            buffer[i + 7] = i + 7;
        }

        return spanBack;
    }

    /// <summary>Scalar loop manually unrolled to 16 elements per iteration.</summary>
    /// <returns>The shared backing array.</returns>
    [Benchmark]
    public int[] Batch16()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i = i + 16)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
            buffer[i + 4] = i + 4;
            buffer[i + 5] = i + 5;
            buffer[i + 6] = i + 6;
            buffer[i + 7] = i + 7;
            buffer[i + 8] = i + 8;
            buffer[i + 9] = i + 9;
            buffer[i + 10] = i + 10;
            buffer[i + 11] = i + 11;
            buffer[i + 12] = i + 12;
            buffer[i + 13] = i + 13;
            buffer[i + 14] = i + 14;
            buffer[i + 15] = i + 15;
        }

        return spanBack;
    }
}
}
Csproj:
<Project Sdk="Microsoft.NET.Sdk">
<!-- Executable project that hosts the benchmark program. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.0</TargetFramework>
<!-- Debug-symbol settings for the build output. -->
<DebugType>pdbonly</DebugType>
<DebugSymbols>true</DebugSymbols>
</PropertyGroup>
<ItemGroup>
<!-- Benchmark harness referenced by the program. -->
<PackageReference Include="BenchmarkDotNet" Version="0.12.0" />
</ItemGroup>
</Project>
dotnet run -c Release
的结果:
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
Intel Core i7-2600K CPU 3.40GHz (Sandy Bridge), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.1.100-preview1-014459
[Host] : .NET Core 3.1.0 (CoreCLR 4.700.19.50403, CoreFX 4.700.19.50410), X64 RyuJIT
DefaultJob : .NET Core 3.1.0 (CoreCLR 4.700.19.50403, CoreFX 4.700.19.50410), X64 RyuJIT
| Method | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------- |---------:|---------:|---------:|------:|------:|------:|------:|----------:|
| Default | 45.55 us | 0.081 us | 0.067 us | 1.00 | - | - | - | - |
| Batch4 | 34.23 us | 0.069 us | 0.065 us | 0.75 | - | - | - | 1 B |
| Batch4Simd | 22.23 us | 0.054 us | 0.051 us | 0.49 | - | - | - | - |
| Batch8 | 31.53 us | 0.160 us | 0.134 us | 0.69 | - | - | - | - |
| Batch16 | 32.10 us | 0.197 us | 0.164 us | 0.70 | - | - | - | - |
编辑:@harold 的建议
/// <summary>
/// Vector variant that stores the running index vector straight into each slice
/// (the slice is never read) and then advances the vector by <c>baseV</c>.
/// Relies on the buffer length being a multiple of <c>Vector&lt;int&gt;.Count</c>.
/// </summary>
/// <returns>The shared backing array.</returns>
[Benchmark]
public int[] BatchSimd_harold()
{
    Span<int> target = spanBack.AsSpan();
    int lanes = Vector<int>.Count;
    Vector<int> indices = this.accV; // local copy; the field keeps its starting values

    int offset = 0;
    while (offset < target.Length)
    {
        indices.CopyTo(target.Slice(offset, lanes));
        indices += baseV;
        offset += lanes;
    }

    return spanBack;
}
结果:
| Method | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |---------:|---------:|---------:|------:|------:|------:|------:|----------:|
| Default | 46.08 us | 0.331 us | 0.310 us | 1.00 | - | - | - | - |
| BatchSimd | 22.37 us | 0.150 us | 0.141 us | 0.49 | - | - | - | - |
| BatchSimd_harold | 18.72 us | 0.255 us | 0.239 us | 0.41 | - | - | - | - |
编辑 2:在较新的 CPU 上的结果:
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
Intel Core i7-6820HQ CPU 2.70GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.0.100
[Host] : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
DefaultJob : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
| Method | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |---------:|---------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
| Default | 59.05 us | 1.169 us | 2.362 us | 59.01 us | 1.00 | 0.00 | - | - | - | - |
| Batch4 | 44.39 us | 0.865 us | 0.722 us | 44.48 us | 0.76 | 0.03 | - | - | - | - |
| BatchSimd | 15.37 us | 0.364 us | 1.049 us | 15.07 us | 0.26 | 0.02 | - | - | - | - |
| BatchSimd_harold | 11.77 us | 0.219 us | 0.205 us | 11.80 us | 0.20 | 0.01 | - | - | - | - |
| Batch8 | 43.62 us | 0.871 us | 1.838 us | 43.46 us | 0.74 | 0.04 | - | - | - | - |
| Batch16 | 42.53 us | 0.846 us | 2.317 us | 41.92 us | 0.73 | 0.05 | - | - | - | - |
我正在寻找最快的 C#/.NET Core 方法,该方法能够用枚举 0、1、2、3、...填充 Span<int>
朴素的 for 循环(见下方)已经足够快,但可能还有更快的 SIMD 方案。
// Baseline: store each element's own index into the span, one element per iteration.
Span<int> buffer = ..; // snipped
for(var i = 0; i < buffer.Length; i++)
buffer[i] = i;
如何使用 SIMD 加速这种缓冲区填充方法?
下面是一些优化尝试。第一个 Default 是基本的 for 循环。第二个 Batch4 与之相同,但在单次循环迭代中初始化 4 个索引。第四个(Batch8)和第五个(Batch16)与第二个类似,但每次迭代初始化更多的索引(分别为 8 个和 16 个)。
第三个是使用 System.Numerics.Vector<T> 的实现。JIT 能识别这种数据类型,并把它的算术运算替换为对应的 SIMD 指令。在我的机器上,它的速度约为默认实现的两倍。
此处的缺点是缓冲区大小必须是 4 的倍数(Batch8 / Batch16 则分别要求是 8 / 16 的倍数)。如果不是,末尾剩余的元素必须在主循环之外手动处理。
using System;
using System.Numerics;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
namespace bench
{
/// <summary>
/// Console entry point that runs the <see cref="Sp"/> benchmark suite.
/// </summary>
internal class Program
{
    /// <summary>Starts the benchmark run.</summary>
    /// <param name="args">Command-line arguments; not used.</param>
    private static void Main(string[] args)
    {
        // The summary returned by the runner is not used here, so it is not stored.
        BenchmarkRunner.Run<Sp>();
    }
}
/// <summary>
/// Benchmark suite comparing several ways of filling an <c>int</c> buffer with the
/// sequence 0, 1, 2, ... (each element receives its own index).
/// </summary>
[SimpleJob]
[MemoryDiagnoser]
//[DisassemblyDiagnoser(printAsm: true, printIL: true, printSource: true, printDiff: true)]
public class Sp
{
    // Largest manual unroll factor used by the scalar benchmarks (see Batch16).
    private const int MaxUnroll = 16;

    // Backing array shared by every benchmark; each benchmark overwrites it in place.
    private readonly int[] spanBack = new int[100000];

    // Per-iteration increment: every lane holds Vector<int>.Count.
    private readonly Vector<int> baseV;

    // Starting lane values 0, 1, ..., Vector<int>.Count - 1.
    private readonly Vector<int> accV;

    /// <summary>
    /// Validates the buffer size and builds the vector constants for whatever
    /// <see cref="Vector{T}.Count"/> the current hardware reports.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The buffer length is not a multiple of the vector width or of the largest unroll factor.
    /// </exception>
    public Sp()
    {
        int lanes = Vector<int>.Count;

        // None of the batched loops handle a tail, so the length must divide evenly
        // by the vector width and by every unroll factor (4 and 8 both divide 16).
        if (spanBack.Length % lanes != 0 || spanBack.Length % MaxUnroll != 0)
        {
            throw new InvalidOperationException("Invalid array size");
        }

        // Build the ramp 0..lanes-1 at run time instead of hard-coding widths 4 and 8,
        // so any vector width is supported.
        var ramp = new int[lanes];
        for (int lane = 0; lane < lanes; lane++)
        {
            ramp[lane] = lane;
        }

        accV = new Vector<int>(ramp);
        baseV = new Vector<int>(lanes); // broadcasts the width into every lane
    }

    /// <summary>Baseline: one element written per loop iteration.</summary>
    /// <returns>The shared backing array.</returns>
    [Benchmark(Baseline = true)]
    public int[] Default()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i++)
        {
            buffer[i] = i;
        }

        return spanBack;
    }

    /// <summary>Scalar loop manually unrolled to 4 elements per iteration.</summary>
    /// <returns>The shared backing array.</returns>
    [Benchmark]
    public int[] Batch4()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i = i + 4)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
        }

        return spanBack;
    }

    /// <summary>
    /// Vector variant that loads each slice, adds the running index vector to it and
    /// stores the sum back.
    /// </summary>
    /// <remarks>
    /// NOTE: the sum includes the slice's current contents, so the output equals
    /// 0, 1, 2, ... only when the buffer holds zeros on entry.
    /// </remarks>
    /// <returns>The shared backing array.</returns>
    [Benchmark]
    public int[] BatchSimd()
    {
        int batchSize = Vector<int>.Count;
        var accV = this.accV; // local copy; the readonly field itself is never modified
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i = i + batchSize)
        {
            var currentSlice = buffer.Slice(i, batchSize);
            var v = new Vector<int>(currentSlice);
            v = v + accV;
            accV = accV + baseV; // advance every lane by one vector width
            v.CopyTo(currentSlice);
        }

        return spanBack;
    }

    /// <summary>Scalar loop manually unrolled to 8 elements per iteration.</summary>
    /// <returns>The shared backing array.</returns>
    [Benchmark]
    public int[] Batch8()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i = i + 8)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
            buffer[i + 4] = i + 4;
            buffer[i + 5] = i + 5;
            buffer[i + 6] = i + 6;
            buffer[i + 7] = i + 7;
        }

        return spanBack;
    }

    /// <summary>Scalar loop manually unrolled to 16 elements per iteration.</summary>
    /// <returns>The shared backing array.</returns>
    [Benchmark]
    public int[] Batch16()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i = i + 16)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
            buffer[i + 4] = i + 4;
            buffer[i + 5] = i + 5;
            buffer[i + 6] = i + 6;
            buffer[i + 7] = i + 7;
            buffer[i + 8] = i + 8;
            buffer[i + 9] = i + 9;
            buffer[i + 10] = i + 10;
            buffer[i + 11] = i + 11;
            buffer[i + 12] = i + 12;
            buffer[i + 13] = i + 13;
            buffer[i + 14] = i + 14;
            buffer[i + 15] = i + 15;
        }

        return spanBack;
    }
}
}
Csproj:
<Project Sdk="Microsoft.NET.Sdk">
<!-- Executable project that hosts the benchmark program. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.0</TargetFramework>
<!-- Debug-symbol settings for the build output. -->
<DebugType>pdbonly</DebugType>
<DebugSymbols>true</DebugSymbols>
</PropertyGroup>
<ItemGroup>
<!-- Benchmark harness referenced by the program. -->
<PackageReference Include="BenchmarkDotNet" Version="0.12.0" />
</ItemGroup>
</Project>
dotnet run -c Release
的结果:
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
Intel Core i7-2600K CPU 3.40GHz (Sandy Bridge), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.1.100-preview1-014459
[Host] : .NET Core 3.1.0 (CoreCLR 4.700.19.50403, CoreFX 4.700.19.50410), X64 RyuJIT
DefaultJob : .NET Core 3.1.0 (CoreCLR 4.700.19.50403, CoreFX 4.700.19.50410), X64 RyuJIT
| Method | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------- |---------:|---------:|---------:|------:|------:|------:|------:|----------:|
| Default | 45.55 us | 0.081 us | 0.067 us | 1.00 | - | - | - | - |
| Batch4 | 34.23 us | 0.069 us | 0.065 us | 0.75 | - | - | - | 1 B |
| Batch4Simd | 22.23 us | 0.054 us | 0.051 us | 0.49 | - | - | - | - |
| Batch8 | 31.53 us | 0.160 us | 0.134 us | 0.69 | - | - | - | - |
| Batch16 | 32.10 us | 0.197 us | 0.164 us | 0.70 | - | - | - | - |
编辑:@harold 的建议
/// <summary>
/// Vector variant that stores the running index vector straight into each slice
/// (the slice is never read) and then advances the vector by <c>baseV</c>.
/// Relies on the buffer length being a multiple of <c>Vector&lt;int&gt;.Count</c>.
/// </summary>
/// <returns>The shared backing array.</returns>
[Benchmark]
public int[] BatchSimd_harold()
{
    Span<int> target = spanBack.AsSpan();
    int lanes = Vector<int>.Count;
    Vector<int> indices = this.accV; // local copy; the field keeps its starting values

    int offset = 0;
    while (offset < target.Length)
    {
        indices.CopyTo(target.Slice(offset, lanes));
        indices += baseV;
        offset += lanes;
    }

    return spanBack;
}
结果:
| Method | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |---------:|---------:|---------:|------:|------:|------:|------:|----------:|
| Default | 46.08 us | 0.331 us | 0.310 us | 1.00 | - | - | - | - |
| BatchSimd | 22.37 us | 0.150 us | 0.141 us | 0.49 | - | - | - | - |
| BatchSimd_harold | 18.72 us | 0.255 us | 0.239 us | 0.41 | - | - | - | - |
编辑 2:在较新的 CPU 上的结果:
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
Intel Core i7-6820HQ CPU 2.70GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.0.100
[Host] : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
DefaultJob : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
| Method | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |---------:|---------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
| Default | 59.05 us | 1.169 us | 2.362 us | 59.01 us | 1.00 | 0.00 | - | - | - | - |
| Batch4 | 44.39 us | 0.865 us | 0.722 us | 44.48 us | 0.76 | 0.03 | - | - | - | - |
| BatchSimd | 15.37 us | 0.364 us | 1.049 us | 15.07 us | 0.26 | 0.02 | - | - | - | - |
| BatchSimd_harold | 11.77 us | 0.219 us | 0.205 us | 11.80 us | 0.20 | 0.01 | - | - | - | - |
| Batch8 | 43.62 us | 0.871 us | 1.838 us | 43.46 us | 0.74 | 0.04 | - | - | - | - |
| Batch16 | 42.53 us | 0.846 us | 2.317 us | 41.92 us | 0.73 | 0.05 | - | - | - | - |