使用 SIMD (System.Numerics) 编写向量求和函数并使其比 for 循环更快
Writing a vector sum function with SIMD (System.Numerics) and making it faster than a for loop
我写了一个函数来使用 SIMD (System.Numerics.Vector
) 将 double[]
数组的所有元素相加,性能比 naïve 方法差。
在我的计算机上 Vector<double>.Count
是 4,这意味着我可以创建一个包含 4 个值的累加器,并且 运行 通过数组将元素按组相加。
例如一个 10 元素的数组,有一个 4 元素的累加器和 2 个剩余元素我会得到
// | loop | remainder
acc[0] = vector[0] + vector[4] + vector[8]
acc[1] = vector[1] + vector[5] + vector[9]
acc[2] = vector[2] + vector[6]
acc[3] = vector[3] + vector[7]
结果sum = acc[0]+acc[1]+acc[2]+acc[3]
下面的代码产生了正确的结果,但与仅将值相加的循环相比速度不高
public static double SumSimd(this Span<double> a)
{
var n = System.Numerics.Vector<double>.Count;
var count = a.Length;
// divide array into n=4 element groups
// Example, 57 = 14*4 + 3
var groups = Math.DivRem(count, n, out int remain);
var buffer = new double[n];
// Create buffer with remaining elements (not in groups)
a.Slice(groups*n, remain).CopyTo(buffer);
// Scan through all groups and accumulate
var accumulator = new System.Numerics.Vector<double>(buffer);
for (int i = 0; i < groups; i++)
{
//var next = new System.Numerics.Vector<double>(a, n * i);
var next = new System.Numerics.Vector<double>(a.Slice(n * i, n));
accumulator += next;
}
var sum = 0.0;
// Add up the elements of the accumulator vs
for (int j = 0; j < n; j++)
{
sum += accumulator[j];
}
return sum;
}
所以我的问题是为什么我没有意识到 SIMD 的任何好处?
基线
基线代码如下所示
public static double LinAlgSum(this ReadOnlySpan<double> span)
{
double sum = 0;
for (int i = 0; i < span.Length; i++)
{
sum += span[i];
}
return sum;
}
在与上述比较的 SIMD 代码基准测试中,size=7
的 SIMD 代码慢 5 倍,size=144
慢 2.5 倍,size=770
大致相同。
我运行正在使用 BenchmarkDotNet
发布模式。这里是 driver class
[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)]
[CategoriesColumn]
public class LinearAlgebraBench
{
[Params(7, 35, 77, 144, 195, 311, 722)]
public int Size { get; set; }
[IterationSetup]
public void SetupData()
{
A = new LinearAlgebra.Vector(Size, (iter) => 2 * Size - iter).ToArray();
B = new LinearAlgebra.Vector(Size, (iter) => Size/2 + 2* iter).ToArray();
}
public double[] A { get; set; }
public double[] B { get; set; }
[BenchmarkCategory("Sum"), Benchmark(Baseline = true)]
public double BenchLinAlgSum()
{
return LinearAlgebra.LinearAlgebra.Sum(A.AsSpan().AsReadOnly());
}
[BenchmarkCategory("Sum"), Benchmark]
public double BenchSimdSum()
{
return LinearAlgebra.LinearAlgebra.SumSimd(A);
}
}
我建议您看一下这篇探索 SIMD performance in .Net 的文章。
使用常规向量化求和的整体算法看起来相同。一个区别是在切片数组时可以避免乘法:
while (i < lastBlockIndex)
{
vresult += new Vector<int>(source.Slice(i));
i += Vector<int>.Count;
}
一次乘法对性能的影响应该很小,但对于这种代码,它可能是相关的。
还值得注意的是,编译器似乎无法使用通用 API 生成非常高效的 SIMD 代码。对 32768 个项目求和的性能:
- SumUnrolled - 8,979.690 纳秒
- SumVectorT - 6,689.829 纳秒
- SumIntrinstics - 2,200.996 纳秒
因此,SIMD 的通用版本仅获得约 30% 的性能,而内部版本获得约 400% 的性能,接近理论最大值。
试试这个版本。它使用四个独立的累加器试图隐藏 vaddpd
指令的延迟,这在现代 AVX CPU 上为 3-4 个周期。未经测试。
public static double vectorSum( this ReadOnlySpan<double> span )
{
int vs = Vector<double>.Count;
int end = span.Length;
int endVectors = ( end / vs ) * vs;
// Using 4 independent accumulators because on modern CPUs the latency of `vaddpd` is 3-4 cycles.
// One batch consumes 4 vectors.
int endBatches = ( endVectors / 4 ) * 4;
Vector<double> a0 = Vector<double>.Zero;
Vector<double> a1 = Vector<double>.Zero;
Vector<double> a2 = Vector<double>.Zero;
Vector<double> a3 = Vector<double>.Zero;
// Handle majority of data unrolling by 4 vectors (e.g. 16 scalars with AVX)
int i;
for( i = 0; i < endBatches; i += vs * 4 )
{
a0 += new Vector<double>( span.Slice( i, vs ) );
a1 += new Vector<double>( span.Slice( i + vs, vs ) );
a2 += new Vector<double>( span.Slice( i + vs * 2, vs ) );
a3 += new Vector<double>( span.Slice( i + vs * 3, vs ) );
}
// Handle a few remaining complete vectors
for( ; i < endVectors; i += vs )
a0 += new Vector<double>( span.Slice( i, vs ) );
// Add the accumulators together
a0 += a1;
a2 += a3;
a0 += a2;
// Compute horizontal sum of a0
double sum = 0;
for( int j = 0; j < vs; j++ )
sum += a0[ j ];
// Add the remaining few scalars
for( ; i < end; i++ )
sum += span[ i ];
return sum;
}
没有进行基准测试,但查看了 sharplab.io 上的反汇编。虽然不如等效的 C++ 好(循环中有太多标量代码),但它看起来并不糟糕:使用 AVX,没有函数调用或在主循环中不需要 loads/stores。
根据@JonasH 的回答
It is also worth noting that the compiler does not seem to produce very efficient SIMD code with the generic API.
我不同意。只有确保该方法得到正确实施才值得。在某些情况下 - 是的,直接使用 Intrinsics
而不是 Numerics
向量会产生很大的提升,但并非总是如此。
这里的问题是测量非常小的迭代。 Benchmark.NET一般做不到。可能的解决方案是将目标方法包装在一个循环中。
对我来说,写一个有代表性的基准测试是一项艰苦的工作,我可能做得不够好。不过我会努力的。
public class SumTest
{
[Params(7, 35, 77, 144, 195, 311, 722)]
public int Size { get; set; }
[IterationSetup]
public void SetupData()
{
A = Enumerable.Range(0, Size).Select(x => 1.1).ToArray();
}
public double[] A { get; set; }
[BenchmarkCategory("Sum"), Benchmark(Baseline = true)]
public double BenchScalarSum()
{
double result = 0;
for (int i = 0; i < 10000; i++)
result = SumScalar(A);
return result;
}
[BenchmarkCategory("Sum"), Benchmark]
public double BenchNumericsSum()
{
double result = 0;
for (int i = 0; i < 10000; i++)
result = SumNumerics(A);
return result;
}
[BenchmarkCategory("Sum"), Benchmark]
public double BenchIntrinsicsSum()
{
double result = 0;
for (int i = 0; i < 10000; i++)
result = SumIntrinsics(A);
return result;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public double SumScalar(ReadOnlySpan<double> numbers)
{
double result = 0;
for (int i = 0; i < numbers.Length; i++)
{
result += numbers[i];
}
return result;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public double SumNumerics(ReadOnlySpan<double> numbers)
{
ReadOnlySpan<Vector<double>> vectors = MemoryMarshal.Cast<double, Vector<double>>(numbers);
Vector<double> acc = Vector<double>.Zero;
for (int i = 0; i < vectors.Length; i++)
{
acc += vectors[i];
}
double result = Vector.Dot(acc, Vector<double>.One);
for (int i = vectors.Length * Vector<double>.Count; i < numbers.Length; i++)
result += numbers[i];
return result;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public double SumIntrinsics(ReadOnlySpan<double> numbers)
{
ReadOnlySpan<Vector256<double>> vectors = MemoryMarshal.Cast<double, Vector256<double>>(numbers);
Vector256<double> acc = Vector256<double>.Zero;
for (int i = 0; i < vectors.Length; i++)
{
acc = Avx.Add(acc, vectors[i]);
}
Vector128<double> r = Sse2.Add(acc.GetUpper(), acc.GetLower());
double result = Sse3.HorizontalAdd(r, r).GetElement(0); // I'm aware that VHADDPD probably not enough efficient but leaving it for simplicity here
for (int i = vectors.Length * Vector256<double>.Count; i < numbers.Length; i++)
result += numbers[i];
return result;
}
}
BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042
Intel Core i7-4700HQ CPU 2.40GHz (Haswell), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=5.0.203
[Host] : .NET Core 5.0.6 (CoreCLR 5.0.621.22011, CoreFX 5.0.621.22011), X64 RyuJIT
Job-NQCIIR : .NET Core 5.0.6 (CoreCLR 5.0.621.22011, CoreFX 5.0.621.22011), X64 RyuJIT
Method
Size
Mean
Error
StdDev
Median
Ratio
RatioSD
BenchScalarSum
7
53.34 us
0.056 us
0.050 us
53.30 us
1.00
0.00
BenchNumericsSum
7
48.95 us
2.262 us
6.671 us
44.95 us
0.95
0.10
BenchIntrinsicsSum
7
55.85 us
2.089 us
6.128 us
51.90 us
1.07
0.10
BenchScalarSum
35
258.46 us
2.319 us
3.541 us
257.00 us
1.00
0.00
BenchNumericsSum
35
94.14 us
1.989 us
5.705 us
91.00 us
0.36
0.02
BenchIntrinsicsSum
35
90.82 us
2.465 us
7.073 us
92.10 us
0.35
0.03
BenchScalarSum
77
541.18 us
10.401 us
11.129 us
536.95 us
1.00
0.00
BenchNumericsSum
77
161.05 us
3.171 us
7.475 us
159.30 us
0.30
0.01
BenchIntrinsicsSum
77
153.19 us
3.063 us
7.906 us
150.50 us
0.29
0.02
BenchScalarSum
144
1,166.72 us
6.945 us
5.422 us
1,166.10 us
1.00
0.00
BenchNumericsSum
144
294.72 us
5.675 us
10.520 us
292.50 us
0.26
0.01
BenchIntrinsicsSum
144
287.18 us
5.661 us
13.671 us
284.20 us
0.25
0.01
BenchScalarSum
195
1,671.83 us
32.634 us
34.918 us
1,663.30 us
1.00
0.00
BenchNumericsSum
195
443.19 us
7.916 us
11.354 us
443.10 us
0.26
0.01
BenchIntrinsicsSum
195
444.21 us
8.876 us
7.868 us
443.55 us
0.27
0.01
BenchScalarSum
311
2,742.78 us
35.797 us
29.892 us
2,745.70 us
1.00
0.00
BenchNumericsSum
311
778.00 us
34.173 us
100.759 us
719.20 us
0.30
0.04
BenchIntrinsicsSum
311
776.30 us
29.304 us
86.404 us
727.45 us
0.29
0.03
BenchScalarSum
722
6,607.72 us
79.263 us
74.143 us
6,601.20 us
1.00
0.00
BenchNumericsSum
722
1,870.81 us
43.390 us
127.936 us
1,850.30 us
0.28
0.02
BenchIntrinsicsSum
722
1,867.57 us
39.718 us
117.110 us
1,851.50 us
0.28
0.02
看起来使用 Vectors 的效率至少不低于基线方法。
作为奖励,让我们看看使用 https://sharplab.io/ (x64)
的输出汇编代码
SumTest.SumScalar(System.ReadOnlySpan`1<Double>)
L0000: vzeroupper
L0003: mov rax, [rdx]
L0006: mov edx, [rdx+8]
L0009: vxorps xmm0, xmm0, xmm0
L000d: xor ecx, ecx
L000f: test edx, edx
L0011: jle short L0022
L0013: movsxd r8, ecx
L0016: vaddsd xmm0, xmm0, [rax+r8*8]
L001c: inc ecx
L001e: cmp ecx, edx
L0020: jl short L0013
L0022: ret
SumTest.SumNumerics(System.ReadOnlySpan`1<Double>)
L0000: sub rsp, 0x28
L0004: vzeroupper
L0007: mov rax, [rdx]
L000a: mov edx, [rdx+8]
L000d: mov ecx, edx
L000f: shl rcx, 3
L0013: shr rcx, 5
L0017: cmp rcx, 0x7fffffff
L001e: ja short L0078
L0020: vxorps ymm0, ymm0, ymm0
L0024: xor r8d, r8d
L0027: test ecx, ecx
L0029: jle short L0040
L002b: movsxd r9, r8d
L002e: shl r9, 5
L0032: vaddpd ymm0, ymm0, [rax+r9]
L0038: inc r8d
L003b: cmp r8d, ecx
L003e: jl short L002b
L0040: vmulpd ymm0, ymm0, [SumTest.SumNumerics(System.ReadOnlySpan`1<Double>)]
L0048: vhaddpd ymm0, ymm0, ymm0
L004c: vextractf128 xmm1, ymm0, 1
L0052: vaddpd xmm0, xmm0, xmm1
L0056: shl ecx, 2
L0059: cmp ecx, edx
L005b: jge short L0070
L005d: cmp ecx, edx
L005f: jae short L007e
L0061: movsxd r8, ecx
L0064: vaddsd xmm0, xmm0, [rax+r8*8]
L006a: inc ecx
L006c: cmp ecx, edx
L006e: jl short L005d
L0070: vzeroupper
L0073: add rsp, 0x28
L0077: ret
L0078: call 0x00007ffc9de2b710
L007d: int3
L007e: call 0x00007ffc9de2bc70
L0083: int3
SumTest.SumIntrinsics(System.ReadOnlySpan`1<Double>)
L0000: sub rsp, 0x28
L0004: vzeroupper
L0007: mov rax, [rdx]
L000a: mov edx, [rdx+8]
L000d: mov ecx, edx
L000f: shl rcx, 3
L0013: shr rcx, 5
L0017: cmp rcx, 0x7fffffff
L001e: ja short L0070
L0020: vxorps ymm0, ymm0, ymm0
L0024: xor r8d, r8d
L0027: test ecx, ecx
L0029: jle short L0040
L002b: movsxd r9, r8d
L002e: shl r9, 5
L0032: vaddpd ymm0, ymm0, [rax+r9]
L0038: inc r8d
L003b: cmp r8d, ecx
L003e: jl short L002b
L0040: vextractf128 xmm1, ymm0, 1
L0046: vaddpd xmm0, xmm1, xmm0
L004a: vhaddpd xmm0, xmm0, xmm0
L004e: shl ecx, 2
L0051: cmp ecx, edx
L0053: jge short L0068
L0055: cmp ecx, edx
L0057: jae short L0076
L0059: movsxd r8, ecx
L005c: vaddsd xmm0, xmm0, [rax+r8*8]
L0062: inc ecx
L0064: cmp ecx, edx
L0066: jl short L0055
L0068: vzeroupper
L006b: add rsp, 0x28
L006f: ret
L0070: call 0x00007ffc9de2b710
L0075: int3
L0076: call 0x00007ffc9de2bc70
L007b: int3
在这里您可以看到 JIT 为 Vector<T>
生成的代码与为 Vector256<T>
.
生成的代码几乎相同
我写了一个函数来使用 SIMD (System.Numerics.Vector
) 将 double[]
数组的所有元素相加,性能比 naïve 方法差。
在我的计算机上 Vector<double>.Count
是 4,这意味着我可以创建一个包含 4 个值的累加器,并且 运行 通过数组将元素按组相加。
例如一个 10 元素的数组,有一个 4 元素的累加器和 2 个剩余元素我会得到
// | loop | remainder
acc[0] = vector[0] + vector[4] + vector[8]
acc[1] = vector[1] + vector[5] + vector[9]
acc[2] = vector[2] + vector[6]
acc[3] = vector[3] + vector[7]
结果sum = acc[0]+acc[1]+acc[2]+acc[3]
下面的代码产生了正确的结果,但与仅将值相加的循环相比速度不高
public static double SumSimd(this Span<double> a)
{
var n = System.Numerics.Vector<double>.Count;
var count = a.Length;
// divide array into n=4 element groups
// Example, 57 = 14*4 + 3
var groups = Math.DivRem(count, n, out int remain);
var buffer = new double[n];
// Create buffer with remaining elements (not in groups)
a.Slice(groups*n, remain).CopyTo(buffer);
// Scan through all groups and accumulate
var accumulator = new System.Numerics.Vector<double>(buffer);
for (int i = 0; i < groups; i++)
{
//var next = new System.Numerics.Vector<double>(a, n * i);
var next = new System.Numerics.Vector<double>(a.Slice(n * i, n));
accumulator += next;
}
var sum = 0.0;
// Add up the elements of the accumulator vs
for (int j = 0; j < n; j++)
{
sum += accumulator[j];
}
return sum;
}
所以我的问题是为什么我没有意识到 SIMD 的任何好处?
基线
基线代码如下所示
public static double LinAlgSum(this ReadOnlySpan<double> span)
{
double sum = 0;
for (int i = 0; i < span.Length; i++)
{
sum += span[i];
}
return sum;
}
在与上述比较的 SIMD 代码基准测试中,size=7
的 SIMD 代码慢 5 倍,size=144
慢 2.5 倍,size=770
大致相同。
我运行正在使用 BenchmarkDotNet
发布模式。这里是 driver class
[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)]
[CategoriesColumn]
public class LinearAlgebraBench
{
[Params(7, 35, 77, 144, 195, 311, 722)]
public int Size { get; set; }
[IterationSetup]
public void SetupData()
{
A = new LinearAlgebra.Vector(Size, (iter) => 2 * Size - iter).ToArray();
B = new LinearAlgebra.Vector(Size, (iter) => Size/2 + 2* iter).ToArray();
}
public double[] A { get; set; }
public double[] B { get; set; }
[BenchmarkCategory("Sum"), Benchmark(Baseline = true)]
public double BenchLinAlgSum()
{
return LinearAlgebra.LinearAlgebra.Sum(A.AsSpan().AsReadOnly());
}
[BenchmarkCategory("Sum"), Benchmark]
public double BenchSimdSum()
{
return LinearAlgebra.LinearAlgebra.SumSimd(A);
}
}
我建议您看一下这篇探索 SIMD performance in .Net 的文章。
使用常规向量化求和的整体算法看起来相同。一个区别是在切片数组时可以避免乘法:
while (i < lastBlockIndex)
{
vresult += new Vector<int>(source.Slice(i));
i += Vector<int>.Count;
}
一次乘法对性能的影响应该很小,但对于这种代码,它可能是相关的。
还值得注意的是,编译器似乎无法使用通用 API 生成非常高效的 SIMD 代码。对 32768 个项目求和的性能:
- SumUnrolled - 8,979.690 纳秒
- SumVectorT - 6,689.829 纳秒
- SumIntrinstics - 2,200.996 纳秒
因此,SIMD 的通用版本仅获得约 30% 的性能,而内部版本获得约 400% 的性能,接近理论最大值。
试试这个版本。它使用四个独立的累加器试图隐藏 vaddpd
指令的延迟,这在现代 AVX CPU 上为 3-4 个周期。未经测试。
public static double vectorSum( this ReadOnlySpan<double> span )
{
int vs = Vector<double>.Count;
int end = span.Length;
int endVectors = ( end / vs ) * vs;
// Using 4 independent accumulators because on modern CPUs the latency of `vaddpd` is 3-4 cycles.
// One batch consumes 4 vectors.
int endBatches = ( endVectors / 4 ) * 4;
Vector<double> a0 = Vector<double>.Zero;
Vector<double> a1 = Vector<double>.Zero;
Vector<double> a2 = Vector<double>.Zero;
Vector<double> a3 = Vector<double>.Zero;
// Handle majority of data unrolling by 4 vectors (e.g. 16 scalars with AVX)
int i;
for( i = 0; i < endBatches; i += vs * 4 )
{
a0 += new Vector<double>( span.Slice( i, vs ) );
a1 += new Vector<double>( span.Slice( i + vs, vs ) );
a2 += new Vector<double>( span.Slice( i + vs * 2, vs ) );
a3 += new Vector<double>( span.Slice( i + vs * 3, vs ) );
}
// Handle a few remaining complete vectors
for( ; i < endVectors; i += vs )
a0 += new Vector<double>( span.Slice( i, vs ) );
// Add the accumulators together
a0 += a1;
a2 += a3;
a0 += a2;
// Compute horizontal sum of a0
double sum = 0;
for( int j = 0; j < vs; j++ )
sum += a0[ j ];
// Add the remaining few scalars
for( ; i < end; i++ )
sum += span[ i ];
return sum;
}
没有进行基准测试,但查看了 sharplab.io 上的反汇编。虽然不如等效的 C++ 好(循环中有太多标量代码),但它看起来并不糟糕:使用 AVX,没有函数调用或在主循环中不需要 loads/stores。
根据@JonasH 的回答
It is also worth noting that the compiler does not seem to produce very efficient SIMD code with the generic API.
我不同意。只有确保该方法得到正确实施才值得。在某些情况下 - 是的,直接使用 Intrinsics
而不是 Numerics
向量会产生很大的提升,但并非总是如此。
这里的问题是测量非常小的迭代。 Benchmark.NET一般做不到。可能的解决方案是将目标方法包装在一个循环中。
对我来说,写一个有代表性的基准测试是一项艰苦的工作,我可能做得不够好。不过我会努力的。
public class SumTest
{
[Params(7, 35, 77, 144, 195, 311, 722)]
public int Size { get; set; }
[IterationSetup]
public void SetupData()
{
A = Enumerable.Range(0, Size).Select(x => 1.1).ToArray();
}
public double[] A { get; set; }
[BenchmarkCategory("Sum"), Benchmark(Baseline = true)]
public double BenchScalarSum()
{
double result = 0;
for (int i = 0; i < 10000; i++)
result = SumScalar(A);
return result;
}
[BenchmarkCategory("Sum"), Benchmark]
public double BenchNumericsSum()
{
double result = 0;
for (int i = 0; i < 10000; i++)
result = SumNumerics(A);
return result;
}
[BenchmarkCategory("Sum"), Benchmark]
public double BenchIntrinsicsSum()
{
double result = 0;
for (int i = 0; i < 10000; i++)
result = SumIntrinsics(A);
return result;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public double SumScalar(ReadOnlySpan<double> numbers)
{
double result = 0;
for (int i = 0; i < numbers.Length; i++)
{
result += numbers[i];
}
return result;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public double SumNumerics(ReadOnlySpan<double> numbers)
{
ReadOnlySpan<Vector<double>> vectors = MemoryMarshal.Cast<double, Vector<double>>(numbers);
Vector<double> acc = Vector<double>.Zero;
for (int i = 0; i < vectors.Length; i++)
{
acc += vectors[i];
}
double result = Vector.Dot(acc, Vector<double>.One);
for (int i = vectors.Length * Vector<double>.Count; i < numbers.Length; i++)
result += numbers[i];
return result;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public double SumIntrinsics(ReadOnlySpan<double> numbers)
{
ReadOnlySpan<Vector256<double>> vectors = MemoryMarshal.Cast<double, Vector256<double>>(numbers);
Vector256<double> acc = Vector256<double>.Zero;
for (int i = 0; i < vectors.Length; i++)
{
acc = Avx.Add(acc, vectors[i]);
}
Vector128<double> r = Sse2.Add(acc.GetUpper(), acc.GetLower());
double result = Sse3.HorizontalAdd(r, r).GetElement(0); // I'm aware that VHADDPD probably not enough efficient but leaving it for simplicity here
for (int i = vectors.Length * Vector256<double>.Count; i < numbers.Length; i++)
result += numbers[i];
return result;
}
}
BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042
Intel Core i7-4700HQ CPU 2.40GHz (Haswell), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=5.0.203
[Host] : .NET Core 5.0.6 (CoreCLR 5.0.621.22011, CoreFX 5.0.621.22011), X64 RyuJIT
Job-NQCIIR : .NET Core 5.0.6 (CoreCLR 5.0.621.22011, CoreFX 5.0.621.22011), X64 RyuJIT
Method | Size | Mean | Error | StdDev | Median | Ratio | RatioSD |
---|---|---|---|---|---|---|---|
BenchScalarSum | 7 | 53.34 us | 0.056 us | 0.050 us | 53.30 us | 1.00 | 0.00 |
BenchNumericsSum | 7 | 48.95 us | 2.262 us | 6.671 us | 44.95 us | 0.95 | 0.10 |
BenchIntrinsicsSum | 7 | 55.85 us | 2.089 us | 6.128 us | 51.90 us | 1.07 | 0.10 |
BenchScalarSum | 35 | 258.46 us | 2.319 us | 3.541 us | 257.00 us | 1.00 | 0.00 |
BenchNumericsSum | 35 | 94.14 us | 1.989 us | 5.705 us | 91.00 us | 0.36 | 0.02 |
BenchIntrinsicsSum | 35 | 90.82 us | 2.465 us | 7.073 us | 92.10 us | 0.35 | 0.03 |
BenchScalarSum | 77 | 541.18 us | 10.401 us | 11.129 us | 536.95 us | 1.00 | 0.00 |
BenchNumericsSum | 77 | 161.05 us | 3.171 us | 7.475 us | 159.30 us | 0.30 | 0.01 |
BenchIntrinsicsSum | 77 | 153.19 us | 3.063 us | 7.906 us | 150.50 us | 0.29 | 0.02 |
BenchScalarSum | 144 | 1,166.72 us | 6.945 us | 5.422 us | 1,166.10 us | 1.00 | 0.00 |
BenchNumericsSum | 144 | 294.72 us | 5.675 us | 10.520 us | 292.50 us | 0.26 | 0.01 |
BenchIntrinsicsSum | 144 | 287.18 us | 5.661 us | 13.671 us | 284.20 us | 0.25 | 0.01 |
BenchScalarSum | 195 | 1,671.83 us | 32.634 us | 34.918 us | 1,663.30 us | 1.00 | 0.00 |
BenchNumericsSum | 195 | 443.19 us | 7.916 us | 11.354 us | 443.10 us | 0.26 | 0.01 |
BenchIntrinsicsSum | 195 | 444.21 us | 8.876 us | 7.868 us | 443.55 us | 0.27 | 0.01 |
BenchScalarSum | 311 | 2,742.78 us | 35.797 us | 29.892 us | 2,745.70 us | 1.00 | 0.00 |
BenchNumericsSum | 311 | 778.00 us | 34.173 us | 100.759 us | 719.20 us | 0.30 | 0.04 |
BenchIntrinsicsSum | 311 | 776.30 us | 29.304 us | 86.404 us | 727.45 us | 0.29 | 0.03 |
BenchScalarSum | 722 | 6,607.72 us | 79.263 us | 74.143 us | 6,601.20 us | 1.00 | 0.00 |
BenchNumericsSum | 722 | 1,870.81 us | 43.390 us | 127.936 us | 1,850.30 us | 0.28 | 0.02 |
BenchIntrinsicsSum | 722 | 1,867.57 us | 39.718 us | 117.110 us | 1,851.50 us | 0.28 | 0.02 |
看起来使用 Vectors 的效率至少不低于基线方法。
作为奖励,让我们看看使用 https://sharplab.io/ (x64)
的输出汇编代码SumTest.SumScalar(System.ReadOnlySpan`1<Double>)
L0000: vzeroupper
L0003: mov rax, [rdx]
L0006: mov edx, [rdx+8]
L0009: vxorps xmm0, xmm0, xmm0
L000d: xor ecx, ecx
L000f: test edx, edx
L0011: jle short L0022
L0013: movsxd r8, ecx
L0016: vaddsd xmm0, xmm0, [rax+r8*8]
L001c: inc ecx
L001e: cmp ecx, edx
L0020: jl short L0013
L0022: ret
SumTest.SumNumerics(System.ReadOnlySpan`1<Double>)
L0000: sub rsp, 0x28
L0004: vzeroupper
L0007: mov rax, [rdx]
L000a: mov edx, [rdx+8]
L000d: mov ecx, edx
L000f: shl rcx, 3
L0013: shr rcx, 5
L0017: cmp rcx, 0x7fffffff
L001e: ja short L0078
L0020: vxorps ymm0, ymm0, ymm0
L0024: xor r8d, r8d
L0027: test ecx, ecx
L0029: jle short L0040
L002b: movsxd r9, r8d
L002e: shl r9, 5
L0032: vaddpd ymm0, ymm0, [rax+r9]
L0038: inc r8d
L003b: cmp r8d, ecx
L003e: jl short L002b
L0040: vmulpd ymm0, ymm0, [SumTest.SumNumerics(System.ReadOnlySpan`1<Double>)]
L0048: vhaddpd ymm0, ymm0, ymm0
L004c: vextractf128 xmm1, ymm0, 1
L0052: vaddpd xmm0, xmm0, xmm1
L0056: shl ecx, 2
L0059: cmp ecx, edx
L005b: jge short L0070
L005d: cmp ecx, edx
L005f: jae short L007e
L0061: movsxd r8, ecx
L0064: vaddsd xmm0, xmm0, [rax+r8*8]
L006a: inc ecx
L006c: cmp ecx, edx
L006e: jl short L005d
L0070: vzeroupper
L0073: add rsp, 0x28
L0077: ret
L0078: call 0x00007ffc9de2b710
L007d: int3
L007e: call 0x00007ffc9de2bc70
L0083: int3
SumTest.SumIntrinsics(System.ReadOnlySpan`1<Double>)
L0000: sub rsp, 0x28
L0004: vzeroupper
L0007: mov rax, [rdx]
L000a: mov edx, [rdx+8]
L000d: mov ecx, edx
L000f: shl rcx, 3
L0013: shr rcx, 5
L0017: cmp rcx, 0x7fffffff
L001e: ja short L0070
L0020: vxorps ymm0, ymm0, ymm0
L0024: xor r8d, r8d
L0027: test ecx, ecx
L0029: jle short L0040
L002b: movsxd r9, r8d
L002e: shl r9, 5
L0032: vaddpd ymm0, ymm0, [rax+r9]
L0038: inc r8d
L003b: cmp r8d, ecx
L003e: jl short L002b
L0040: vextractf128 xmm1, ymm0, 1
L0046: vaddpd xmm0, xmm1, xmm0
L004a: vhaddpd xmm0, xmm0, xmm0
L004e: shl ecx, 2
L0051: cmp ecx, edx
L0053: jge short L0068
L0055: cmp ecx, edx
L0057: jae short L0076
L0059: movsxd r8, ecx
L005c: vaddsd xmm0, xmm0, [rax+r8*8]
L0062: inc ecx
L0064: cmp ecx, edx
L0066: jl short L0055
L0068: vzeroupper
L006b: add rsp, 0x28
L006f: ret
L0070: call 0x00007ffc9de2b710
L0075: int3
L0076: call 0x00007ffc9de2bc70
L007b: int3
在这里您可以看到 JIT 为 Vector<T>
生成的代码与为 Vector256<T>
.