如何使用 SIMD 计算数组中某个字节的出现次数?
How can I count the occurrence of a byte in array using SIMD?
给定以下输入字节:
var vBytes = new Vector<byte>(new byte[] {72, 101, 55, 08, 108, 111, 55, 87, 111, 114, 108, 55, 100, 55, 55, 20});
和给定的掩码:
var mask = new Vector<byte>(55);
如何在输入数组中找到字节数 55
?
我已经尝试 xoring vBytes
与 mask
:
var xored = Vector.Xor(mask, vBytes);
给出:
<127, 82, 0, 91, 91, 88, 0, 96, 88, 69, 91, 0, 83, 0, 0, 35>
但不知道如何从中得到计数。
为简单起见,我们假设输入字节长度始终等于 Vector<byte>.Count
.
的大小
感谢 Marc Gravell 的提示,以下工作:
var areEqual = Vector.Equals(vBytes, mask);
var negation = Vector.Negate(areEqual);
var count = Vector.Dot(negation, Vector<byte>.One);
Marc 有一个 blog post 关于这个主题的更多信息。
(以下想法的 AVX2 C 内在函数实现,以防具体示例有所帮助:How to count character occurrences using SIMD)
在 asm 中,您希望 pcmpeqb
生成 0 或 0xFF 的向量。被视为有符号整数,即 0/-1。
然后 使用 compare-result 作为整数值 和 psubb
将 0 / 1 添加到该元素的计数器。 (减 -1 = 加 +1)
这可能会在 256 次迭代后溢出,所以在此之前的某个时间,使用 psadbw
对 _mm_setzero_si128()
将那些无符号字节(没有溢出)水平求和为 64 位整数(一个 64 位整数每组 8 个字节)。然后paddq
累加64位总和
可以使用嵌套循环或仅在常规展开循环的末尾完成溢出前的累积。 psadbw
很快(因为它是视频编码的关键组成部分 motion-search),所以每 4 次比较甚至每 1 次累积并跳过 psubb
.[= 也不错35=]
有关 x86 的更多详细信息,请参阅 Agner Fog's optimization guides。根据他的指令表,psadbw xmm
/ vpsadbw ymm
运行s 在 Skylake 上每个时钟周期 1 个向量,有 3 个周期延迟。 (只有 1 uop 的 front-end 带宽。)上面提到的所有指令在多个端口上也是 single-uop 和 运行(因此吞吐量不一定相互冲突) .他们的 128 位版本只需要 SSE2。
如果你真的一次只有一个向量要计算,并且没有循环内存,那么可能 pcmpeqb
/ psadbw
/ pshufd
(将高半部分复制到low) / paddd
/ movd eax, xmm0
给你 255 * 整数寄存器中的匹配数。一个额外的向量指令(例如从零减去,或与 1 进行与运算,或 pabsb
(绝对值)将删除 x255 比例因子。
IDK 如何在 C# SIMD 中编写,但您肯定不想要一个dot-product!解包并转换为 FP 将比上面慢 4 倍,只是因为 fixed-width 向量比浮点数多 4 倍的字节,而 dpps
(_mm_dp_ps
) 是 不快。 4 微指令,Skylake 上每 1.5 个周期吞吐量一个。如果你做必须horizontal-sum除无符号字节以外的东西,请参阅Fastest way to do horizontal SSE vector sum (or other reduction)(我的回答也包括整数)。
或者如果 Vector.Dot
使用 pmaddubsw
/ pmaddwd
作为整数向量,那么这可能不会那么糟糕,但是对每个向量做 multi-step 水平求和与 psadbw
相比,比较结果很糟糕,尤其是与偶尔只进行水平求和的字节累加器相比。
或者,如果 C# 优化了与常量向量 1
的任何实际乘法。无论如何,这个答案的第一部分是您希望 CPU 成为 运行ning 的代码。无论您喜欢什么,都可以使用任何源代码实现它。
这里是 C:
中的快速 SSE2 实现
size_t memcount_sse2(const void *s, int c, size_t n) {
__m128i cv = _mm_set1_epi8(c), sum = _mm_setzero_si128(), acr0,acr1,acr2,acr3;
const char *p,*pe;
for(p = s; p != (char *)s+(n- (n % (252*16)));) {
for(acr0 = acr1 = acr2 = acr3 = _mm_setzero_si128(),pe = p+252*16; p != pe; p += 64) {
acr0 = _mm_add_epi8(acr0, _mm_cmpeq_epi8(cv, _mm_loadu_si128((const __m128i *)p)));
acr1 = _mm_add_epi8(acr1, _mm_cmpeq_epi8(cv, _mm_loadu_si128((const __m128i *)(p+16))));
acr2 = _mm_add_epi8(acr2, _mm_cmpeq_epi8(cv, _mm_loadu_si128((const __m128i *)(p+32))));
acr3 = _mm_add_epi8(acr3, _mm_cmpeq_epi8(cv, _mm_loadu_si128((const __m128i *)(p+48))));
__builtin_prefetch(p+1024);
}
sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_sub_epi8(_mm_setzero_si128(), acr0), _mm_setzero_si128()));
sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_sub_epi8(_mm_setzero_si128(), acr1), _mm_setzero_si128()));
sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_sub_epi8(_mm_setzero_si128(), acr2), _mm_setzero_si128()));
sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_sub_epi8(_mm_setzero_si128(), acr3), _mm_setzero_si128()));
}
// may require SSE4, rewrite this part for actual SSE2.
size_t count = _mm_extract_epi64(sum, 0) + _mm_extract_epi64(sum, 1);
// scalar cleanup. Could be optimized.
while(p != (char *)s + n) count += *p++ == c;
return count;
}
并查看:https://gist.github.com/powturbo 和 avx2 实现。
我知道我来晚了,但到目前为止 none 这里的答案实际上提供了完整的解决方案。这是我最好的尝试之一,源自 this Gist and the DotNet source code。所有功劳归于 DotNet 团队和社区成员(尤其是 @Peter Cordes)。
用法:
var bytes = Encoding.ASCII.GetBytes("The quick brown fox jumps over the lazy dog.");
var byteCount = bytes.OccurrencesOf(32);
var chars = "The quick brown fox jumps over the lazy dog.";
var charCount = chars.OccurrencesOf(' ');
代码:
public static class VectorExtensions
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint GetByteVector128SpanLength(nuint offset, int length) =>
((nuint)(uint)((length - (int)offset) & ~(Vector128<byte>.Count - 1)));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint GetByteVector256SpanLength(nuint offset, int length) =>
((nuint)(uint)((length - (int)offset) & ~(Vector256<byte>.Count - 1)));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nint GetCharVector128SpanLength(nint offset, nint length) =>
((length - offset) & ~(Vector128<ushort>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nint GetCharVector256SpanLength(nint offset, nint length) =>
((length - offset) & ~(Vector256<ushort>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> LoadVector128(ref byte start, nuint offset) =>
Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> LoadVector256(ref byte start, nuint offset) =>
Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ushort> LoadVector128(ref char start, nint offset) =>
Unsafe.ReadUnaligned<Vector128<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, offset)));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<ushort> LoadVector256(ref char start, nint offset) =>
Unsafe.ReadUnaligned<Vector256<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, offset)));
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private static unsafe int OccurrencesOf(ref byte searchSpace, byte value, int length) {
var lengthToExamine = ((nuint)length);
var offset = ((nuint)0);
var result = 0L;
if (Sse2.IsSupported || Avx2.IsSupported) {
if (31 < length) {
lengthToExamine = UnalignedCountVector128(ref searchSpace);
}
}
SequentialScan:
while (7 < lengthToExamine) {
ref byte current = ref Unsafe.AddByteOffset(ref searchSpace, offset);
if (value == current) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 1)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 2)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 3)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 4)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 5)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 6)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 7)) {
++result;
}
lengthToExamine -= 8;
offset += 8;
}
while (3 < lengthToExamine) {
ref byte current = ref Unsafe.AddByteOffset(ref searchSpace, offset);
if (value == current) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 1)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 2)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 3)) {
++result;
}
lengthToExamine -= 4;
offset += 4;
}
while (0 < lengthToExamine) {
if (value == Unsafe.AddByteOffset(ref searchSpace, offset)) {
++result;
}
--lengthToExamine;
++offset;
}
if (offset < ((nuint)(uint)length)) {
if (Avx2.IsSupported) {
if (0 != (((nuint)(uint)Unsafe.AsPointer(ref searchSpace) + offset) & (nuint)(Vector256<byte>.Count - 1))) {
var sum = Sse2.SumAbsoluteDifferences(Sse2.Subtract(Vector128<byte>.Zero, Sse2.CompareEqual(Vector128.Create(value), LoadVector128(ref searchSpace, offset))).AsByte(), Vector128<byte>.Zero).AsInt64();
offset += 16;
result += (sum.GetElement(0) + sum.GetElement(1));
}
lengthToExamine = GetByteVector256SpanLength(offset, length);
var searchMask = Vector256.Create(value);
if (127 < lengthToExamine) {
var sum = Vector256<long>.Zero;
do {
var accumulator0 = Vector256<byte>.Zero;
var accumulator1 = Vector256<byte>.Zero;
var accumulator2 = Vector256<byte>.Zero;
var accumulator3 = Vector256<byte>.Zero;
var loopIndex = ((nuint)0);
var loopLimit = Math.Min(255, (lengthToExamine / 128));
do {
accumulator0 = Avx2.Subtract(accumulator0, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, offset)));
accumulator1 = Avx2.Subtract(accumulator1, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 32))));
accumulator2 = Avx2.Subtract(accumulator2, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 64))));
accumulator3 = Avx2.Subtract(accumulator3, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 96))));
loopIndex++;
offset += 128;
} while (loopIndex < loopLimit);
lengthToExamine -= (128 * loopLimit);
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator0.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator1.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator2.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator3.AsByte(), Vector256<byte>.Zero).AsInt64());
} while (127 < lengthToExamine);
var sumX = Avx2.ExtractVector128(sum, 0);
var sumY = Avx2.ExtractVector128(sum, 1);
var sumZ = Sse2.Add(sumX, sumY);
result += (sumZ.GetElement(0) + sumZ.GetElement(1));
}
if (31 < lengthToExamine) {
var sum = Vector256<long>.Zero;
do {
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(Avx2.Subtract(Vector256<byte>.Zero, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, offset))).AsByte(), Vector256<byte>.Zero).AsInt64());
lengthToExamine -= 32;
offset += 32;
} while (31 < lengthToExamine);
var sumX = Avx2.ExtractVector128(sum, 0);
var sumY = Avx2.ExtractVector128(sum, 1);
var sumZ = Sse2.Add(sumX, sumY);
result += (sumZ.GetElement(0) + sumZ.GetElement(1));
}
if (offset < ((nuint)(uint)length)) {
lengthToExamine = (((nuint)(uint)length) - offset);
goto SequentialScan;
}
}
else if (Sse2.IsSupported) {
lengthToExamine = GetByteVector128SpanLength(offset, length);
var searchMask = Vector128.Create(value);
if (63 < lengthToExamine) {
var sum = Vector128<long>.Zero;
do {
var accumulator0 = Vector128<byte>.Zero;
var accumulator1 = Vector128<byte>.Zero;
var accumulator2 = Vector128<byte>.Zero;
var accumulator3 = Vector128<byte>.Zero;
var loopIndex = ((nuint)0);
var loopLimit = Math.Min(255, (lengthToExamine / 64));
do {
accumulator0 = Sse2.Subtract(accumulator0, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, offset)));
accumulator1 = Sse2.Subtract(accumulator1, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 16))));
accumulator2 = Sse2.Subtract(accumulator2, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 32))));
accumulator3 = Sse2.Subtract(accumulator3, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 48))));
loopIndex++;
offset += 64;
} while (loopIndex < loopLimit);
lengthToExamine -= (64 * loopLimit);
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator0.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator1.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator2.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator3.AsByte(), Vector128<byte>.Zero).AsInt64());
} while (63 < lengthToExamine);
result += (sum.GetElement(0) + sum.GetElement(1));
}
if (15 < lengthToExamine) {
var sum = Vector128<long>.Zero;
do {
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(Sse2.Subtract(Vector128<byte>.Zero, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, offset))).AsByte(), Vector128<byte>.Zero).AsInt64());
lengthToExamine -= 16;
offset += 16;
} while (15 < lengthToExamine);
result += (sum.GetElement(0) + sum.GetElement(1));
}
if (offset < ((nuint)(uint)length)) {
lengthToExamine = (((nuint)(uint)length) - offset);
goto SequentialScan;
}
}
}
return ((int)result);
}
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private static unsafe int OccurrencesOf(ref char searchSpace, char value, int length) {
var lengthToExamine = ((nint)length);
var offset = ((nint)0);
var result = 0L;
if (0 != ((int)Unsafe.AsPointer(ref searchSpace) & 1)) { }
else if (Sse2.IsSupported || Avx2.IsSupported) {
if (15 < length) {
lengthToExamine = UnalignedCountVector128(ref searchSpace);
}
}
SequentialScan:
while (3 < lengthToExamine) {
ref char current = ref Unsafe.Add(ref searchSpace, offset);
if (value == current) {
++result;
}
if (value == Unsafe.Add(ref current, 1)) {
++result;
}
if (value == Unsafe.Add(ref current, 2)) {
++result;
}
if (value == Unsafe.Add(ref current, 3)) {
++result;
}
lengthToExamine -= 4;
offset += 4;
}
while (0 < lengthToExamine) {
if (value == Unsafe.Add(ref searchSpace, offset)) {
++result;
}
--lengthToExamine;
++offset;
}
if (offset < length) {
if (Avx2.IsSupported) {
if (0 != (((nint)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, offset))) & (Vector256<byte>.Count - 1))) {
var sum = Sse2.SumAbsoluteDifferences(Sse2.Subtract(Vector128<ushort>.Zero, Sse2.CompareEqual(Vector128.Create(value), LoadVector128(ref searchSpace, offset))).AsByte(), Vector128<byte>.Zero).AsInt64();
offset += 8;
result += (sum.GetElement(0) + sum.GetElement(1));
}
lengthToExamine = GetCharVector256SpanLength(offset, length);
var searchMask = Vector256.Create(value);
if (63 < lengthToExamine) {
var sum = Vector256<long>.Zero;
do {
var accumulator0 = Vector256<ushort>.Zero;
var accumulator1 = Vector256<ushort>.Zero;
var accumulator2 = Vector256<ushort>.Zero;
var accumulator3 = Vector256<ushort>.Zero;
var loopIndex = 0;
var loopLimit = Math.Min(255, (lengthToExamine / 64));
do {
accumulator0 = Avx2.Subtract(accumulator0, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, offset)));
accumulator1 = Avx2.Subtract(accumulator1, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 16))));
accumulator2 = Avx2.Subtract(accumulator2, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 32))));
accumulator3 = Avx2.Subtract(accumulator3, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 48))));
loopIndex++;
offset += 64;
} while (loopIndex < loopLimit);
lengthToExamine -= (64 * loopLimit);
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator0.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator1.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator2.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator3.AsByte(), Vector256<byte>.Zero).AsInt64());
} while (63 < lengthToExamine);
var sumX = Avx2.ExtractVector128(sum, 0);
var sumY = Avx2.ExtractVector128(sum, 1);
var sumZ = Sse2.Add(sumX, sumY);
result += (sumZ.GetElement(0) + sumZ.GetElement(1));
}
if (15 < lengthToExamine) {
var sum = Vector256<long>.Zero;
do {
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(Avx2.Subtract(Vector256<ushort>.Zero, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, offset))).AsByte(), Vector256<byte>.Zero).AsInt64());
lengthToExamine -= 16;
offset += 16;
} while (15 < lengthToExamine);
var sumX = Avx2.ExtractVector128(sum, 0);
var sumY = Avx2.ExtractVector128(sum, 1);
var sumZ = Sse2.Add(sumX, sumY);
result += (sumZ.GetElement(0) + sumZ.GetElement(1));
}
if (offset < length) {
lengthToExamine = (length - offset);
goto SequentialScan;
}
}
else if (Sse2.IsSupported) {
lengthToExamine = GetCharVector128SpanLength(offset, length);
var searchMask = Vector128.Create(value);
if (31 < lengthToExamine) {
var sum = Vector128<long>.Zero;
do {
var accumulator0 = Vector128<ushort>.Zero;
var accumulator1 = Vector128<ushort>.Zero;
var accumulator2 = Vector128<ushort>.Zero;
var accumulator3 = Vector128<ushort>.Zero;
var loopIndex = 0;
var loopLimit = Math.Min(255, (lengthToExamine / 32));
do {
accumulator0 = Sse2.Subtract(accumulator0, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, offset)));
accumulator1 = Sse2.Subtract(accumulator1, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 8))));
accumulator2 = Sse2.Subtract(accumulator2, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 16))));
accumulator3 = Sse2.Subtract(accumulator3, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 24))));
loopIndex++;
offset += 32;
} while (loopIndex < loopLimit);
lengthToExamine -= (32 * loopLimit);
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator0.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator1.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator2.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator3.AsByte(), Vector128<byte>.Zero).AsInt64());
} while (31 < lengthToExamine);
result += (sum.GetElement(0) + sum.GetElement(1));
}
if (7 < lengthToExamine) {
var sum = Vector128<long>.Zero;
do {
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(Sse2.Subtract(Vector128<ushort>.Zero, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, offset))).AsByte(), Vector128<byte>.Zero).AsInt64());
lengthToExamine -= 8;
offset += 8;
} while (7 < lengthToExamine);
result += (sum.GetElement(0) + sum.GetElement(1));
}
if (offset < length) {
lengthToExamine = (length - offset);
goto SequentialScan;
}
}
}
return ((int)result);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nuint UnalignedCountVector128(ref byte searchSpace) {
nint unaligned = ((nint)Unsafe.AsPointer(ref searchSpace) & (Vector128<byte>.Count - 1));
return ((nuint)(uint)((Vector128<byte>.Count - unaligned) & (Vector128<byte>.Count - 1)));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nint UnalignedCountVector128(ref char searchSpace) {
const int ElementsPerByte = (sizeof(ushort) / sizeof(byte));
return ((nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector128<ushort>.Count - 1));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int OccurrencesOf(this ReadOnlySpan<byte> span, byte value) =>
OccurrencesOf(
length: span.Length,
searchSpace: ref MemoryMarshal.GetReference(span),
value: value
);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int OccurrencesOf(this Span<byte> span, byte value) =>
((ReadOnlySpan<byte>)span).OccurrencesOf(value);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int OccurrencesOf(this ReadOnlySpan<char> span, char value) =>
OccurrencesOf(
length: span.Length,
searchSpace: ref MemoryMarshal.GetReference(span),
value: value
);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int OccurrencesOf(this Span<char> span, char value) =>
((ReadOnlySpan<char>)span).OccurrencesOf(value);
}
给定以下输入字节:
var vBytes = new Vector<byte>(new byte[] {72, 101, 55, 08, 108, 111, 55, 87, 111, 114, 108, 55, 100, 55, 55, 20});
和给定的掩码:
var mask = new Vector<byte>(55);
如何在输入数组中找到字节数 55
?
我已经尝试 xoring vBytes
与 mask
:
var xored = Vector.Xor(mask, vBytes);
给出:
<127, 82, 0, 91, 91, 88, 0, 96, 88, 69, 91, 0, 83, 0, 0, 35>
但不知道如何从中得到计数。
为简单起见,我们假设输入字节长度始终等于 Vector<byte>.Count
.
感谢 Marc Gravell 的提示,以下工作:
var areEqual = Vector.Equals(vBytes, mask);
var negation = Vector.Negate(areEqual);
var count = Vector.Dot(negation, Vector<byte>.One);
Marc 有一个 blog post 关于这个主题的更多信息。
(以下想法的 AVX2 C 内在函数实现,以防具体示例有所帮助:How to count character occurrences using SIMD)
在 asm 中,您希望 pcmpeqb
生成 0 或 0xFF 的向量。被视为有符号整数,即 0/-1。
然后 使用 compare-result 作为整数值 和 psubb
将 0 / 1 添加到该元素的计数器。 (减 -1 = 加 +1)
这可能会在 256 次迭代后溢出,所以在此之前的某个时间,使用 psadbw
对 _mm_setzero_si128()
将那些无符号字节(没有溢出)水平求和为 64 位整数(一个 64 位整数每组 8 个字节)。然后paddq
累加64位总和
可以使用嵌套循环或仅在常规展开循环的末尾完成溢出前的累积。 psadbw
很快(因为它是视频编码的关键组成部分 motion-search),所以每 4 次比较甚至每 1 次累积并跳过 psubb
.[= 也不错35=]
有关 x86 的更多详细信息,请参阅 Agner Fog's optimization guides。根据他的指令表,psadbw xmm
/ vpsadbw ymm
运行s 在 Skylake 上每个时钟周期 1 个向量,有 3 个周期延迟。 (只有 1 uop 的 front-end 带宽。)上面提到的所有指令在多个端口上也是 single-uop 和 运行(因此吞吐量不一定相互冲突) .他们的 128 位版本只需要 SSE2。
如果你真的一次只有一个向量要计算,并且没有循环内存,那么可能 pcmpeqb
/ psadbw
/ pshufd
(将高半部分复制到low) / paddd
/ movd eax, xmm0
给你 255 * 整数寄存器中的匹配数。一个额外的向量指令(例如从零减去,或与 1 进行与运算,或 pabsb
(绝对值)将删除 x255 比例因子。
IDK 如何在 C# SIMD 中编写,但您肯定不想要一个dot-product!解包并转换为 FP 将比上面慢 4 倍,只是因为 fixed-width 向量比浮点数多 4 倍的字节,而 dpps
(_mm_dp_ps
) 是 不快。 4 微指令,Skylake 上每 1.5 个周期吞吐量一个。如果你做必须horizontal-sum除无符号字节以外的东西,请参阅Fastest way to do horizontal SSE vector sum (or other reduction)(我的回答也包括整数)。
或者如果 Vector.Dot
使用 pmaddubsw
/ pmaddwd
作为整数向量,那么这可能不会那么糟糕,但是对每个向量做 multi-step 水平求和与 psadbw
相比,比较结果很糟糕,尤其是与偶尔只进行水平求和的字节累加器相比。
或者,如果 C# 优化了与常量向量 1
的任何实际乘法。无论如何,这个答案的第一部分是您希望 CPU 成为 运行ning 的代码。无论您喜欢什么,都可以使用任何源代码实现它。
这里是 C:
中的快速 SSE2 实现size_t memcount_sse2(const void *s, int c, size_t n) {
__m128i cv = _mm_set1_epi8(c), sum = _mm_setzero_si128(), acr0,acr1,acr2,acr3;
const char *p,*pe;
for(p = s; p != (char *)s+(n- (n % (252*16)));) {
for(acr0 = acr1 = acr2 = acr3 = _mm_setzero_si128(),pe = p+252*16; p != pe; p += 64) {
acr0 = _mm_add_epi8(acr0, _mm_cmpeq_epi8(cv, _mm_loadu_si128((const __m128i *)p)));
acr1 = _mm_add_epi8(acr1, _mm_cmpeq_epi8(cv, _mm_loadu_si128((const __m128i *)(p+16))));
acr2 = _mm_add_epi8(acr2, _mm_cmpeq_epi8(cv, _mm_loadu_si128((const __m128i *)(p+32))));
acr3 = _mm_add_epi8(acr3, _mm_cmpeq_epi8(cv, _mm_loadu_si128((const __m128i *)(p+48))));
__builtin_prefetch(p+1024);
}
sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_sub_epi8(_mm_setzero_si128(), acr0), _mm_setzero_si128()));
sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_sub_epi8(_mm_setzero_si128(), acr1), _mm_setzero_si128()));
sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_sub_epi8(_mm_setzero_si128(), acr2), _mm_setzero_si128()));
sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_sub_epi8(_mm_setzero_si128(), acr3), _mm_setzero_si128()));
}
// may require SSE4, rewrite this part for actual SSE2.
size_t count = _mm_extract_epi64(sum, 0) + _mm_extract_epi64(sum, 1);
// scalar cleanup. Could be optimized.
while(p != (char *)s + n) count += *p++ == c;
return count;
}
并查看:https://gist.github.com/powturbo 和 avx2 实现。
我知道我来晚了,但到目前为止 none 这里的答案实际上提供了完整的解决方案。这是我最好的尝试之一,源自 this Gist and the DotNet source code。所有功劳归于 DotNet 团队和社区成员(尤其是 @Peter Cordes)。
用法:
var bytes = Encoding.ASCII.GetBytes("The quick brown fox jumps over the lazy dog.");
var byteCount = bytes.OccurrencesOf(32);
var chars = "The quick brown fox jumps over the lazy dog.";
var charCount = chars.OccurrencesOf(' ');
代码:
public static class VectorExtensions
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint GetByteVector128SpanLength(nuint offset, int length) =>
((nuint)(uint)((length - (int)offset) & ~(Vector128<byte>.Count - 1)));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nuint GetByteVector256SpanLength(nuint offset, int length) =>
((nuint)(uint)((length - (int)offset) & ~(Vector256<byte>.Count - 1)));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nint GetCharVector128SpanLength(nint offset, nint length) =>
((length - offset) & ~(Vector128<ushort>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nint GetCharVector256SpanLength(nint offset, nint length) =>
((length - offset) & ~(Vector256<ushort>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> LoadVector128(ref byte start, nuint offset) =>
Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> LoadVector256(ref byte start, nuint offset) =>
Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ushort> LoadVector128(ref char start, nint offset) =>
Unsafe.ReadUnaligned<Vector128<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, offset)));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<ushort> LoadVector256(ref char start, nint offset) =>
Unsafe.ReadUnaligned<Vector256<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, offset)));
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private static unsafe int OccurrencesOf(ref byte searchSpace, byte value, int length) {
var lengthToExamine = ((nuint)length);
var offset = ((nuint)0);
var result = 0L;
if (Sse2.IsSupported || Avx2.IsSupported) {
if (31 < length) {
lengthToExamine = UnalignedCountVector128(ref searchSpace);
}
}
SequentialScan:
while (7 < lengthToExamine) {
ref byte current = ref Unsafe.AddByteOffset(ref searchSpace, offset);
if (value == current) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 1)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 2)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 3)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 4)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 5)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 6)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 7)) {
++result;
}
lengthToExamine -= 8;
offset += 8;
}
while (3 < lengthToExamine) {
ref byte current = ref Unsafe.AddByteOffset(ref searchSpace, offset);
if (value == current) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 1)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 2)) {
++result;
}
if (value == Unsafe.AddByteOffset(ref current, 3)) {
++result;
}
lengthToExamine -= 4;
offset += 4;
}
while (0 < lengthToExamine) {
if (value == Unsafe.AddByteOffset(ref searchSpace, offset)) {
++result;
}
--lengthToExamine;
++offset;
}
if (offset < ((nuint)(uint)length)) {
if (Avx2.IsSupported) {
if (0 != (((nuint)(uint)Unsafe.AsPointer(ref searchSpace) + offset) & (nuint)(Vector256<byte>.Count - 1))) {
var sum = Sse2.SumAbsoluteDifferences(Sse2.Subtract(Vector128<byte>.Zero, Sse2.CompareEqual(Vector128.Create(value), LoadVector128(ref searchSpace, offset))).AsByte(), Vector128<byte>.Zero).AsInt64();
offset += 16;
result += (sum.GetElement(0) + sum.GetElement(1));
}
lengthToExamine = GetByteVector256SpanLength(offset, length);
var searchMask = Vector256.Create(value);
if (127 < lengthToExamine) {
var sum = Vector256<long>.Zero;
do {
var accumulator0 = Vector256<byte>.Zero;
var accumulator1 = Vector256<byte>.Zero;
var accumulator2 = Vector256<byte>.Zero;
var accumulator3 = Vector256<byte>.Zero;
var loopIndex = ((nuint)0);
var loopLimit = Math.Min(255, (lengthToExamine / 128));
do {
accumulator0 = Avx2.Subtract(accumulator0, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, offset)));
accumulator1 = Avx2.Subtract(accumulator1, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 32))));
accumulator2 = Avx2.Subtract(accumulator2, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 64))));
accumulator3 = Avx2.Subtract(accumulator3, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 96))));
loopIndex++;
offset += 128;
} while (loopIndex < loopLimit);
lengthToExamine -= (128 * loopLimit);
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator0.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator1.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator2.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator3.AsByte(), Vector256<byte>.Zero).AsInt64());
} while (127 < lengthToExamine);
var sumX = Avx2.ExtractVector128(sum, 0);
var sumY = Avx2.ExtractVector128(sum, 1);
var sumZ = Sse2.Add(sumX, sumY);
result += (sumZ.GetElement(0) + sumZ.GetElement(1));
}
if (31 < lengthToExamine) {
var sum = Vector256<long>.Zero;
do {
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(Avx2.Subtract(Vector256<byte>.Zero, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, offset))).AsByte(), Vector256<byte>.Zero).AsInt64());
lengthToExamine -= 32;
offset += 32;
} while (31 < lengthToExamine);
var sumX = Avx2.ExtractVector128(sum, 0);
var sumY = Avx2.ExtractVector128(sum, 1);
var sumZ = Sse2.Add(sumX, sumY);
result += (sumZ.GetElement(0) + sumZ.GetElement(1));
}
if (offset < ((nuint)(uint)length)) {
lengthToExamine = (((nuint)(uint)length) - offset);
goto SequentialScan;
}
}
else if (Sse2.IsSupported) {
lengthToExamine = GetByteVector128SpanLength(offset, length);
var searchMask = Vector128.Create(value);
if (63 < lengthToExamine) {
var sum = Vector128<long>.Zero;
do {
var accumulator0 = Vector128<byte>.Zero;
var accumulator1 = Vector128<byte>.Zero;
var accumulator2 = Vector128<byte>.Zero;
var accumulator3 = Vector128<byte>.Zero;
var loopIndex = ((nuint)0);
var loopLimit = Math.Min(255, (lengthToExamine / 64));
do {
accumulator0 = Sse2.Subtract(accumulator0, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, offset)));
accumulator1 = Sse2.Subtract(accumulator1, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 16))));
accumulator2 = Sse2.Subtract(accumulator2, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 32))));
accumulator3 = Sse2.Subtract(accumulator3, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 48))));
loopIndex++;
offset += 64;
} while (loopIndex < loopLimit);
lengthToExamine -= (64 * loopLimit);
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator0.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator1.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator2.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator3.AsByte(), Vector128<byte>.Zero).AsInt64());
} while (63 < lengthToExamine);
result += (sum.GetElement(0) + sum.GetElement(1));
}
if (15 < lengthToExamine) {
var sum = Vector128<long>.Zero;
do {
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(Sse2.Subtract(Vector128<byte>.Zero, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, offset))).AsByte(), Vector128<byte>.Zero).AsInt64());
lengthToExamine -= 16;
offset += 16;
} while (15 < lengthToExamine);
result += (sum.GetElement(0) + sum.GetElement(1));
}
if (offset < ((nuint)(uint)length)) {
lengthToExamine = (((nuint)(uint)length) - offset);
goto SequentialScan;
}
}
}
return ((int)result);
}
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private static unsafe int OccurrencesOf(ref char searchSpace, char value, int length) {
var lengthToExamine = ((nint)length);
var offset = ((nint)0);
var result = 0L;
if (0 != ((int)Unsafe.AsPointer(ref searchSpace) & 1)) { }
else if (Sse2.IsSupported || Avx2.IsSupported) {
if (15 < length) {
lengthToExamine = UnalignedCountVector128(ref searchSpace);
}
}
SequentialScan:
while (3 < lengthToExamine) {
ref char current = ref Unsafe.Add(ref searchSpace, offset);
if (value == current) {
++result;
}
if (value == Unsafe.Add(ref current, 1)) {
++result;
}
if (value == Unsafe.Add(ref current, 2)) {
++result;
}
if (value == Unsafe.Add(ref current, 3)) {
++result;
}
lengthToExamine -= 4;
offset += 4;
}
while (0 < lengthToExamine) {
if (value == Unsafe.Add(ref searchSpace, offset)) {
++result;
}
--lengthToExamine;
++offset;
}
if (offset < length) {
if (Avx2.IsSupported) {
if (0 != (((nint)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, offset))) & (Vector256<byte>.Count - 1))) {
var sum = Sse2.SumAbsoluteDifferences(Sse2.Subtract(Vector128<ushort>.Zero, Sse2.CompareEqual(Vector128.Create(value), LoadVector128(ref searchSpace, offset))).AsByte(), Vector128<byte>.Zero).AsInt64();
offset += 8;
result += (sum.GetElement(0) + sum.GetElement(1));
}
lengthToExamine = GetCharVector256SpanLength(offset, length);
var searchMask = Vector256.Create(value);
if (63 < lengthToExamine) {
var sum = Vector256<long>.Zero;
do {
var accumulator0 = Vector256<ushort>.Zero;
var accumulator1 = Vector256<ushort>.Zero;
var accumulator2 = Vector256<ushort>.Zero;
var accumulator3 = Vector256<ushort>.Zero;
var loopIndex = 0;
var loopLimit = Math.Min(255, (lengthToExamine / 64));
do {
accumulator0 = Avx2.Subtract(accumulator0, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, offset)));
accumulator1 = Avx2.Subtract(accumulator1, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 16))));
accumulator2 = Avx2.Subtract(accumulator2, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 32))));
accumulator3 = Avx2.Subtract(accumulator3, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, (offset + 48))));
loopIndex++;
offset += 64;
} while (loopIndex < loopLimit);
lengthToExamine -= (64 * loopLimit);
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator0.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator1.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator2.AsByte(), Vector256<byte>.Zero).AsInt64());
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(accumulator3.AsByte(), Vector256<byte>.Zero).AsInt64());
} while (63 < lengthToExamine);
var sumX = Avx2.ExtractVector128(sum, 0);
var sumY = Avx2.ExtractVector128(sum, 1);
var sumZ = Sse2.Add(sumX, sumY);
result += (sumZ.GetElement(0) + sumZ.GetElement(1));
}
if (15 < lengthToExamine) {
var sum = Vector256<long>.Zero;
do {
sum = Avx2.Add(sum, Avx2.SumAbsoluteDifferences(Avx2.Subtract(Vector256<ushort>.Zero, Avx2.CompareEqual(searchMask, LoadVector256(ref searchSpace, offset))).AsByte(), Vector256<byte>.Zero).AsInt64());
lengthToExamine -= 16;
offset += 16;
} while (15 < lengthToExamine);
var sumX = Avx2.ExtractVector128(sum, 0);
var sumY = Avx2.ExtractVector128(sum, 1);
var sumZ = Sse2.Add(sumX, sumY);
result += (sumZ.GetElement(0) + sumZ.GetElement(1));
}
if (offset < length) {
lengthToExamine = (length - offset);
goto SequentialScan;
}
}
else if (Sse2.IsSupported) {
lengthToExamine = GetCharVector128SpanLength(offset, length);
var searchMask = Vector128.Create(value);
if (31 < lengthToExamine) {
var sum = Vector128<long>.Zero;
do {
var accumulator0 = Vector128<ushort>.Zero;
var accumulator1 = Vector128<ushort>.Zero;
var accumulator2 = Vector128<ushort>.Zero;
var accumulator3 = Vector128<ushort>.Zero;
var loopIndex = 0;
var loopLimit = Math.Min(255, (lengthToExamine / 32));
do {
accumulator0 = Sse2.Subtract(accumulator0, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, offset)));
accumulator1 = Sse2.Subtract(accumulator1, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 8))));
accumulator2 = Sse2.Subtract(accumulator2, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 16))));
accumulator3 = Sse2.Subtract(accumulator3, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, (offset + 24))));
loopIndex++;
offset += 32;
} while (loopIndex < loopLimit);
lengthToExamine -= (32 * loopLimit);
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator0.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator1.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator2.AsByte(), Vector128<byte>.Zero).AsInt64());
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(accumulator3.AsByte(), Vector128<byte>.Zero).AsInt64());
} while (31 < lengthToExamine);
result += (sum.GetElement(0) + sum.GetElement(1));
}
if (7 < lengthToExamine) {
var sum = Vector128<long>.Zero;
do {
sum = Sse2.Add(sum, Sse2.SumAbsoluteDifferences(Sse2.Subtract(Vector128<ushort>.Zero, Sse2.CompareEqual(searchMask, LoadVector128(ref searchSpace, offset))).AsByte(), Vector128<byte>.Zero).AsInt64());
lengthToExamine -= 8;
offset += 8;
} while (7 < lengthToExamine);
result += (sum.GetElement(0) + sum.GetElement(1));
}
if (offset < length) {
lengthToExamine = (length - offset);
goto SequentialScan;
}
}
}
return ((int)result);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nuint UnalignedCountVector128(ref byte searchSpace) {
nint unaligned = ((nint)Unsafe.AsPointer(ref searchSpace) & (Vector128<byte>.Count - 1));
return ((nuint)(uint)((Vector128<byte>.Count - unaligned) & (Vector128<byte>.Count - 1)));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nint UnalignedCountVector128(ref char searchSpace) {
const int ElementsPerByte = (sizeof(ushort) / sizeof(byte));
return ((nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector128<ushort>.Count - 1));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int OccurrencesOf(this ReadOnlySpan<byte> span, byte value) =>
OccurrencesOf(
length: span.Length,
searchSpace: ref MemoryMarshal.GetReference(span),
value: value
);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int OccurrencesOf(this Span<byte> span, byte value) =>
((ReadOnlySpan<byte>)span).OccurrencesOf(value);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int OccurrencesOf(this ReadOnlySpan<char> span, char value) =>
OccurrencesOf(
length: span.Length,
searchSpace: ref MemoryMarshal.GetReference(span),
value: value
);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int OccurrencesOf(this Span<char> span, char value) =>
((ReadOnlySpan<char>)span).OccurrencesOf(value);
}