位分片:找到最小值
bit slicing: finding minimum value
简短版
我需要找到编码为位片的 64 个 uint8_t
个变量的最小值。
即变量的每一位都被编码成八个独立的 uint64_t
:
//Normal layout:
uint8_t values[64]; // This is what you normally use.
// Finding minimum would be a simple
// matter of a for loop
/***********************/
// BITSLICE layout:
uint64_t slices[8]; // This is what I have, due to performance
// reasons in other parts of the code (not shown here)
slice[0]; //LSB: Least signignificant bit (for all 64 values)
slice[7]; //MSB: Most significant bit (for all 64 values)
现在,我如何找出这些的最小值?(我不关心它的位置,只关心它的值)
更多上下文:
实际上,出于性能原因,我在一个已经使用位切片的算法中有一个更长的值数组(超过 64 个)。
所以我得到的实际上更像是(上面的问题被简化了):
uint64_t slices[8][100];
所以我真正需要的是所有 100*64 值中的最小值。但我认为这可以通过将答案应用于上面的简化问题,在常规 for 循环中完成。
编辑:显然我的问题没有我想的那么清楚所以已经更新了
使用联合:
#include <stdio.h>
#include <inttypes.h>
int main()
{
union
{
uint64_t slices[8];
uint8_t bits[64];
} a_union;
int i;
uint8_t min;
for(i = 0 ; i < sizeof(a_union.slices)/sizeof(a_union.slices[0]) ; ++i)
{
a_union.slices[i] = (i+1) * 0x1122334455667788;
printf("a_union.slices[%d] = 0x%"PRIX64"\n", i, a_union.slices[i]);
}
for(i = 0, min = 255 ; i < sizeof(a_union.bits) ; ++i)
if(a_union.bits[i] < min)
min = a_union.bits[i];
printf("min = %u (0x%X)\n", min, min);
}
编辑
更好 - 使用 Duff 的设备。
#include <stdio.h>
#include <inttypes.h>
#include <limits.h>
#include <stdlib.h>
uint8_t min_in_mem_block(uint8_t *p, size_t len)
{
/* Find the minimum byte value in the block of memory of length len pointed to by p */
size_t n = (len + 7) / 8;
uint8_t min = UINT8_MAX;
switch (len % 8)
{
case 0: do { min = *p < min ? *p : min; p++;
case 7: min = *p < min ? *p : min; p++;
case 6: min = *p < min ? *p : min; p++;
case 5: min = *p < min ? *p : min; p++;
case 4: min = *p < min ? *p : min; p++;
case 3: min = *p < min ? *p : min; p++;
case 2: min = *p < min ? *p : min; p++;
case 1: min = *p < min ? *p : min; p++;
} while (--n > 0);
}
return min;
}
int main()
{
uint64_t block[8];
for(size_t i = 0 ; i < sizeof(block)/sizeof(block[0]) ; ++i)
{
block[i] = ((i+1) * 0x1122334455667788u) | 0x0101010101010101;
printf("block[%zu] = 0x%"PRIX64"\n", i, block[i]);
}
uint8_t min = min_in_mem_block((uint8_t *)block, sizeof(block));
printf("min = %" PRIX8 "\n", min);
}
我至少能想到两种方法。最简单的方法就是对其进行暴力破解:通过适当的按位算法一次重构 64 个整数中的每一个,并跟踪最小结果。沿着这些线的东西:
uint8_t min = 0xff;
// iterate over the collection of values
for (uint64_t which = 1; which; which <<= 1) {
// reconstitute one value in 'test'
uint8_t test = 0;
for (int bit = 0; bit < 8; bit++) {
// verify this decoding -- your bit order may be different:
test += (!!(slices[bit] & which)) << bit;
}
// track the minimum
if (test < min) {
min = test;
}
}
另一方面,通过只扫描一次 slices
并直接累加最小值应该也可以更快地完成。我没有时间测试这个,但它应该传达了总体思路:
uint8_t min = 0xff;
uint64_t mask = ~(uint64_t)0; // a mask of candidate positions; all bits initially set
for (int i = 7; i >= 0; i--) { // assumes slice 7 is most significant
// which of the remaining candidates have this bit set:
uint64_t bits_set = slice[i] & mask;
// If at least one of the remaining candidates does not have this bit set
if (bits_set != mask) {
min ^= (1 << i); // turn off this bit in the result
mask ^= bits_set; // remove the candidates that do have this bit set
}
}
后者类似于基数排序。
这里有一些简单而高效的函数,用于计算编码为 8 uint64_t
包的 64 字节值集的最小值和最大值,每个包存储 64 个值中的每个值的 1 位:
#include <stdint.h>
uint8_t maxslice(const uint64_t s[8]) {
uint8_t max = 0, bit = 0x80;
uint64_t mask = ~0ULL;
for (int i = 8; i-- > 0; bit >>= 1) {
uint64_t x = s[i] & mask;
if (x) {
max |= bit;
mask &= x;
}
}
return max;
}
uint8_t minslice(const uint64_t s[8]) {
uint8_t min = 0, bit = 0x80;
uint64_t mask = ~0ULL;
for (int i = 8; i-- > 0; bit >>= 1) {
uint64_t x = ~s[i] & mask;
if (x) {
min |= bit;
mask &= x;
}
}
return ~min;
}
正如可以在 Godbolt's Compiler Explorer 上验证的那样,clang
为两个函数生成了无分支代码。
为了计算以这种方式组织的较大值集的最小值 uint64_t slices[8][100]
的扩展目标,您可以简单地在数组上迭代此代码并逐步计算最小值。如果已经找到 0
的绝对最小值,则可能值得在此循环的每一步进行测试。棘手的部分是数组的组织方式:
uint64_t slices[8][100]
定义了 8 个数组 100 uint64_t
的数组。换句话说,内存中的布局是 6400 位低位,然后是 6400 位 2 位,...,最后是 6400 位权重 128.
uint8_t minarray(const uint64_t s[8][100]) {
uint8_t all_max = 0;
for (int j = 0; j < 100; j++) {
uint8_t max = 0, bit = 0x80;
uint64_t mask = ~0ULL;
for (int i = 8; i-- > 0; bit >>= 1) {
uint64_t x = ~s[i][j] & mask;
if (x) {
max |= bit;
mask &= x;
}
}
if (all_max < max) {
all_max = max;
if (all_max == 255)
break;
}
}
return ~all_max;
}
为了向量化此代码,我们可以转置循环:使用 x
和 mask
作为 100 uint64_t
的数组进行计算将产生相同的结果,但会让编译器向量化一些内部循环:
uint8_t minarray1(const uint64_t s[8][100]) {
uint8_t max = 0, bit = 0x80;
uint64_t mask[100] = {
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
};
for (int i = 8; i-- > 0; bit >>= 1) {
uint64_t x[100];
uint64_t xall = 0;
for (int j = 0; j < 100; j++) {
x[j] = ~s[i][j] & mask[j];
xall |= x[j];
}
if (xall) {
max |= bit;
for (int j = 0; j < 100; j++) {
mask[j] &= x[j];
}
}
}
return ~max;
}
再次生成 clang unrolled vectorized code。基准测试会告诉我们这种方法是否比以前的方法有更好的性能。
简短版
我需要找到编码为位片的 64 个 uint8_t
个变量的最小值。
即变量的每一位都被编码成八个独立的 uint64_t
:
//Normal layout:
uint8_t values[64]; // This is what you normally use.
// Finding minimum would be a simple
// matter of a for loop
/***********************/
// BITSLICE layout:
uint64_t slices[8]; // This is what I have, due to performance
// reasons in other parts of the code (not shown here)
slice[0]; //LSB: Least signignificant bit (for all 64 values)
slice[7]; //MSB: Most significant bit (for all 64 values)
现在,我如何找出这些的最小值?(我不关心它的位置,只关心它的值)
更多上下文:
实际上,出于性能原因,我在一个已经使用位切片的算法中有一个更长的值数组(超过 64 个)。
所以我得到的实际上更像是(上面的问题被简化了):
uint64_t slices[8][100];
所以我真正需要的是所有 100*64 值中的最小值。但我认为这可以通过将答案应用于上面的简化问题,在常规 for 循环中完成。
编辑:显然我的问题没有我想的那么清楚所以已经更新了
使用联合:
#include <stdio.h>
#include <inttypes.h>
int main()
{
union
{
uint64_t slices[8];
uint8_t bits[64];
} a_union;
int i;
uint8_t min;
for(i = 0 ; i < sizeof(a_union.slices)/sizeof(a_union.slices[0]) ; ++i)
{
a_union.slices[i] = (i+1) * 0x1122334455667788;
printf("a_union.slices[%d] = 0x%"PRIX64"\n", i, a_union.slices[i]);
}
for(i = 0, min = 255 ; i < sizeof(a_union.bits) ; ++i)
if(a_union.bits[i] < min)
min = a_union.bits[i];
printf("min = %u (0x%X)\n", min, min);
}
编辑
更好 - 使用 Duff 的设备。
#include <stdio.h>
#include <inttypes.h>
#include <limits.h>
#include <stdlib.h>
uint8_t min_in_mem_block(uint8_t *p, size_t len)
{
/* Find the minimum byte value in the block of memory of length len pointed to by p */
size_t n = (len + 7) / 8;
uint8_t min = UINT8_MAX;
switch (len % 8)
{
case 0: do { min = *p < min ? *p : min; p++;
case 7: min = *p < min ? *p : min; p++;
case 6: min = *p < min ? *p : min; p++;
case 5: min = *p < min ? *p : min; p++;
case 4: min = *p < min ? *p : min; p++;
case 3: min = *p < min ? *p : min; p++;
case 2: min = *p < min ? *p : min; p++;
case 1: min = *p < min ? *p : min; p++;
} while (--n > 0);
}
return min;
}
int main()
{
uint64_t block[8];
for(size_t i = 0 ; i < sizeof(block)/sizeof(block[0]) ; ++i)
{
block[i] = ((i+1) * 0x1122334455667788u) | 0x0101010101010101;
printf("block[%zu] = 0x%"PRIX64"\n", i, block[i]);
}
uint8_t min = min_in_mem_block((uint8_t *)block, sizeof(block));
printf("min = %" PRIX8 "\n", min);
}
我至少能想到两种方法。最简单的方法就是对其进行暴力破解:通过适当的按位算法一次重构 64 个整数中的每一个,并跟踪最小结果。沿着这些线的东西:
uint8_t min = 0xff;
// iterate over the collection of values
for (uint64_t which = 1; which; which <<= 1) {
// reconstitute one value in 'test'
uint8_t test = 0;
for (int bit = 0; bit < 8; bit++) {
// verify this decoding -- your bit order may be different:
test += (!!(slices[bit] & which)) << bit;
}
// track the minimum
if (test < min) {
min = test;
}
}
另一方面,通过只扫描一次 slices
并直接累加最小值应该也可以更快地完成。我没有时间测试这个,但它应该传达了总体思路:
uint8_t min = 0xff;
uint64_t mask = ~(uint64_t)0; // a mask of candidate positions; all bits initially set
for (int i = 7; i >= 0; i--) { // assumes slice 7 is most significant
// which of the remaining candidates have this bit set:
uint64_t bits_set = slice[i] & mask;
// If at least one of the remaining candidates does not have this bit set
if (bits_set != mask) {
min ^= (1 << i); // turn off this bit in the result
mask ^= bits_set; // remove the candidates that do have this bit set
}
}
后者类似于基数排序。
这里有一些简单而高效的函数,用于计算编码为 8 uint64_t
包的 64 字节值集的最小值和最大值,每个包存储 64 个值中的每个值的 1 位:
#include <stdint.h>
uint8_t maxslice(const uint64_t s[8]) {
uint8_t max = 0, bit = 0x80;
uint64_t mask = ~0ULL;
for (int i = 8; i-- > 0; bit >>= 1) {
uint64_t x = s[i] & mask;
if (x) {
max |= bit;
mask &= x;
}
}
return max;
}
uint8_t minslice(const uint64_t s[8]) {
uint8_t min = 0, bit = 0x80;
uint64_t mask = ~0ULL;
for (int i = 8; i-- > 0; bit >>= 1) {
uint64_t x = ~s[i] & mask;
if (x) {
min |= bit;
mask &= x;
}
}
return ~min;
}
正如可以在 Godbolt's Compiler Explorer 上验证的那样,clang
为两个函数生成了无分支代码。
为了计算以这种方式组织的较大值集的最小值 uint64_t slices[8][100]
的扩展目标,您可以简单地在数组上迭代此代码并逐步计算最小值。如果已经找到 0
的绝对最小值,则可能值得在此循环的每一步进行测试。棘手的部分是数组的组织方式:
uint64_t slices[8][100]
定义了 8 个数组 100 uint64_t
的数组。换句话说,内存中的布局是 6400 位低位,然后是 6400 位 2 位,...,最后是 6400 位权重 128.
uint8_t minarray(const uint64_t s[8][100]) {
uint8_t all_max = 0;
for (int j = 0; j < 100; j++) {
uint8_t max = 0, bit = 0x80;
uint64_t mask = ~0ULL;
for (int i = 8; i-- > 0; bit >>= 1) {
uint64_t x = ~s[i][j] & mask;
if (x) {
max |= bit;
mask &= x;
}
}
if (all_max < max) {
all_max = max;
if (all_max == 255)
break;
}
}
return ~all_max;
}
为了向量化此代码,我们可以转置循环:使用 x
和 mask
作为 100 uint64_t
的数组进行计算将产生相同的结果,但会让编译器向量化一些内部循环:
uint8_t minarray1(const uint64_t s[8][100]) {
uint8_t max = 0, bit = 0x80;
uint64_t mask[100] = {
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL,
};
for (int i = 8; i-- > 0; bit >>= 1) {
uint64_t x[100];
uint64_t xall = 0;
for (int j = 0; j < 100; j++) {
x[j] = ~s[i][j] & mask[j];
xall |= x[j];
}
if (xall) {
max |= bit;
for (int j = 0; j < 100; j++) {
mask[j] &= x[j];
}
}
}
return ~max;
}
再次生成 clang unrolled vectorized code。基准测试会告诉我们这种方法是否比以前的方法有更好的性能。