__uint128_t(128bit)如何进行位扫描?
How to bit scan forward and reverse a __uint128_t (128bit)?
我使用 DeBruijn 算法完成了位扫描 forward/reverse 64 位,但无法存档 128 位,__uint128_t。有什么解决办法吗?提前致谢!
位扫描的 FYI 代码 forward/reverse 使用 DeBruijn 算法的 64 位:
constexpr std::uint32_t
bitScanForward<std::uint64_t>(std::uint64_t n) noexcept {
constexpr std::uint32_t seq[] = {
0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63};
return seq[((n ^ (n - 1)) * 0x03f79d71b4cb0a89ULL) >> 58];
}
constexpr std::uint32_t
bitScanReverse<std::uint64_t>(std::uint64_t n) noexcept {
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
n |= n >> 32;
constexpr std::uint32_t seq[] = {
0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63};
return seq[(n * 0x03f79d71b4cb0a89ULL) >> 58];
}
可以将 64 位 BitScanReverse
适配到 128 位的情况,但这不会
非常高效,因为 128 位乘法运算相对昂贵,
正如@Marc Glisse 在他的评论中已经指出的那样。
不过,您可以使用 64 位 BitScanReverse
/BitScanForward
作为构建块
便携式 128 位 bsf
/bsr
:
#include<stdint.h>
#include<stdio.h>
int bitScanReverse(uint64_t n);
int bitScanForward(uint64_t n);
int bsr_u128 (__uint128_t u) {
uint64_t hi = u >> 64;
uint64_t lo = u;
int hi_neq_0 = (hi != 0);
uint64_t hi_or_lo = hi_neq_0 ? hi : lo;
int bsr_hi_or_lo = bitScanReverse(hi_or_lo);
return bsr_hi_or_lo + (hi_neq_0 << 6);
}
int bsf_u128 (__uint128_t u) {
uint64_t hi = u >> 64;
uint64_t lo = u;
int lo_eq_0 = (lo == 0);
uint64_t hi_or_lo = lo_eq_0 ? hi : lo;
int bsf_hi_or_lo = bitScanForward(hi_or_lo);
return bsf_hi_or_lo + (lo_eq_0 << 6);
}
int bitScanReverse(uint64_t n){
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
n |= n >> 32;
static int seq[] = {
0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63};
return seq[(n * 0x03f79d71b4cb0a89ULL) >> 58];
}
int bitScanForward(uint64_t n){
static int seq[] = {
0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63};
return seq[((n ^ (n - 1)) * 0x03f79d71b4cb0a89ULL) >> 58];
}
int main(){
__uint128_t t = 1;
__uint64_t hi, lo;
int i;
for (i=0;i<129;i++){
lo = t;
hi = t>>64;
printf("%3i %016lX %016lX bsr = %3i bsf = %3i\n",i,hi,lo,bsr_u128(t),bsf_u128(t));
t=t+t;
}
t = 1;
printf("\nThe zero input case is similar in the 64 bit and the 128 bit case:\n\n");
for (i=0;i<65;i++){
lo = t;
printf("%3i %016lX bsr = %3i bsf = %3i\n",i,lo,bitScanReverse(t),bitScanForward(t));
t=t+t;
}
return 0;
}
在 x86 上,这会产生相当高效的代码,例如 (gcc -O3 -m64 -march=nehalem):
bsf_u128:
xor eax, eax
test rdi, rdi
cmove rdi, rsi
sete al
sal eax, 6
lea rsi, [rdi-1]
xor rsi, rdi
movabs rdi, 285870213051386505
imul rsi, rdi
shr rsi, 58
add eax, DWORD PTR seq.31934[0+rsi*4]
ret
为了测试代码,在不同的位置设置了一个位。输出为:
$ ./a.exe
0 0000000000000000 0000000000000001 bsr = 0 bsf = 0
1 0000000000000000 0000000000000002 bsr = 1 bsf = 1
2 0000000000000000 0000000000000004 bsr = 2 bsf = 2
....
62 0000000000000000 4000000000000000 bsr = 62 bsf = 62
63 0000000000000000 8000000000000000 bsr = 63 bsf = 63
64 0000000000000001 0000000000000000 bsr = 64 bsf = 64
65 0000000000000002 0000000000000000 bsr = 65 bsf = 65
....
126 4000000000000000 0000000000000000 bsr = 126 bsf = 126
127 8000000000000000 0000000000000000 bsr = 127 bsf = 127
128 0000000000000000 0000000000000000 bsr = 0 bsf = 127
The zero input case is similar in the 64 bit and the 128 bit case:
0 0000000000000001 bsr = 0 bsf = 0
1 0000000000000002 bsr = 1 bsf = 1
2 0000000000000004 bsr = 2 bsf = 2
....
62 4000000000000000 bsr = 62 bsf = 62
63 8000000000000000 bsr = 63 bsf = 63
64 0000000000000000 bsr = 0 bsf = 63
高效 128 位 bsf
/bsr
的另一种解决方案是回收此问题中讨论的想法:
我使用 DeBruijn 算法完成了位扫描 forward/reverse 64 位,但无法存档 128 位,__uint128_t。有什么解决办法吗?提前致谢!
位扫描的 FYI 代码 forward/reverse 使用 DeBruijn 算法的 64 位:
constexpr std::uint32_t
bitScanForward<std::uint64_t>(std::uint64_t n) noexcept {
constexpr std::uint32_t seq[] = {
0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63};
return seq[((n ^ (n - 1)) * 0x03f79d71b4cb0a89ULL) >> 58];
}
constexpr std::uint32_t
bitScanReverse<std::uint64_t>(std::uint64_t n) noexcept {
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
n |= n >> 32;
constexpr std::uint32_t seq[] = {
0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63};
return seq[(n * 0x03f79d71b4cb0a89ULL) >> 58];
}
可以将 64 位 BitScanReverse
适配到 128 位的情况,但这不会
非常高效,因为 128 位乘法运算相对昂贵,
正如@Marc Glisse 在他的评论中已经指出的那样。
不过,您可以使用 64 位 BitScanReverse
/BitScanForward
作为构建块
便携式 128 位 bsf
/bsr
:
#include<stdint.h>
#include<stdio.h>
int bitScanReverse(uint64_t n);
int bitScanForward(uint64_t n);
int bsr_u128 (__uint128_t u) {
uint64_t hi = u >> 64;
uint64_t lo = u;
int hi_neq_0 = (hi != 0);
uint64_t hi_or_lo = hi_neq_0 ? hi : lo;
int bsr_hi_or_lo = bitScanReverse(hi_or_lo);
return bsr_hi_or_lo + (hi_neq_0 << 6);
}
int bsf_u128 (__uint128_t u) {
uint64_t hi = u >> 64;
uint64_t lo = u;
int lo_eq_0 = (lo == 0);
uint64_t hi_or_lo = lo_eq_0 ? hi : lo;
int bsf_hi_or_lo = bitScanForward(hi_or_lo);
return bsf_hi_or_lo + (lo_eq_0 << 6);
}
int bitScanReverse(uint64_t n){
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
n |= n >> 32;
static int seq[] = {
0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63};
return seq[(n * 0x03f79d71b4cb0a89ULL) >> 58];
}
int bitScanForward(uint64_t n){
static int seq[] = {
0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63};
return seq[((n ^ (n - 1)) * 0x03f79d71b4cb0a89ULL) >> 58];
}
int main(){
__uint128_t t = 1;
__uint64_t hi, lo;
int i;
for (i=0;i<129;i++){
lo = t;
hi = t>>64;
printf("%3i %016lX %016lX bsr = %3i bsf = %3i\n",i,hi,lo,bsr_u128(t),bsf_u128(t));
t=t+t;
}
t = 1;
printf("\nThe zero input case is similar in the 64 bit and the 128 bit case:\n\n");
for (i=0;i<65;i++){
lo = t;
printf("%3i %016lX bsr = %3i bsf = %3i\n",i,lo,bitScanReverse(t),bitScanForward(t));
t=t+t;
}
return 0;
}
在 x86 上,这会产生相当高效的代码,例如 (gcc -O3 -m64 -march=nehalem):
bsf_u128:
xor eax, eax
test rdi, rdi
cmove rdi, rsi
sete al
sal eax, 6
lea rsi, [rdi-1]
xor rsi, rdi
movabs rdi, 285870213051386505
imul rsi, rdi
shr rsi, 58
add eax, DWORD PTR seq.31934[0+rsi*4]
ret
为了测试代码,在不同的位置设置了一个位。输出为:
$ ./a.exe
0 0000000000000000 0000000000000001 bsr = 0 bsf = 0
1 0000000000000000 0000000000000002 bsr = 1 bsf = 1
2 0000000000000000 0000000000000004 bsr = 2 bsf = 2
....
62 0000000000000000 4000000000000000 bsr = 62 bsf = 62
63 0000000000000000 8000000000000000 bsr = 63 bsf = 63
64 0000000000000001 0000000000000000 bsr = 64 bsf = 64
65 0000000000000002 0000000000000000 bsr = 65 bsf = 65
....
126 4000000000000000 0000000000000000 bsr = 126 bsf = 126
127 8000000000000000 0000000000000000 bsr = 127 bsf = 127
128 0000000000000000 0000000000000000 bsr = 0 bsf = 127
The zero input case is similar in the 64 bit and the 128 bit case:
0 0000000000000001 bsr = 0 bsf = 0
1 0000000000000002 bsr = 1 bsf = 1
2 0000000000000004 bsr = 2 bsf = 2
....
62 4000000000000000 bsr = 62 bsf = 62
63 8000000000000000 bsr = 63 bsf = 63
64 0000000000000000 bsr = 0 bsf = 63
高效 128 位 bsf
/bsr
的另一种解决方案是回收此问题中讨论的想法: