使用按位运算检测多字节字中的 0xff
Detect 0xff in multibyte word with bitwise operations
我有一个由 4 个字节组成的 32 位无符号整数,例如:0x12ff3456
.
我试图找到一个按位运算来用 0x01
替换 0xff
字节,用 0x00
替换任何其他字节,例如:
0x12ff3456
=> 0x00010000
0x543245ff
=> 0x00000001
...等等
组成32位无符号整数的字节一次只能有一个等于0xff
。
有没有人知道如何以尽可能少的操作执行此操作?折叠 (bitshifts + ands) 是一种选择,但需要太多操作。
Bit Twiddling Hacks 页面解释了如何获得一个掩码,其中每个零字节都用高位集标记。你可以在这里申请:
((~x - 0x01010101) & x & 0x80808080) >> 7
存在多种可能性,哪一种导致指令数量最少或执行时间最快,将取决于所使用的机器体系结构和编译器。一些架构提供 ANDN
指令,其他架构支持三输入逻辑指令,还有一些架构将移位与逻辑操作合并。下面我展示了三个通过详尽测试的变体。
这两种方法是将输出基于 0xFF
的逐字节相等性检查或基于 0xFE
的逐字节 "greater than" 测试。这些是通过 FUNC_VARIANT
选择的。 "greater than" 测试进一步基于 "less than" 测试,为此提供了两个实现变体,由 LTU_VARIANT
选择。
智能字节处理算法的来源在评论中注明。一般来说,在中间步骤中需要一定量的掩码,以防止特定字节的处理影响相邻字节。
请注意,代码可以很容易地调整为一次处理八个字节,而不是问题指定的四个字节。
快速检查 Compiler Explorer shows that with gcc, FUNC_VARIANT=1
,LTU_VARIANT=0
compiles to the shortest instruction sequences for both x86-64 and ARM64。不过,这不一定会转化为尽可能高的性能。
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#define FUNC_VARIANT 0
#define LTU_VARIANT 0
#define UINT32_H4 0x80808080U // byte-wise sign bits (MSBs)
#define UINT32_L4 0x01010101U // byte-wise LSBs
#define UINT32_M4 0xffffffffU // byte-wise maximum
uint32_t sign_to_bool4 (uint32_t a)
{
return (a >> 7) & UINT32_L4;
}
uint32_t vhaddu4 (uint32_t a, uint32_t b)
{
/* Peter L. Montgomery's observation (newsgroup comp.arch, 2000/02/11,
https://groups.google.com/d/msg/comp.arch/gXFuGZtZKag/_5yrz2zDbe4J):
(A+B)/2 = (A AND B) + (A XOR B)/2.
*/
return (a & b) + (((a ^ b) >> 1) & ~UINT32_H4);
}
uint32_t ltu4_core (uint32_t a, uint32_t b)
{
/* Sebastiano Vigna, "Broadword implementation of rank/select queries."
In: International Workshop on Experimental and Efficient Algorithms,
pp. 154-168, Springer Berlin Heidelberg, 2008.
*/
return (((a | UINT32_H4) - (b & ~UINT32_H4)) | (a ^ b)) ^ (a | ~b);
}
uint32_t vsetltu4 (uint32_t a, uint32_t b)
{
#if LTU_VARIANT==1
return sign_to_bool4 (ltu4_core (a, b));
#else // LTU_VARIANT
return sign_to_bool4 (vhaddu4 (~a, b));
#endif // LTU_VARIANT
}
uint32_t vsetgtu4 (uint32_t a, uint32_t b)
{
return vsetltu4 (b, a);
}
uint32_t vseteq4 (uint32_t a, uint32_t b)
{
uint32_t r, t;
/* Alan Mycroft's null-byte detection algorithm (newsgroup comp.lang.c, 1987/04/08,
https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ):
null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
*/
r = a ^ b; // 0x00 if a == b
t = r | UINT32_H4; // set msbs, to catch carry out
r = r ^ t; // extract msbs, msb = 1 if r < 0x80
t = t - UINT32_L4; // sign bit = 0, if r was 0x00 or 0x80
t = r & ~t; // sign_bit = 1, if r was 0x00
r = t >> 7; // convert to bool
return r;
}
uint32_t func (uint32_t a)
{
#if FUNC_VARIANT == 1
return vsetgtu4 (a, ~UINT32_L4); // byte-wise a >ᶸ 0xFE
#else // FUNC_VARIANT
return vseteq4 (a, UINT32_M4); // byte-wise a == 0xFF
#endif // FUNC_VARIANT
}
uint32_t ref_func (uint32_t a)
{
uint8_t a0 = (uint8_t)((a >> 0) & 0xff);
uint8_t a1 = (uint8_t)((a >> 8) & 0xff);
uint8_t a2 = (uint8_t)((a >> 16) & 0xff);
uint8_t a3 = (uint8_t)((a >> 24) & 0xff);
int p0 = (a0 == 0xff);
int p1 = (a1 == 0xff);
int p2 = (a2 == 0xff);
int p3 = (a3 == 0xff);
return (((uint32_t)p3 << 24) | ((uint32_t)p2 << 16) |
((uint32_t)p1 << 8) | ((uint32_t)p0 << 0));
}
int main (void)
{
uint32_t res, ref, x = 0;
do {
res = func (x);
ref = ref_func (x);
if (res != ref) {
printf ("error @ %08x: res=%08x ref=%08x\n", x, res, ref);
return EXIT_FAILURE;
}
x++;
} while (x);
printf ("test passed\n");
return EXIT_SUCCESS;
}
我有一个由 4 个字节组成的 32 位无符号整数,例如:0x12ff3456
.
我试图找到一个按位运算来用 0x01
替换 0xff
字节,用 0x00
替换任何其他字节,例如:
0x12ff3456
=> 0x00010000
0x543245ff
=> 0x00000001
...等等
组成32位无符号整数的字节一次只能有一个等于0xff
。
有没有人知道如何以尽可能少的操作执行此操作?折叠 (bitshifts + ands) 是一种选择,但需要太多操作。
Bit Twiddling Hacks 页面解释了如何获得一个掩码,其中每个零字节都用高位集标记。你可以在这里申请:
((~x - 0x01010101) & x & 0x80808080) >> 7
存在多种可能性,哪一种导致指令数量最少或执行时间最快,将取决于所使用的机器体系结构和编译器。一些架构提供 ANDN
指令,其他架构支持三输入逻辑指令,还有一些架构将移位与逻辑操作合并。下面我展示了三个通过详尽测试的变体。
这两种方法是将输出基于 0xFF
的逐字节相等性检查或基于 0xFE
的逐字节 "greater than" 测试。这些是通过 FUNC_VARIANT
选择的。 "greater than" 测试进一步基于 "less than" 测试,为此提供了两个实现变体,由 LTU_VARIANT
选择。
智能字节处理算法的来源在评论中注明。一般来说,在中间步骤中需要一定量的掩码,以防止特定字节的处理影响相邻字节。
请注意,代码可以很容易地调整为一次处理八个字节,而不是问题指定的四个字节。
快速检查 Compiler Explorer shows that with gcc, FUNC_VARIANT=1
,LTU_VARIANT=0
compiles to the shortest instruction sequences for both x86-64 and ARM64。不过,这不一定会转化为尽可能高的性能。
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#define FUNC_VARIANT 0
#define LTU_VARIANT 0
#define UINT32_H4 0x80808080U // byte-wise sign bits (MSBs)
#define UINT32_L4 0x01010101U // byte-wise LSBs
#define UINT32_M4 0xffffffffU // byte-wise maximum
uint32_t sign_to_bool4 (uint32_t a)
{
return (a >> 7) & UINT32_L4;
}
uint32_t vhaddu4 (uint32_t a, uint32_t b)
{
/* Peter L. Montgomery's observation (newsgroup comp.arch, 2000/02/11,
https://groups.google.com/d/msg/comp.arch/gXFuGZtZKag/_5yrz2zDbe4J):
(A+B)/2 = (A AND B) + (A XOR B)/2.
*/
return (a & b) + (((a ^ b) >> 1) & ~UINT32_H4);
}
uint32_t ltu4_core (uint32_t a, uint32_t b)
{
/* Sebastiano Vigna, "Broadword implementation of rank/select queries."
In: International Workshop on Experimental and Efficient Algorithms,
pp. 154-168, Springer Berlin Heidelberg, 2008.
*/
return (((a | UINT32_H4) - (b & ~UINT32_H4)) | (a ^ b)) ^ (a | ~b);
}
uint32_t vsetltu4 (uint32_t a, uint32_t b)
{
#if LTU_VARIANT==1
return sign_to_bool4 (ltu4_core (a, b));
#else // LTU_VARIANT
return sign_to_bool4 (vhaddu4 (~a, b));
#endif // LTU_VARIANT
}
uint32_t vsetgtu4 (uint32_t a, uint32_t b)
{
return vsetltu4 (b, a);
}
uint32_t vseteq4 (uint32_t a, uint32_t b)
{
uint32_t r, t;
/* Alan Mycroft's null-byte detection algorithm (newsgroup comp.lang.c, 1987/04/08,
https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ):
null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
*/
r = a ^ b; // 0x00 if a == b
t = r | UINT32_H4; // set msbs, to catch carry out
r = r ^ t; // extract msbs, msb = 1 if r < 0x80
t = t - UINT32_L4; // sign bit = 0, if r was 0x00 or 0x80
t = r & ~t; // sign_bit = 1, if r was 0x00
r = t >> 7; // convert to bool
return r;
}
uint32_t func (uint32_t a)
{
#if FUNC_VARIANT == 1
return vsetgtu4 (a, ~UINT32_L4); // byte-wise a >ᶸ 0xFE
#else // FUNC_VARIANT
return vseteq4 (a, UINT32_M4); // byte-wise a == 0xFF
#endif // FUNC_VARIANT
}
uint32_t ref_func (uint32_t a)
{
uint8_t a0 = (uint8_t)((a >> 0) & 0xff);
uint8_t a1 = (uint8_t)((a >> 8) & 0xff);
uint8_t a2 = (uint8_t)((a >> 16) & 0xff);
uint8_t a3 = (uint8_t)((a >> 24) & 0xff);
int p0 = (a0 == 0xff);
int p1 = (a1 == 0xff);
int p2 = (a2 == 0xff);
int p3 = (a3 == 0xff);
return (((uint32_t)p3 << 24) | ((uint32_t)p2 << 16) |
((uint32_t)p1 << 8) | ((uint32_t)p0 << 0));
}
int main (void)
{
uint32_t res, ref, x = 0;
do {
res = func (x);
ref = ref_func (x);
if (res != ref) {
printf ("error @ %08x: res=%08x ref=%08x\n", x, res, ref);
return EXIT_FAILURE;
}
x++;
} while (x);
printf ("test passed\n");
return EXIT_SUCCESS;
}