使用按位运算检测多字节字中的 0xff

Detect 0xff in multibyte word with bitwise operations

我有一个由 4 个字节组成的 32 位无符号整数,例如:0x12ff3456.

我试图找到一个按位运算来用 0x01 替换 0xff 字节,用 0x00 替换任何其他字节,例如:

0x12ff3456 => 0x00010000

0x543245ff => 0x00000001 ...等等

组成32位无符号整数的字节一次只能有一个等于0xff。 有没有人知道如何以尽可能少的操作执行此操作?折叠 (bitshifts + ands) 是一种选择,但需要太多操作。

Bit Twiddling Hacks 页面解释了如何获得一个掩码,其中每个零字节都用高位集标记。你可以在这里申请:

((~x - 0x01010101) & x & 0x80808080) >> 7

存在多种可能性,哪一种导致指令数量最少或执行时间最快,将取决于所使用的机器体系结构和编译器。一些架构提供 ANDN 指令,其他架构支持三输入逻辑指令,还有一些架构将移位与逻辑操作合并。下面我展示了三个通过详尽测试的变体。

这两种方法是将输出基于 0xFF 的逐字节相等性检查或基于 0xFE 的逐字节 "greater than" 测试。这些是通过 FUNC_VARIANT 选择的。 "greater than" 测试进一步基于 "less than" 测试,为此提供了两个实现变体,由 LTU_VARIANT 选择。

智能字节处理算法的来源在评论中注明。一般来说,在中间步骤中需要一定量的掩码,以防止特定字节的处理影响相邻字节。

请注意,代码可以很容易地调整为一次处理八个字节,而不是问题指定的四个字节。

快速检查 Compiler Explorer shows that with gcc, FUNC_VARIANT=1,LTU_VARIANT=0 compiles to the shortest instruction sequences for both x86-64 and ARM64。不过,这不一定会转化为尽可能高的性能。

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>

#define FUNC_VARIANT  0
#define LTU_VARIANT   0

#define UINT32_H4  0x80808080U   // byte-wise sign bits (MSBs)
#define UINT32_L4  0x01010101U   // byte-wise LSBs 
#define UINT32_M4  0xffffffffU   // byte-wise maximum

uint32_t sign_to_bool4 (uint32_t a)
{
    return (a >> 7) & UINT32_L4;
}

uint32_t vhaddu4 (uint32_t a, uint32_t b)
{
    /* Peter L. Montgomery's observation (newsgroup comp.arch, 2000/02/11,
       https://groups.google.com/d/msg/comp.arch/gXFuGZtZKag/_5yrz2zDbe4J):
       (A+B)/2 = (A AND B) + (A XOR B)/2.
    */
    return (a & b) + (((a ^ b) >> 1) & ~UINT32_H4);
}

uint32_t ltu4_core (uint32_t a, uint32_t b)
{
    /* Sebastiano Vigna, "Broadword implementation of rank/select queries." 
       In: International Workshop on Experimental and Efficient Algorithms, 
       pp. 154-168, Springer Berlin Heidelberg, 2008.
    */
    return (((a | UINT32_H4) - (b & ~UINT32_H4)) | (a ^ b)) ^ (a | ~b);
}

uint32_t vsetltu4 (uint32_t a, uint32_t b)
{
#if LTU_VARIANT==1
    return sign_to_bool4 (ltu4_core (a, b));
#else // LTU_VARIANT
    return sign_to_bool4 (vhaddu4 (~a, b));
#endif // LTU_VARIANT
}

uint32_t vsetgtu4 (uint32_t a, uint32_t b)
{
    return vsetltu4 (b, a);
}

uint32_t vseteq4 (uint32_t a, uint32_t b)
{
    uint32_t r, t;

    /* Alan Mycroft's null-byte detection algorithm (newsgroup comp.lang.c, 1987/04/08,
       https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ):
       null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    */
    r = a ^ b;         // 0x00 if a == b
    t = r | UINT32_H4; // set msbs, to catch carry out
    r = r ^ t;         // extract msbs, msb = 1 if r < 0x80
    t = t - UINT32_L4; // sign bit = 0, if r was 0x00 or 0x80
    t = r & ~t;        // sign_bit = 1, if r was 0x00
    r = t >> 7;        // convert to bool
    return r;
}

uint32_t func (uint32_t a) 
{
#if FUNC_VARIANT == 1
    return vsetgtu4 (a, ~UINT32_L4); // byte-wise a >ᶸ 0xFE
#else // FUNC_VARIANT
    return vseteq4 (a, UINT32_M4);   // byte-wise a == 0xFF
#endif // FUNC_VARIANT
}

uint32_t ref_func (uint32_t a)
{
    uint8_t a0 = (uint8_t)((a >>  0) & 0xff);
    uint8_t a1 = (uint8_t)((a >>  8) & 0xff);
    uint8_t a2 = (uint8_t)((a >> 16) & 0xff);
    uint8_t a3 = (uint8_t)((a >> 24) & 0xff);
    int p0 = (a0 == 0xff);
    int p1 = (a1 == 0xff);
    int p2 = (a2 == 0xff);
    int p3 = (a3 == 0xff);
    return (((uint32_t)p3 << 24) | ((uint32_t)p2 << 16) |
            ((uint32_t)p1 <<  8) | ((uint32_t)p0 <<  0));
}

int main (void)
{
    uint32_t res, ref, x = 0;

    do {
        res = func (x);
        ref = ref_func (x);
        if (res != ref) {
            printf ("error @ %08x: res=%08x  ref=%08x\n", x, res, ref);
            return EXIT_FAILURE;
        }
        x++;
    } while (x);
    printf ("test passed\n");
    return EXIT_SUCCESS;
}