二进制加法从左到右进位
Binary addition carrying from left to right
你如何有效地实现从左到右的 64 位二进制加法?就像翻转位然后添加:
0101 + 1110 = BitReverse(1010 + 0111)
= BitReverse(10001)
= 10001
我目前的解决方案是使用增量交换(也可能是字节交换内部函数)反转输入的位顺序,使用正常加法,然后再次反转,这不是特别快,但可能比循环 64-位整数。 (我还没有测试过,但还是太慢了。)
uint64_t addreverse(uint64_t a, uint64_t b) {
return BitReverse(BitReverse(a) + BitReverse(b));
}
这很慢,因为位需要反转 3 次,使用字节交换时需要 40 多次操作。
编辑:我不能将它们颠倒存储,因为我也需要定期添加。
您想按位进行并跟踪进位。类似于:
uint64_t addreverse(uint64_t a, uint64_t b) {
uint64_t current_bit = 0x8000000000000000ull;
unsigned int shift = 63;
unsigned int carry = 0;
uint64_t sum = 0;
while (current_bit) {
uint64_t bit_sum = ((a & current_bit) >> shift) +
((b & current_bit) >> shift);
sum += (((bit_sum & 1) + carry) << shift);
carry = (bit_sum & 2) >> 1;
current_bit >>= 1;
--shift;
}
return sum;
}
我会使用 LUT tables。这里小C++ 32bit(DOWRD = unsigned __int32_t
)例子:
//---------------------------------------------------------------------------
// select fastest operation
#define BitReverse BitReverse1
#define AddReverse AddReverseKS
//---------------------------------------------------------------------------
DWORD BitReverseSlow(DWORD x) // Slow 1bit reversing
{
DWORD m0,m1,y;
for (y=0,m0=1,m1=0x80000000;m0<m1;m0<<=1,m1>>=1)
{
if (DWORD(m0&x)) y|=m1;
if (DWORD(m1&x)) y|=m0;
}
return y;
}
//---------------------------------------------------------------------------
DWORD BitReverse1(DWORD x)
{
x=((x<<16)&0xFFFF0000)|((x>>16)&0x0000FFFF);
x=((x<< 8)&0xFF00FF00)|((x>> 8)&0x00FF00FF);
x=((x<< 4)&0xF0F0F0F0)|((x>> 4)&0x0F0F0F0F);
x=((x<< 2)&0xCCCCCCCC)|((x>> 2)&0x33333333);
x=((x<< 1)&0xAAAAAAAA)|((x>> 1)&0x55555555);
return x;
}
//---------------------------------------------------------------------------
DWORD BitReverse8(DWORD x) // fast 8bit LUT reversing
{
DWORD y;
BYTE *px=(BYTE*)&x;
BYTE *py=(BYTE*)&y;
static const BYTE BitReverse8_LUT[256]= // BitReverse8_LUT[x]=BitReverse(x) in 8bits
{
0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
0x0C,0x8C,0x4C,0xCC,0x2C,0xAC,0x6C,0xEC,0x1C,0x9C,0x5C,0xDC,0x3C,0xBC,0x7C,0xFC,
0x02,0x82,0x42,0xC2,0x22,0xA2,0x62,0xE2,0x12,0x92,0x52,0xD2,0x32,0xB2,0x72,0xF2,
0x0A,0x8A,0x4A,0xCA,0x2A,0xAA,0x6A,0xEA,0x1A,0x9A,0x5A,0xDA,0x3A,0xBA,0x7A,0xFA,
0x06,0x86,0x46,0xC6,0x26,0xA6,0x66,0xE6,0x16,0x96,0x56,0xD6,0x36,0xB6,0x76,0xF6,
0x0E,0x8E,0x4E,0xCE,0x2E,0xAE,0x6E,0xEE,0x1E,0x9E,0x5E,0xDE,0x3E,0xBE,0x7E,0xFE,
0x01,0x81,0x41,0xC1,0x21,0xA1,0x61,0xE1,0x11,0x91,0x51,0xD1,0x31,0xB1,0x71,0xF1,
0x09,0x89,0x49,0xC9,0x29,0xA9,0x69,0xE9,0x19,0x99,0x59,0xD9,0x39,0xB9,0x79,0xF9,
0x05,0x85,0x45,0xC5,0x25,0xA5,0x65,0xE5,0x15,0x95,0x55,0xD5,0x35,0xB5,0x75,0xF5,
0x0D,0x8D,0x4D,0xCD,0x2D,0xAD,0x6D,0xED,0x1D,0x9D,0x5D,0xDD,0x3D,0xBD,0x7D,0xFD,
0x03,0x83,0x43,0xC3,0x23,0xA3,0x63,0xE3,0x13,0x93,0x53,0xD3,0x33,0xB3,0x73,0xF3,
0x0B,0x8B,0x4B,0xCB,0x2B,0xAB,0x6B,0xEB,0x1B,0x9B,0x5B,0xDB,0x3B,0xBB,0x7B,0xFB,
0x07,0x87,0x47,0xC7,0x27,0xA7,0x67,0xE7,0x17,0x97,0x57,0xD7,0x37,0xB7,0x77,0xF7,
0x0F,0x8F,0x4F,0xCF,0x2F,0xAF,0x6F,0xEF,0x1F,0x9F,0x5F,0xDF,0x3F,0xBF,0x7F,0xFF
};
py[0]=BitReverse8_LUT[px[3]];
py[1]=BitReverse8_LUT[px[2]];
py[2]=BitReverse8_LUT[px[1]];
py[3]=BitReverse8_LUT[px[0]];
return y;
}
//---------------------------------------------------------------------------
DWORD AddReverseSlow(DWORD x,DWORD y)
{
return BitReverse(BitReverse(x)+BitReverse(y));
}
//---------------------------------------------------------------------------
DWORD AddReverse4(DWORD x,DWORD y)
{
int i;
DWORD z,cy0,cy,a;
static const BYTE AddReverse4_LUT[16][16]= // AddReverse4_LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1)) in 4bits
{
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 },
{ 2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29 },
{ 4, 6, 2, 1, 12, 14, 10, 9, 20, 22, 18, 17, 28, 30, 26, 25 },
{ 6, 5, 1, 3, 14, 13, 9, 11, 22, 21, 17, 19, 30, 29, 25, 27 },
{ 8, 10, 12, 14, 4, 6, 2, 1, 24, 26, 28, 30, 20, 22, 18, 17 },
{ 10, 9, 14, 13, 6, 5, 1, 3, 26, 25, 30, 29, 22, 21, 17, 19 },
{ 12, 14, 10, 9, 2, 1, 6, 5, 28, 30, 26, 25, 18, 17, 22, 21 },
{ 14, 13, 9, 11, 1, 3, 5, 7, 30, 29, 25, 27, 17, 19, 21, 23 },
{ 16, 18, 20, 22, 24, 26, 28, 30, 8, 10, 12, 14, 4, 6, 2, 1 },
{ 18, 17, 22, 21, 26, 25, 30, 29, 10, 9, 14, 13, 6, 5, 1, 3 },
{ 20, 22, 18, 17, 28, 30, 26, 25, 12, 14, 10, 9, 2, 1, 6, 5 },
{ 22, 21, 17, 19, 30, 29, 25, 27, 14, 13, 9, 11, 1, 3, 5, 7 },
{ 24, 26, 28, 30, 20, 22, 18, 17, 4, 6, 2, 1, 12, 14, 10, 9 },
{ 26, 25, 30, 29, 22, 21, 17, 19, 6, 5, 1, 3, 14, 13, 9, 11 },
{ 28, 30, 26, 25, 18, 17, 22, 21, 2, 1, 6, 5, 10, 9, 14, 13 },
{ 30, 29, 25, 27, 17, 19, 21, 23, 1, 3, 5, 7, 9, 11, 13, 15 }
};
for (cy0=0,z=0,i=28;i>=0;i-=4,cy0=cy)
{
a=AddReverse4_LUT[(x>>i)&15][(y>>i)&15]; cy=a&1; a>>=1; // add
if (cy0){ a=AddReverse4_LUT[8][a]; cy|=a&1; a>>=1; } // adc
z|=a<<i;
}
return z;
}
//---------------------------------------------------------------------------
WORD AddReverese8_LUT[256][256]; // LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1)) in 8bits
DWORD AddReverse8(DWORD x,DWORD y)
{
int i;
DWORD z,cy0,cy,a;
BYTE *px=(BYTE*)&x;
BYTE *py=(BYTE*)&y;
BYTE *pz=(BYTE*)&z;
for (cy0=0,z=0,i=3;i>=0;i--,cy0=cy)
{
a=AddReverese8_LUT[px[i]][py[i]]; cy=a&1; a>>=1; // add
if (cy0){ a=AddReverese8_LUT[128][a]; cy|=a&1; a>>=1; } // adc
pz[i]=a;
}
return z;
}
//---------------------------------------------------------------------------
DWORD AddReverseKS(DWORD x,DWORD y) // https://en.wikipedia.org/wiki/Kogge–Stone_adder
{ // Adder from harold's answer ported to 32bit
DWORD p=x^y;
DWORD g=x&y;
g|=p&(g>> 1); p&=p>> 1;
g|=p&(g>> 2); p&=p>> 2;
g|=p&(g>> 4); p&=p>> 4;
g|=p&(g>> 8); p&=p>> 8;
g|=p&(g>>16);
return x^y^(g>>1);
}
//---------------------------------------------------------------------------
AddReverse8_LUT
初始化如下:
for (DWORD x=0;x<256;x++)
for (DWORD y=0;y<256;y++)
AddReverse8_LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1));
这里是我的设置的测量时间:
8M x BitReverseSlow [ 506.585 ms]
8M x BitReverse1 [ 59.769 ms]
8M x BitReverse8 [ 72.355 ms]
16M x AddReverseSlow [ 611.677 ms]
16M x AddReverse4 [ 491.546 ms]
16M x AddReverse8 [ 347.293 ms]
16M x AddReverseKS [ 142.149 ms]
如您所见,LUT 越大速度越好,但代价是 space。最重要的是,如果分辨率是 8/16/32 位,您可以通过指针使用 BYTE 访问,而不是使用慢位 shift/mask 来加快速度。
只是为了确保加法 LUT[][]
table 的编码使得加法向左移动 1 位并且空的 space 被下一个进位标志占用蚕食...
在我做 adc
(通过进位标志递增)的添加中,LUT
的操作数在 LUT 位分辨率中是 1
位反转所以
1000bin = 8dec // 4 bits
10000000bin = 128dec // 8 bits
要将其移植到 64 位 只需将 DWORD
更改为您的数据类型并更改位掩码和迭代计数以匹配 64 位 ...
无论如何,这让我想起了我以前的回答:
- Cant make value propagate through carry
不过看起来 BitReverse
的非 LUT 版本更快
经过一些挖掘,看起来 LUT 与临时变量的使用迫使我的编译器以不同的方式编译我的函数,速度大大降低(更多的棕褐色两次)。一旦所有函数都强制执行此类调用 api,LUT 版本会稍微快一些:
8M x BitReverseSlow [ 537.534 ms]
8M x BitReverse1 [ 99.804 ms] <- this is slowed down due to changing call style to the same as the BitReverse8
8M x BitReverse8 [ 80.725 ms]
所以我怀疑对于相当简单的函数,我的编译器使用汇编调用风格(不使用 heap/stack 作为操作数和返回值)。
所以结果是:
使用 BitReverse1
更快。出于同样的原因,harolds 的回答到目前为止也比我的好得多 + 它需要的操作更少 ...
但是,如果您将其硬连线(FPGA 或在芯片上或使用门),我怀疑 LUT 版本的 BitReverse
会更快(尽管您可以直接进行 Hardwire 位反转,这是无与伦比的)。
模拟一个 Kogge-Stone adder,移位方向相反,给出了一个很好的算法,
uint64_t p = x ^ y;
uint64_t g = x & y;
g |= p & (g >> 1);
p &= p >> 1;
g |= p & (g >> 2);
p &= p >> 2;
g |= p & (g >> 4);
p &= p >> 4;
g |= p & (g >> 8);
p &= p >> 8;
g |= p & (g >> 16);
p &= p >> 16;
g |= p & (g >> 32);
uint64_t result = x ^ y ^ (g >> 1);
你如何有效地实现从左到右的 64 位二进制加法?就像翻转位然后添加:
0101 + 1110 = BitReverse(1010 + 0111)
= BitReverse(10001)
= 10001
我目前的解决方案是使用增量交换(也可能是字节交换内部函数)反转输入的位顺序,使用正常加法,然后再次反转,这不是特别快,但可能比循环 64-位整数。 (我还没有测试过,但还是太慢了。)
uint64_t addreverse(uint64_t a, uint64_t b) {
return BitReverse(BitReverse(a) + BitReverse(b));
}
这很慢,因为位需要反转 3 次,使用字节交换时需要 40 多次操作。
编辑:我不能将它们颠倒存储,因为我也需要定期添加。
您想按位进行并跟踪进位。类似于:
uint64_t addreverse(uint64_t a, uint64_t b) {
uint64_t current_bit = 0x8000000000000000ull;
unsigned int shift = 63;
unsigned int carry = 0;
uint64_t sum = 0;
while (current_bit) {
uint64_t bit_sum = ((a & current_bit) >> shift) +
((b & current_bit) >> shift);
sum += (((bit_sum & 1) + carry) << shift);
carry = (bit_sum & 2) >> 1;
current_bit >>= 1;
--shift;
}
return sum;
}
我会使用 LUT tables。这里小C++ 32bit(DOWRD = unsigned __int32_t
)例子:
//---------------------------------------------------------------------------
// select fastest operation
#define BitReverse BitReverse1
#define AddReverse AddReverseKS
//---------------------------------------------------------------------------
DWORD BitReverseSlow(DWORD x) // Slow 1bit reversing
{
DWORD m0,m1,y;
for (y=0,m0=1,m1=0x80000000;m0<m1;m0<<=1,m1>>=1)
{
if (DWORD(m0&x)) y|=m1;
if (DWORD(m1&x)) y|=m0;
}
return y;
}
//---------------------------------------------------------------------------
DWORD BitReverse1(DWORD x)
{
x=((x<<16)&0xFFFF0000)|((x>>16)&0x0000FFFF);
x=((x<< 8)&0xFF00FF00)|((x>> 8)&0x00FF00FF);
x=((x<< 4)&0xF0F0F0F0)|((x>> 4)&0x0F0F0F0F);
x=((x<< 2)&0xCCCCCCCC)|((x>> 2)&0x33333333);
x=((x<< 1)&0xAAAAAAAA)|((x>> 1)&0x55555555);
return x;
}
//---------------------------------------------------------------------------
DWORD BitReverse8(DWORD x) // fast 8bit LUT reversing
{
DWORD y;
BYTE *px=(BYTE*)&x;
BYTE *py=(BYTE*)&y;
static const BYTE BitReverse8_LUT[256]= // BitReverse8_LUT[x]=BitReverse(x) in 8bits
{
0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
0x0C,0x8C,0x4C,0xCC,0x2C,0xAC,0x6C,0xEC,0x1C,0x9C,0x5C,0xDC,0x3C,0xBC,0x7C,0xFC,
0x02,0x82,0x42,0xC2,0x22,0xA2,0x62,0xE2,0x12,0x92,0x52,0xD2,0x32,0xB2,0x72,0xF2,
0x0A,0x8A,0x4A,0xCA,0x2A,0xAA,0x6A,0xEA,0x1A,0x9A,0x5A,0xDA,0x3A,0xBA,0x7A,0xFA,
0x06,0x86,0x46,0xC6,0x26,0xA6,0x66,0xE6,0x16,0x96,0x56,0xD6,0x36,0xB6,0x76,0xF6,
0x0E,0x8E,0x4E,0xCE,0x2E,0xAE,0x6E,0xEE,0x1E,0x9E,0x5E,0xDE,0x3E,0xBE,0x7E,0xFE,
0x01,0x81,0x41,0xC1,0x21,0xA1,0x61,0xE1,0x11,0x91,0x51,0xD1,0x31,0xB1,0x71,0xF1,
0x09,0x89,0x49,0xC9,0x29,0xA9,0x69,0xE9,0x19,0x99,0x59,0xD9,0x39,0xB9,0x79,0xF9,
0x05,0x85,0x45,0xC5,0x25,0xA5,0x65,0xE5,0x15,0x95,0x55,0xD5,0x35,0xB5,0x75,0xF5,
0x0D,0x8D,0x4D,0xCD,0x2D,0xAD,0x6D,0xED,0x1D,0x9D,0x5D,0xDD,0x3D,0xBD,0x7D,0xFD,
0x03,0x83,0x43,0xC3,0x23,0xA3,0x63,0xE3,0x13,0x93,0x53,0xD3,0x33,0xB3,0x73,0xF3,
0x0B,0x8B,0x4B,0xCB,0x2B,0xAB,0x6B,0xEB,0x1B,0x9B,0x5B,0xDB,0x3B,0xBB,0x7B,0xFB,
0x07,0x87,0x47,0xC7,0x27,0xA7,0x67,0xE7,0x17,0x97,0x57,0xD7,0x37,0xB7,0x77,0xF7,
0x0F,0x8F,0x4F,0xCF,0x2F,0xAF,0x6F,0xEF,0x1F,0x9F,0x5F,0xDF,0x3F,0xBF,0x7F,0xFF
};
py[0]=BitReverse8_LUT[px[3]];
py[1]=BitReverse8_LUT[px[2]];
py[2]=BitReverse8_LUT[px[1]];
py[3]=BitReverse8_LUT[px[0]];
return y;
}
//---------------------------------------------------------------------------
DWORD AddReverseSlow(DWORD x,DWORD y)
{
return BitReverse(BitReverse(x)+BitReverse(y));
}
//---------------------------------------------------------------------------
DWORD AddReverse4(DWORD x,DWORD y)
{
int i;
DWORD z,cy0,cy,a;
static const BYTE AddReverse4_LUT[16][16]= // AddReverse4_LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1)) in 4bits
{
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 },
{ 2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29 },
{ 4, 6, 2, 1, 12, 14, 10, 9, 20, 22, 18, 17, 28, 30, 26, 25 },
{ 6, 5, 1, 3, 14, 13, 9, 11, 22, 21, 17, 19, 30, 29, 25, 27 },
{ 8, 10, 12, 14, 4, 6, 2, 1, 24, 26, 28, 30, 20, 22, 18, 17 },
{ 10, 9, 14, 13, 6, 5, 1, 3, 26, 25, 30, 29, 22, 21, 17, 19 },
{ 12, 14, 10, 9, 2, 1, 6, 5, 28, 30, 26, 25, 18, 17, 22, 21 },
{ 14, 13, 9, 11, 1, 3, 5, 7, 30, 29, 25, 27, 17, 19, 21, 23 },
{ 16, 18, 20, 22, 24, 26, 28, 30, 8, 10, 12, 14, 4, 6, 2, 1 },
{ 18, 17, 22, 21, 26, 25, 30, 29, 10, 9, 14, 13, 6, 5, 1, 3 },
{ 20, 22, 18, 17, 28, 30, 26, 25, 12, 14, 10, 9, 2, 1, 6, 5 },
{ 22, 21, 17, 19, 30, 29, 25, 27, 14, 13, 9, 11, 1, 3, 5, 7 },
{ 24, 26, 28, 30, 20, 22, 18, 17, 4, 6, 2, 1, 12, 14, 10, 9 },
{ 26, 25, 30, 29, 22, 21, 17, 19, 6, 5, 1, 3, 14, 13, 9, 11 },
{ 28, 30, 26, 25, 18, 17, 22, 21, 2, 1, 6, 5, 10, 9, 14, 13 },
{ 30, 29, 25, 27, 17, 19, 21, 23, 1, 3, 5, 7, 9, 11, 13, 15 }
};
for (cy0=0,z=0,i=28;i>=0;i-=4,cy0=cy)
{
a=AddReverse4_LUT[(x>>i)&15][(y>>i)&15]; cy=a&1; a>>=1; // add
if (cy0){ a=AddReverse4_LUT[8][a]; cy|=a&1; a>>=1; } // adc
z|=a<<i;
}
return z;
}
//---------------------------------------------------------------------------
WORD AddReverese8_LUT[256][256]; // LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1)) in 8bits
DWORD AddReverse8(DWORD x,DWORD y)
{
int i;
DWORD z,cy0,cy,a;
BYTE *px=(BYTE*)&x;
BYTE *py=(BYTE*)&y;
BYTE *pz=(BYTE*)&z;
for (cy0=0,z=0,i=3;i>=0;i--,cy0=cy)
{
a=AddReverese8_LUT[px[i]][py[i]]; cy=a&1; a>>=1; // add
if (cy0){ a=AddReverese8_LUT[128][a]; cy|=a&1; a>>=1; } // adc
pz[i]=a;
}
return z;
}
//---------------------------------------------------------------------------
DWORD AddReverseKS(DWORD x,DWORD y) // https://en.wikipedia.org/wiki/Kogge–Stone_adder
{ // Adder from harold's answer ported to 32bit
DWORD p=x^y;
DWORD g=x&y;
g|=p&(g>> 1); p&=p>> 1;
g|=p&(g>> 2); p&=p>> 2;
g|=p&(g>> 4); p&=p>> 4;
g|=p&(g>> 8); p&=p>> 8;
g|=p&(g>>16);
return x^y^(g>>1);
}
//---------------------------------------------------------------------------
AddReverse8_LUT
初始化如下:
for (DWORD x=0;x<256;x++)
for (DWORD y=0;y<256;y++)
AddReverse8_LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1));
这里是我的设置的测量时间:
8M x BitReverseSlow [ 506.585 ms]
8M x BitReverse1 [ 59.769 ms]
8M x BitReverse8 [ 72.355 ms]
16M x AddReverseSlow [ 611.677 ms]
16M x AddReverse4 [ 491.546 ms]
16M x AddReverse8 [ 347.293 ms]
16M x AddReverseKS [ 142.149 ms]
如您所见,LUT 越大速度越好,但代价是 space。最重要的是,如果分辨率是 8/16/32 位,您可以通过指针使用 BYTE 访问,而不是使用慢位 shift/mask 来加快速度。
只是为了确保加法 LUT[][]
table 的编码使得加法向左移动 1 位并且空的 space 被下一个进位标志占用蚕食...
在我做 adc
(通过进位标志递增)的添加中,LUT
的操作数在 LUT 位分辨率中是 1
位反转所以
1000bin = 8dec // 4 bits
10000000bin = 128dec // 8 bits
要将其移植到 64 位 只需将 DWORD
更改为您的数据类型并更改位掩码和迭代计数以匹配 64 位 ...
无论如何,这让我想起了我以前的回答:
- Cant make value propagate through carry
不过看起来 BitReverse
的非 LUT 版本更快
经过一些挖掘,看起来 LUT 与临时变量的使用迫使我的编译器以不同的方式编译我的函数,速度大大降低(更多的棕褐色两次)。一旦所有函数都强制执行此类调用 api,LUT 版本会稍微快一些:
8M x BitReverseSlow [ 537.534 ms]
8M x BitReverse1 [ 99.804 ms] <- this is slowed down due to changing call style to the same as the BitReverse8
8M x BitReverse8 [ 80.725 ms]
所以我怀疑对于相当简单的函数,我的编译器使用汇编调用风格(不使用 heap/stack 作为操作数和返回值)。
所以结果是:
使用 BitReverse1
更快。出于同样的原因,harolds 的回答到目前为止也比我的好得多 + 它需要的操作更少 ...
但是,如果您将其硬连线(FPGA 或在芯片上或使用门),我怀疑 LUT 版本的 BitReverse
会更快(尽管您可以直接进行 Hardwire 位反转,这是无与伦比的)。
模拟一个 Kogge-Stone adder,移位方向相反,给出了一个很好的算法,
uint64_t p = x ^ y;
uint64_t g = x & y;
g |= p & (g >> 1);
p &= p >> 1;
g |= p & (g >> 2);
p &= p >> 2;
g |= p & (g >> 4);
p &= p >> 4;
g |= p & (g >> 8);
p &= p >> 8;
g |= p & (g >> 16);
p &= p >> 16;
g |= p & (g >> 32);
uint64_t result = x ^ y ^ (g >> 1);