使用 NEON 内在函数进行优化
Optimization using NEON intrinsics
我是 NEON 内在函数(intrinsics)的初学者。我正在尝试优化下面的算法:
// Running per-channel sums (B, G, R, A) and the fixed-point scale applied to them.
uint32_t blue = 0, red = 0, green = 0, alpha = 0, factor = 0, shift = 0;
// Initial calculation (elided) sets factor, shift and the starting B/G/R/A sums;
// all of them are expected to be initialized with 16-bit unsigned values.
// psrc is a flat 32 bpp pixel array (4 bytes per pixel); count is the total pixel count.
for( int i = 0; i < count; i++ )
{
    // Add this pixel's four channel bytes to the running sums.
    blue  += *psrc++;
    green += *psrc++;
    red   += *psrc++;   // was a second `green +=`, which left red never accumulated
    alpha += *psrc++;
    // Scale each running sum and keep only the low 8 bits of the result.
    *pDest++ = static_cast< uint8_t >( ( blue * factor ) >> shift );
    *pDest++ = static_cast< uint8_t >( ( green * factor ) >> shift );
    *pDest++ = static_cast< uint8_t >( ( red * factor ) >> shift );
    *pDest++ = static_cast< uint8_t >( ( alpha * factor ) >> shift );
}
我不确定该如何实现,因为累加结果需要保存在 32 位容器中,而源数据是 8 位的(R G B A),并且没有能把 8 位数据直接加到 32 位数据上的指令。
谁能帮我解决这个问题?
我按照 Paul 给出的链接中的建议,已经能够把它们转换为 32 位并完成必要的运算。现在我有:
// Shift every 32-bit lane of the four products by `shift`.
// NOTE(review): vshlq_u32 takes a signed per-lane count vector (int32x4_t) and shifts
// LEFT for positive counts; matching the scalar `>> shift` needs negative counts.
// The definition of `shift` is not shown here - confirm how it is built.
uint32x4_t result1 = vshlq_u32(mult1281, shift);
uint32x4_t result2 = vshlq_u32(mult1282, shift);
uint32x4_t result3 = vshlq_u32(mult1283, shift);
uint32x4_t result4 = vshlq_u32(mult1284, shift);
result1/2/3/4 现在保存的是每通道 32 位的 RGB 通道数据。我该如何把 result1/2/3/4 合并成每通道 8 位的 RGB 通道数据,并写回目标缓冲区?
我还没有完全理解这个算法的深层含义,不过当然可以用 NEON 来优化它:
uint32_t blue = 0, red = 0, green = 0, alpha = 0, factor = 0, shift = 0;
// Initialization of the sums, factor and shift is elided here.

// Running B, G, R, A sums, one 32-bit lane per channel. Loaded from an array because
// brace-initializing a NEON vector type is a GCC/Clang extension.
const uint32_t bgraInit[4] = { blue, green, red, alpha };
uint32x4_t bgra = vld1q_u32(bgraInit);
// Loop invariants. vshrq_n_u32 only accepts a compile-time constant shift, so the
// runtime right shift is done with vshlq_u32 and a negated per-lane count.
const uint32x4_t factorV = vdupq_n_u32(factor);
const int32x4_t negShift = vdupq_n_s32(-static_cast<int32_t>(shift));
int i = 0;
// Main loop: two pixels (8 bytes) per iteration.
for (; i + 1 < count; i += 2)
{
    // load 8 8-bit values and unpack to 16-bit
    const uint16x8_t src = vmovl_u8(vld1_u8(psrc + i * 4));
    // accumulate the first pixel and scale its running sums
    bgra = vaddw_u16(bgra, vget_low_u16(src));
    const uint32x4_t lo = vshlq_u32(vmulq_u32(bgra, factorV), negShift);
    // accumulate the second pixel and scale its running sums
    bgra = vaddw_u16(bgra, vget_high_u16(src));
    const uint32x4_t hi = vshlq_u32(vmulq_u32(bgra, factorV), negShift);
    // pack 8 32-bit values to 8 8-bit (truncating, like the scalar cast)
    const uint8x8_t dst = vmovn_u16(vcombine_u16(vmovn_u32(lo), vmovn_u32(hi)));
    // store result
    vst1_u8(pDest + i * 4, dst);
}
// Odd pixel count: handle the last pixel through a small staging buffer so that no
// bytes are read or written past the end of psrc / pDest.
if (i < count)
{
    uint8_t tail[8] = { 0 };
    for (int c = 0; c < 4; ++c)
        tail[c] = psrc[i * 4 + c];
    const uint16x8_t src = vmovl_u8(vld1_u8(tail));
    bgra = vaddw_u16(bgra, vget_low_u16(src));
    const uint32x4_t lo = vshlq_u32(vmulq_u32(bgra, factorV), negShift);
    const uint16x4_t lo16 = vmovn_u32(lo);
    vst1_u8(tail, vmovn_u16(vcombine_u16(lo16, lo16)));
    for (int c = 0; c < 4; ++c)
        pDest[i * 4 + c] = tail[c];
}
我是 NEON 内在函数(intrinsics)的初学者。我正在尝试优化下面的算法:
// Running per-channel sums (B, G, R, A) and the fixed-point scale applied to them.
uint32_t blue = 0, red = 0, green = 0, alpha = 0, factor = 0, shift = 0;
// Initial calculation (elided) sets factor, shift and the starting B/G/R/A sums;
// all of them are expected to be initialized with 16-bit unsigned values.
// psrc is a flat 32 bpp pixel array (4 bytes per pixel); count is the total pixel count.
for( int i = 0; i < count; i++ )
{
    // Add this pixel's four channel bytes to the running sums.
    blue  += *psrc++;
    green += *psrc++;
    red   += *psrc++;   // was a second `green +=`, which left red never accumulated
    alpha += *psrc++;
    // Scale each running sum and keep only the low 8 bits of the result.
    *pDest++ = static_cast< uint8_t >( ( blue * factor ) >> shift );
    *pDest++ = static_cast< uint8_t >( ( green * factor ) >> shift );
    *pDest++ = static_cast< uint8_t >( ( red * factor ) >> shift );
    *pDest++ = static_cast< uint8_t >( ( alpha * factor ) >> shift );
}
我不确定该如何实现,因为累加结果需要保存在 32 位容器中,而源数据是 8 位的(R G B A),并且没有能把 8 位数据直接加到 32 位数据上的指令。
谁能帮我解决这个问题?
我按照 Paul 给出的链接中的建议,已经能够把它们转换为 32 位并完成必要的运算。现在我有:
// Shift every 32-bit lane of the four products by `shift`.
// NOTE(review): vshlq_u32 takes a signed per-lane count vector (int32x4_t) and shifts
// LEFT for positive counts; matching the scalar `>> shift` needs negative counts.
// The definition of `shift` is not shown here - confirm how it is built.
uint32x4_t result1 = vshlq_u32(mult1281, shift);
uint32x4_t result2 = vshlq_u32(mult1282, shift);
uint32x4_t result3 = vshlq_u32(mult1283, shift);
uint32x4_t result4 = vshlq_u32(mult1284, shift);
result1/2/3/4 现在保存的是每通道 32 位的 RGB 通道数据。我该如何把 result1/2/3/4 合并成每通道 8 位的 RGB 通道数据,并写回目标缓冲区?
我还没有完全理解这个算法的深层含义,不过当然可以用 NEON 来优化它:
uint32_t blue = 0, red = 0, green = 0, alpha = 0, factor = 0, shift = 0;
// Initialization of the sums, factor and shift is elided here.

// Running B, G, R, A sums, one 32-bit lane per channel. Loaded from an array because
// brace-initializing a NEON vector type is a GCC/Clang extension.
const uint32_t bgraInit[4] = { blue, green, red, alpha };
uint32x4_t bgra = vld1q_u32(bgraInit);
// Loop invariants. vshrq_n_u32 only accepts a compile-time constant shift, so the
// runtime right shift is done with vshlq_u32 and a negated per-lane count.
const uint32x4_t factorV = vdupq_n_u32(factor);
const int32x4_t negShift = vdupq_n_s32(-static_cast<int32_t>(shift));
int i = 0;
// Main loop: two pixels (8 bytes) per iteration.
for (; i + 1 < count; i += 2)
{
    // load 8 8-bit values and unpack to 16-bit
    const uint16x8_t src = vmovl_u8(vld1_u8(psrc + i * 4));
    // accumulate the first pixel and scale its running sums
    bgra = vaddw_u16(bgra, vget_low_u16(src));
    const uint32x4_t lo = vshlq_u32(vmulq_u32(bgra, factorV), negShift);
    // accumulate the second pixel and scale its running sums
    bgra = vaddw_u16(bgra, vget_high_u16(src));
    const uint32x4_t hi = vshlq_u32(vmulq_u32(bgra, factorV), negShift);
    // pack 8 32-bit values to 8 8-bit (truncating, like the scalar cast)
    const uint8x8_t dst = vmovn_u16(vcombine_u16(vmovn_u32(lo), vmovn_u32(hi)));
    // store result
    vst1_u8(pDest + i * 4, dst);
}
// Odd pixel count: handle the last pixel through a small staging buffer so that no
// bytes are read or written past the end of psrc / pDest.
if (i < count)
{
    uint8_t tail[8] = { 0 };
    for (int c = 0; c < 4; ++c)
        tail[c] = psrc[i * 4 + c];
    const uint16x8_t src = vmovl_u8(vld1_u8(tail));
    bgra = vaddw_u16(bgra, vget_low_u16(src));
    const uint32x4_t lo = vshlq_u32(vmulq_u32(bgra, factorV), negShift);
    const uint16x4_t lo16 = vmovn_u32(lo);
    vst1_u8(tail, vmovn_u16(vcombine_u16(lo16, lo16)));
    for (int c = 0; c < 4; ++c)
        pDest[i * 4 + c] = tail[c];
}