使用 SIMD 优化图像大小调整(最近方法)
Optimization of image resizing (method Nearest) with using SIMD
我知道 'Nearest' 调整图像大小的方法是最快的方法。
尽管如此,我还是在寻找加速它的方法。
明显的步骤是预先计算索引:
void CalcIndex(int sizeS, int sizeD, int colors, int* idx)
{
float scale = (float)sizeS / sizeD;
for (size_t i = 0; i < sizeD; ++i)
{
int index = (int)::floor((i + 0.5f) * scale)
idx[i] = Min(Max(index, 0), sizeS - 1) * colors;
}
}
template<int colors> inline void CopyPixel(const uint8_t* src, uint8_t* dst)
{
for (int i = 0; i < colors; ++i)
dst[i] = src[i];
}
template<int colors> void Resize(const uint8_t* src, int srcW, int srcH,
uint8_t* dst, int dstW, int dstH)
{
int idxY[dstH], idxX[dstW];//pre-calculated indices (see CalcIndex).
for (int dy = 0; dy < dstH; dy++)
{
const uint8_t * srcY = src + idxY[dy] * srcW * colors;
for (int dx = 0, offset = 0; dx < dstW; dx++, offset += colors)
CopyPixel<N>(srcY + idxX[dx], dst + offset);
dst += dstW * colors;
}
}
是否存在下一个优化步骤?例如使用 SIMD 或其他一些优化技术。
P.S。特别是我对 RGB (Colors = 3
) 的优化很感兴趣。
如果我使用当前代码,我会看到 ARGB 图像 (Colors = 4
) 的处理速度比 RGB 快 50%,尽管它的处理速度更大 30%。
我认为使用 _mm256_i32gather_epi32 (AVX2) 可以在调整 32 位像素的情况下提高一些性能:
inline void Gather32bit(const uint8_t * src, const int* idx, uint8_t* dst)
{
__m256i _idx = _mm256_loadu_si256((__m256i*)idx);
__m256i val = _mm256_i32gather_epi32((int*)src, _idx, 1);
_mm256_storeu_si256((__m256i*)dst, val);
}
template<> void Resize<4>(const uint8_t* src, int srcW, int srcH,
uint8_t* dst, int dstW, int dstH)
{
int idxY[dstH], idxX[dstW];//pre-calculated indices.
size_t dstW8 = dstW & (8 - 1);
for (int dy = 0; dy < dstH; dy++)
{
const uint8_t * srcY = src + idxY[dy] * srcW * 4;
int dx = 0, offset = 0;
for (; dx < dstW8; dx += 8, offset += 8*4)
Gather32bit(srcY, idxX + dx,dst + offset);
for (; dx < dstW; dx++, offset += 4)
CopyPixel<N>(srcY + idxX[dx], dst + offset);
dst += dstW * 4;
}
}
P.S。经过一些修改,此方法可以应用于 RGB24:
const __m256i K8_SHUFFLE = _mm256_setr_epi8(
0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1,
0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
const __m256i K32_PERMUTE = _mm256_setr_epi32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, -1, -1);
inline void Gather24bit(const uint8_t * src, const int* idx, uint8_t* dst)
{
__m256i _idx = _mm256_loadu_si256((__m256i*)idx);
__m256i bgrx = _mm256_i32gather_epi32((int*)src, _idx, 1);
__m256i bgr = _mm256_permutevar8x32_epi32(
_mm256_shuffle_epi8(bgrx, K8_SHUFFLE), K32_PERMUTE);
_mm256_storeu_si256((__m256i*)dst, bgr);
}
template<> void Resize<3>(const uint8_t* src, int srcW, int srcH,
uint8_t* dst, int dstW, int dstH)
{
int idxY[dstH], idxX[dstW];//pre-calculated indices.
size_t dstW8 = dstW & (8 - 1);
for (int dy = 0; dy < dstH; dy++)
{
const uint8_t * srcY = src + idxY[dy] * srcW * 3;
int dx = 0, offset = 0;
for (; dx < dstW8; dx += 8, offset += 8*3)
Gather24bit(srcY, idxX + dx,dst + offset);
for (; dx < dstW; dx++, offset += 3)
CopyPixel<3>(srcY + idxX[dx], dst + offset);
dst += dstW * 3;
}
}
请注意,如果 srcW < dstW
那么 @Aki-Suihkonen 的方法更快。
(基于 SIMD 的)调整大小算法中的速度问题来自索引输入和输出元素的不匹配。当例如调整大小因子为 6/5,需要消耗 6 个像素并写入 5 个像素。16 字节的 OTOH SIMD 寄存器宽度映射到 16 个灰度元素、4 个 RGBA 元素或 5.33 个 RGB 元素。
我的经验是,当尝试一次写入 2-4 个 SIMD 数据寄存器,读取所需数量的来自输入的线性字节 + 一些,并在 x86 SSSE3 中使用 pshufb
或在 Neon 中使用 vtbl
从寄存器收集负载——从不从内存收集负载。人们当然需要一种快速机制来内联计算 LUT 索引,或预先计算在不同输出行之间共享的索引。
根据(水平)分辨率的 input/output 比率,应该准备好几个内核。
RGBrgbRGBrgbRGBr|gbRGBrgb .... <- input
^ where to load next 32 bytes of input
RGBRGBrgbRGBrgbr|gbRGBrgbRGBRGBrg| <- 32 output bytes, from
0000000000000000|0000001111111111| <- high bit of index
0120123456789ab9|abcdef0123423456| <- low 4 bits of index
注意,可以用 LUT 方法处理所有通道数
// inner kernel for downsampling between 1x and almost 2x*
// - we need to read max 32 elements and write 16
void process_row_ds(uint8_t const *input, uint8_t const *indices,
int const *advances, uint8_t *output, int out_width) {
do {
auto a = load16_bytes(input);
auto b = load16_bytes(input + 16);
auto c = load16_bytes(indices);
a = lut32(a,b,c); // get 16 bytes out of 32
store16_bytes(output, a);
output += 16;
input += *advances++;
} while (out_width--); // multiples of 16...
}
// inner kernel for upsampling between 1x and inf
void process_row_us(uint8_t const *input, uint8_t const *indices,
int const *advances, uint8_t *output, int out_width) {
do {
auto a = load16_bytes(input);
auto c = load16_bytes(indices);
a = lut16(a, c); // get 16 bytes out of 16
store16_bytes(output, a);
output += 16;
input += *advances++;
} while (out_width--);
}
除了(至少)双线性插值之外,我还鼓励使用一些基本滤波进行下采样,例如高斯二项式内核 (1 1, 1 2 1, 1 3 3 1, 1 4 6 4 1, ...)
以及分层下采样。应用程序当然有可能容忍混叠伪像——AFAIK 的成本通常不会那么大,特别是考虑到否则算法将受内存限制。
可以使用 SIMD,我很确定它会有所帮助,不幸的是它相对困难。下面是一个简单的例子,只支持图片放大,不支持缩小。
不过,我希望它可以作为一个有用的起点。
MSVC和GCC都将LineResize::apply
方法中的热循环编译成11条指令。我认为 16 字节的 11 条指令应该比你的版本快。
#include <stdint.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <vector>
#include <array>
#include <assert.h>
#include <stdio.h>
// Implements nearest neighbor resize method for RGB24 or BGR24 bitmaps
class LineResize
{
// Each mask produces up to 16 output bytes.
// For enlargement exactly 16, for shrinking up to 16, possibly even 0.
std::vector<__m128i> masks;
// Length is the same as masks.
// For enlargement, the values contain source pointer offsets in bytes.
// For shrinking, the values contain destination pointer offsets in bytes.
std::vector<uint8_t> offsets;
// True if this class will enlarge images, false if it will shrink the width of the images.
bool enlargement;
void resizeFields( size_t vectors )
{
masks.resize( vectors, _mm_set1_epi32( -1 ) );
offsets.resize( vectors, 0 );
}
public:
// Compile the shuffle table. The arguments are line widths in pixels.
LineResize( size_t source, size_t dest );
// Apply the algorithm to a single line of the image.
void apply( uint8_t* rdi, const uint8_t* rsi ) const;
};
LineResize::LineResize( size_t source, size_t dest )
{
const size_t sourceBytes = source * 3;
const size_t destBytes = dest * 3;
assert( sourceBytes >= 16 );
assert( destBytes >= 16 );
// Possible to do much faster without any integer divides.
// Optimizing this sample for simplicity.
if( sourceBytes < destBytes )
{
// Enlarging the image, each SIMD vector consumes <16 input bytes, produces exactly 16 output bytes
enlargement = true;
resizeFields( ( destBytes + 15 ) / 16 );
int8_t* pMasks = (int8_t*)masks.data();
uint8_t* const pOffsets = offsets.data();
int sourceOffset = 0;
const size_t countVectors = masks.size();
for( size_t i = 0; i < countVectors; i++ )
{
const int destSlice = (int)i * 16;
std::array<int, 16> lanes;
int lane;
for( lane = 0; lane < 16; lane++ )
{
const int destByte = destSlice + lane; // output byte index
const int destPixel = destByte / 3; // output pixel index
const int channel = destByte % 3; // output byte within pixel
const int sourcePixel = destPixel * (int)source / (int)dest; // input pixel
const int sourceByte = sourcePixel * 3 + channel; // input byte
if( destByte < (int)destBytes )
lanes[ lane ] = sourceByte;
else
{
// Destination offset out of range, i.e. the last SIMD vector
break;
}
}
// Produce the offset
if( i == 0 )
assert( lanes[ 0 ] == 0 );
else
{
const int off = lanes[ 0 ] - sourceOffset;
assert( off >= 0 && off <= 16 );
pOffsets[ i - 1 ] = (uint8_t)off;
sourceOffset = lanes[ 0 ];
}
// Produce the masks
for( int j = 0; j < lane; j++ )
pMasks[ j ] = (int8_t)( lanes[ j ] - sourceOffset );
// The masks are initialized with _mm_set1_epi32( -1 ) = all bits set,
// no need to handle remainder for the last vector.
pMasks += 16;
}
}
else
{
// Shrinking the image, each SIMD vector consumes 16 input bytes, produces <16 output bytes
enlargement = false;
resizeFields( ( sourceBytes + 15 ) / 16 );
// Not implemented, but the same idea works fine for this too.
// The only difference, instead of using offsets bytes for source offsets, use it for destination offsets.
assert( false );
}
}
void LineResize::apply( uint8_t * rdi, const uint8_t * rsi ) const
{
const __m128i* pm = masks.data();
const __m128i* const pmEnd = pm + masks.size();
const uint8_t* po = offsets.data();
__m128i mask, source;
if( enlargement )
{
// One iteration of the loop produces 16 output bytes
// In MSVC results in 11 instructions for 16 output bytes.
while( pm < pmEnd )
{
mask = _mm_load_si128( pm );
pm++;
source = _mm_loadu_si128( ( const __m128i * )( rsi ) );
rsi += *po;
po++;
_mm_storeu_si128( ( __m128i * )rdi, _mm_shuffle_epi8( source, mask ) );
rdi += 16;
}
}
else
{
// One iteration of the loop consumes 16 input bytes
while( pm < pmEnd )
{
mask = _mm_load_si128( pm );
pm++;
source = _mm_loadu_si128( ( const __m128i * )( rsi ) );
rsi += 16;
_mm_storeu_si128( ( __m128i * )rdi, _mm_shuffle_epi8( source, mask ) );
rdi += *po;
po++;
}
}
}
// Utility method to print RGB pixel values from the vector
static void printPixels( const std::vector<uint8_t>&vec )
{
assert( !vec.empty() );
assert( 0 == ( vec.size() % 3 ) );
const uint8_t* rsi = vec.data();
const uint8_t* const rsiEnd = rsi + vec.size();
while( rsi < rsiEnd )
{
const uint32_t r = rsi[ 0 ];
const uint32_t g = rsi[ 1 ];
const uint32_t b = rsi[ 2 ];
rsi += 3;
const uint32_t res = ( r << 16 ) | ( g << 8 ) | b;
printf( "%06X ", res );
}
printf( "\n" );
}
// A triviual test to resize 24 pixels -> 32 pixels
int main()
{
constexpr int sourceLength = 24;
constexpr int destLength = 32;
// Initialize sample input with 24 RGB pixels
std::vector<uint8_t> input( sourceLength * 3 );
for( size_t i = 0; i < input.size(); i++ )
input[ i ] = (uint8_t)i;
printf( "Input: " );
printPixels( input );
// That special handling of the last pixels of last line is missing from this example.
static_assert( 0 == destLength % 16 );
LineResize resizer( sourceLength, destLength );
std::vector<uint8_t> result( destLength * 3 );
resizer.apply( result.data(), input.data() );
printf( "Output: " );
printPixels( result );
return 0;
}
代码忽略对齐问题。对于生产,您需要另一种方法来处理图像的最后一行,它不是 运行 结尾,而是使用标量代码处理最后几个像素。
代码在热循环中包含更多的内存引用。然而,class中的两个向量并不太长,对于4k图像大小约为12kb,应该适合L1D缓存并留在那里。
如果您有 AVX2,可能会进一步改进。对于放大图像,使用_mm256_inserti128_si256
,vinserti128
指令可以从内存中加载16个字节到向量的高半部分。同样,对于缩小图像尺寸,请使用 _mm256_extracti128_si256
,该指令有一个使用内存目标的选项。
我知道 'Nearest' 调整图像大小的方法是最快的方法。 尽管如此,我还是在寻找加速它的方法。 明显的步骤是预先计算索引:
void CalcIndex(int sizeS, int sizeD, int colors, int* idx)
{
float scale = (float)sizeS / sizeD;
for (size_t i = 0; i < sizeD; ++i)
{
int index = (int)::floor((i + 0.5f) * scale)
idx[i] = Min(Max(index, 0), sizeS - 1) * colors;
}
}
template<int colors> inline void CopyPixel(const uint8_t* src, uint8_t* dst)
{
for (int i = 0; i < colors; ++i)
dst[i] = src[i];
}
template<int colors> void Resize(const uint8_t* src, int srcW, int srcH,
uint8_t* dst, int dstW, int dstH)
{
int idxY[dstH], idxX[dstW];//pre-calculated indices (see CalcIndex).
for (int dy = 0; dy < dstH; dy++)
{
const uint8_t * srcY = src + idxY[dy] * srcW * colors;
for (int dx = 0, offset = 0; dx < dstW; dx++, offset += colors)
CopyPixel<N>(srcY + idxX[dx], dst + offset);
dst += dstW * colors;
}
}
是否存在下一个优化步骤?例如使用 SIMD 或其他一些优化技术。
P.S。特别是我对 RGB (Colors = 3
) 的优化很感兴趣。
如果我使用当前代码,我会看到 ARGB 图像 (Colors = 4
) 的处理速度比 RGB 快 50%,尽管它的处理速度更大 30%。
我认为使用 _mm256_i32gather_epi32 (AVX2) 可以在调整 32 位像素的情况下提高一些性能:
inline void Gather32bit(const uint8_t * src, const int* idx, uint8_t* dst)
{
__m256i _idx = _mm256_loadu_si256((__m256i*)idx);
__m256i val = _mm256_i32gather_epi32((int*)src, _idx, 1);
_mm256_storeu_si256((__m256i*)dst, val);
}
template<> void Resize<4>(const uint8_t* src, int srcW, int srcH,
uint8_t* dst, int dstW, int dstH)
{
int idxY[dstH], idxX[dstW];//pre-calculated indices.
size_t dstW8 = dstW & (8 - 1);
for (int dy = 0; dy < dstH; dy++)
{
const uint8_t * srcY = src + idxY[dy] * srcW * 4;
int dx = 0, offset = 0;
for (; dx < dstW8; dx += 8, offset += 8*4)
Gather32bit(srcY, idxX + dx,dst + offset);
for (; dx < dstW; dx++, offset += 4)
CopyPixel<N>(srcY + idxX[dx], dst + offset);
dst += dstW * 4;
}
}
P.S。经过一些修改,此方法可以应用于 RGB24:
const __m256i K8_SHUFFLE = _mm256_setr_epi8(
0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1,
0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
const __m256i K32_PERMUTE = _mm256_setr_epi32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, -1, -1);
inline void Gather24bit(const uint8_t * src, const int* idx, uint8_t* dst)
{
__m256i _idx = _mm256_loadu_si256((__m256i*)idx);
__m256i bgrx = _mm256_i32gather_epi32((int*)src, _idx, 1);
__m256i bgr = _mm256_permutevar8x32_epi32(
_mm256_shuffle_epi8(bgrx, K8_SHUFFLE), K32_PERMUTE);
_mm256_storeu_si256((__m256i*)dst, bgr);
}
template<> void Resize<3>(const uint8_t* src, int srcW, int srcH,
uint8_t* dst, int dstW, int dstH)
{
int idxY[dstH], idxX[dstW];//pre-calculated indices.
size_t dstW8 = dstW & (8 - 1);
for (int dy = 0; dy < dstH; dy++)
{
const uint8_t * srcY = src + idxY[dy] * srcW * 3;
int dx = 0, offset = 0;
for (; dx < dstW8; dx += 8, offset += 8*3)
Gather24bit(srcY, idxX + dx,dst + offset);
for (; dx < dstW; dx++, offset += 3)
CopyPixel<3>(srcY + idxX[dx], dst + offset);
dst += dstW * 3;
}
}
请注意,如果 srcW < dstW
那么 @Aki-Suihkonen 的方法更快。
(基于 SIMD 的)调整大小算法中的速度问题来自索引输入和输出元素的不匹配。当例如调整大小因子为 6/5,需要消耗 6 个像素并写入 5 个像素。16 字节的 OTOH SIMD 寄存器宽度映射到 16 个灰度元素、4 个 RGBA 元素或 5.33 个 RGB 元素。
我的经验是,当尝试一次写入 2-4 个 SIMD 数据寄存器,读取所需数量的来自输入的线性字节 + 一些,并在 x86 SSSE3 中使用 pshufb
或在 Neon 中使用 vtbl
从寄存器收集负载——从不从内存收集负载。人们当然需要一种快速机制来内联计算 LUT 索引,或预先计算在不同输出行之间共享的索引。
根据(水平)分辨率的 input/output 比率,应该准备好几个内核。
RGBrgbRGBrgbRGBr|gbRGBrgb .... <- input
^ where to load next 32 bytes of input
RGBRGBrgbRGBrgbr|gbRGBrgbRGBRGBrg| <- 32 output bytes, from
0000000000000000|0000001111111111| <- high bit of index
0120123456789ab9|abcdef0123423456| <- low 4 bits of index
注意,可以用 LUT 方法处理所有通道数
// inner kernel for downsampling between 1x and almost 2x*
// - we need to read max 32 elements and write 16
void process_row_ds(uint8_t const *input, uint8_t const *indices,
int const *advances, uint8_t *output, int out_width) {
do {
auto a = load16_bytes(input);
auto b = load16_bytes(input + 16);
auto c = load16_bytes(indices);
a = lut32(a,b,c); // get 16 bytes out of 32
store16_bytes(output, a);
output += 16;
input += *advances++;
} while (out_width--); // multiples of 16...
}
// inner kernel for upsampling between 1x and inf
void process_row_us(uint8_t const *input, uint8_t const *indices,
int const *advances, uint8_t *output, int out_width) {
do {
auto a = load16_bytes(input);
auto c = load16_bytes(indices);
a = lut16(a, c); // get 16 bytes out of 16
store16_bytes(output, a);
output += 16;
input += *advances++;
} while (out_width--);
}
除了(至少)双线性插值之外,我还鼓励使用一些基本滤波进行下采样,例如高斯二项式内核 (1 1, 1 2 1, 1 3 3 1, 1 4 6 4 1, ...)
以及分层下采样。应用程序当然有可能容忍混叠伪像——AFAIK 的成本通常不会那么大,特别是考虑到否则算法将受内存限制。
可以使用 SIMD,我很确定它会有所帮助,不幸的是它相对困难。下面是一个简单的例子,只支持图片放大,不支持缩小。
不过,我希望它可以作为一个有用的起点。
MSVC和GCC都将LineResize::apply
方法中的热循环编译成11条指令。我认为 16 字节的 11 条指令应该比你的版本快。
#include <stdint.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <vector>
#include <array>
#include <assert.h>
#include <stdio.h>
// Implements nearest neighbor resize method for RGB24 or BGR24 bitmaps
class LineResize
{
// Each mask produces up to 16 output bytes.
// For enlargement exactly 16, for shrinking up to 16, possibly even 0.
std::vector<__m128i> masks;
// Length is the same as masks.
// For enlargement, the values contain source pointer offsets in bytes.
// For shrinking, the values contain destination pointer offsets in bytes.
std::vector<uint8_t> offsets;
// True if this class will enlarge images, false if it will shrink the width of the images.
bool enlargement;
void resizeFields( size_t vectors )
{
masks.resize( vectors, _mm_set1_epi32( -1 ) );
offsets.resize( vectors, 0 );
}
public:
// Compile the shuffle table. The arguments are line widths in pixels.
LineResize( size_t source, size_t dest );
// Apply the algorithm to a single line of the image.
void apply( uint8_t* rdi, const uint8_t* rsi ) const;
};
LineResize::LineResize( size_t source, size_t dest )
{
const size_t sourceBytes = source * 3;
const size_t destBytes = dest * 3;
assert( sourceBytes >= 16 );
assert( destBytes >= 16 );
// Possible to do much faster without any integer divides.
// Optimizing this sample for simplicity.
if( sourceBytes < destBytes )
{
// Enlarging the image, each SIMD vector consumes <16 input bytes, produces exactly 16 output bytes
enlargement = true;
resizeFields( ( destBytes + 15 ) / 16 );
int8_t* pMasks = (int8_t*)masks.data();
uint8_t* const pOffsets = offsets.data();
int sourceOffset = 0;
const size_t countVectors = masks.size();
for( size_t i = 0; i < countVectors; i++ )
{
const int destSlice = (int)i * 16;
std::array<int, 16> lanes;
int lane;
for( lane = 0; lane < 16; lane++ )
{
const int destByte = destSlice + lane; // output byte index
const int destPixel = destByte / 3; // output pixel index
const int channel = destByte % 3; // output byte within pixel
const int sourcePixel = destPixel * (int)source / (int)dest; // input pixel
const int sourceByte = sourcePixel * 3 + channel; // input byte
if( destByte < (int)destBytes )
lanes[ lane ] = sourceByte;
else
{
// Destination offset out of range, i.e. the last SIMD vector
break;
}
}
// Produce the offset
if( i == 0 )
assert( lanes[ 0 ] == 0 );
else
{
const int off = lanes[ 0 ] - sourceOffset;
assert( off >= 0 && off <= 16 );
pOffsets[ i - 1 ] = (uint8_t)off;
sourceOffset = lanes[ 0 ];
}
// Produce the masks
for( int j = 0; j < lane; j++ )
pMasks[ j ] = (int8_t)( lanes[ j ] - sourceOffset );
// The masks are initialized with _mm_set1_epi32( -1 ) = all bits set,
// no need to handle remainder for the last vector.
pMasks += 16;
}
}
else
{
// Shrinking the image, each SIMD vector consumes 16 input bytes, produces <16 output bytes
enlargement = false;
resizeFields( ( sourceBytes + 15 ) / 16 );
// Not implemented, but the same idea works fine for this too.
// The only difference, instead of using offsets bytes for source offsets, use it for destination offsets.
assert( false );
}
}
void LineResize::apply( uint8_t * rdi, const uint8_t * rsi ) const
{
const __m128i* pm = masks.data();
const __m128i* const pmEnd = pm + masks.size();
const uint8_t* po = offsets.data();
__m128i mask, source;
if( enlargement )
{
// One iteration of the loop produces 16 output bytes
// In MSVC results in 11 instructions for 16 output bytes.
while( pm < pmEnd )
{
mask = _mm_load_si128( pm );
pm++;
source = _mm_loadu_si128( ( const __m128i * )( rsi ) );
rsi += *po;
po++;
_mm_storeu_si128( ( __m128i * )rdi, _mm_shuffle_epi8( source, mask ) );
rdi += 16;
}
}
else
{
// One iteration of the loop consumes 16 input bytes
while( pm < pmEnd )
{
mask = _mm_load_si128( pm );
pm++;
source = _mm_loadu_si128( ( const __m128i * )( rsi ) );
rsi += 16;
_mm_storeu_si128( ( __m128i * )rdi, _mm_shuffle_epi8( source, mask ) );
rdi += *po;
po++;
}
}
}
// Utility method to print RGB pixel values from the vector
static void printPixels( const std::vector<uint8_t>&vec )
{
assert( !vec.empty() );
assert( 0 == ( vec.size() % 3 ) );
const uint8_t* rsi = vec.data();
const uint8_t* const rsiEnd = rsi + vec.size();
while( rsi < rsiEnd )
{
const uint32_t r = rsi[ 0 ];
const uint32_t g = rsi[ 1 ];
const uint32_t b = rsi[ 2 ];
rsi += 3;
const uint32_t res = ( r << 16 ) | ( g << 8 ) | b;
printf( "%06X ", res );
}
printf( "\n" );
}
// A triviual test to resize 24 pixels -> 32 pixels
int main()
{
constexpr int sourceLength = 24;
constexpr int destLength = 32;
// Initialize sample input with 24 RGB pixels
std::vector<uint8_t> input( sourceLength * 3 );
for( size_t i = 0; i < input.size(); i++ )
input[ i ] = (uint8_t)i;
printf( "Input: " );
printPixels( input );
// That special handling of the last pixels of last line is missing from this example.
static_assert( 0 == destLength % 16 );
LineResize resizer( sourceLength, destLength );
std::vector<uint8_t> result( destLength * 3 );
resizer.apply( result.data(), input.data() );
printf( "Output: " );
printPixels( result );
return 0;
}
代码忽略对齐问题。对于生产,您需要另一种方法来处理图像的最后一行,它不是 运行 结尾,而是使用标量代码处理最后几个像素。
代码在热循环中包含更多的内存引用。然而,class中的两个向量并不太长,对于4k图像大小约为12kb,应该适合L1D缓存并留在那里。
如果您有 AVX2,可能会进一步改进。对于放大图像,使用_mm256_inserti128_si256
,vinserti128
指令可以从内存中加载16个字节到向量的高半部分。同样,对于缩小图像尺寸,请使用 _mm256_extracti128_si256
,该指令有一个使用内存目标的选项。