如何使用 Altivec 将向量存储到内存中未对齐的位置
How to store a vector to an unaligned location in memory with Altivec
我从 tutorial 了解到,未对齐的加载和存储看起来像:
// Load a 16-byte vector from an address that need not be 16-byte aligned.
//
// vec_ld ignores the low four bits of the effective address, so two aligned
// loads fetch the blocks that straddle `src`; vec_perm with the shift pattern
// from vec_lvsl then extracts the 16 bytes that start at `src`.
__vector unsigned char LoadUnaligned(const unsigned char * src )
{
    __vector unsigned char permuteVector = vec_lvsl(0, src);
    __vector unsigned char low = vec_ld(0, src);
    // Offset 15 rather than 16: for a misaligned `src` this still addresses
    // the following aligned block, but for an aligned `src` it re-reads the
    // same block instead of touching 16 bytes beyond the requested data.
    __vector unsigned char high = vec_ld(15, src);
    return vec_perm(low, high, permuteVector);
}
// Store a 16-byte vector to an address that need not be 16-byte aligned.
//
// The aligned blocks that straddle `dst` are read, the bytes of `v` are merged
// into them, and the blocks are written back. Bytes of those blocks that lie
// outside [dst, dst + 16) are rewritten with the values that were read.
void StoreUnaligned(__vector unsigned char v, __vector unsigned char * dst)
{
    // Load the surrounding area. Offset 15 (not 16) makes `high` refer to the
    // same block as `low` when `dst` is already aligned, so nothing past the
    // 16 target bytes is accessed in that case.
    __vector unsigned char low = vec_ld(0, dst);
    __vector unsigned char high = vec_ld(15, dst);
    // Prepare the constants that we need (cast to unsigned so that the
    // vec_perm result below has the same type as `mask`).
    __vector unsigned char permuteVector = vec_lvsr(0, (int*) dst);
    __vector unsigned char oxFF = (__vector unsigned char) vec_splat_s8(-1);
    __vector unsigned char ox00 = (__vector unsigned char) vec_splat_s8(0);
    // Make a mask for which parts of the vectors to swap out: 0x00 in the
    // leading (dst & 15) bytes, 0xFF in the remaining bytes.
    __vector unsigned char mask = vec_perm(ox00, oxFF, permuteVector);
    // Right rotate our input data
    v = vec_perm(v, v, permuteVector);
    // Insert our data into the low and high vectors. vec_sel(a, b, m) takes
    // bits from b where m is set and from a where m is clear, so the low block
    // keeps its leading bytes and the high block keeps its trailing bytes.
    low = vec_sel(low, v, mask);
    high = vec_sel(v, high, mask);
    // Store the two aligned result vectors. `high` goes first: when `dst` is
    // aligned both stores hit the same block and `low` (== v) must win.
    vec_st(high, 15, dst);
    vec_st(low, 0, dst);
}
看起来很糟糕。为了存储一个向量需要做大量的工作!
而且会带来相应的性能损失。
void SomeFuncA(const unsigned char * src, size_t size, unsigned char * dst)
{
for(size_t i = 0; i < size; i += 16)
{
__vector unsigned char a = vec_ld(0, src + i);
//simple work
vec_st(a, 0, dst + i);
}
}
void SomeFuncU(const unsigned char * src, size_t size, unsigned char * dst)
{
for(size_t i = 0; i < size; i += 16)
{
__vector unsigned char a = LoadUnaligned(src + i);
//simple work
StoreUnaligned(dst + i, a);
}
}
第二个函数比第一个慢3-4倍。
由于我无法控制输入和输出内存的对齐,因此我必须实现这两个版本。
如何最大程度地减少未对齐情况下的性能损失?
首先我想指出,如果你需要连续多次把 Altivec 向量保存到未对齐的内存中,那么在数组的中间部分无需保留内存中原有的内容,只需要在数组的开头和结尾处这样做。
所以在 Simd Library 中有一个有用的辅助类,它实现了这个功能:
// 128-bit AltiVec vector holding sixteen unsigned bytes.
typedef __vector uint8_t v128_u8;
// Byte vectors of all zeros / all ones, used to build select masks.
const v128_u8 K8_00 = vec_splat_u8(0x00);
// The splat-immediate intrinsics take a 5-bit signed literal (-16..15), so
// 0xFF is out of range; splat -1 as signed bytes and reinterpret instead.
const v128_u8 K8_FF = (v128_u8)vec_splat_s8(-1);
// Loads 16 bytes starting at p; the template argument selects the aligned
// (true) or unaligned (false) specialization.
template <bool align> inline v128_u8 Load(const uint8_t * p);
// Unaligned load: fetches the aligned blocks straddling p and uses the
// vec_lvsl shift pattern to extract the 16 bytes that start at p.
template <> inline v128_u8 Load<false>(const uint8_t * p)
{
    v128_u8 lo = vec_ld(0, p);
    // Offset 15 rather than 16: same block as offset 16 whenever p is
    // misaligned, but no access past the data when p happens to be aligned.
    v128_u8 hi = vec_ld(15, p);
    return vec_perm(lo, hi, vec_lvsl(0, p));
}
// Aligned load: a single vec_ld at offset 0 from p.
template <> inline v128_u8 Load<true>(const uint8_t * p)
{
    const v128_u8 block = vec_ld(0, p);
    return block;
}
// Writes a sequence of 16-byte vectors to consecutive 16-byte positions.
// The template argument selects the aligned or the unaligned implementation.
template <bool align> struct Storer;

// Aligned implementation: each vector is written with a single vec_st.
template <> struct Storer<true>
{
    // Remembers the output position as a byte pointer.
    template <class T> Storer(T * ptr)
        : _out((uint8_t*)ptr)
    {
    }

    // Writes the first vector at the current output position.
    template <class T> inline void First(T value)
    {
        vec_st((v128_u8)value, 0, _out);
    }

    // Moves the output position forward by 16 bytes and writes there.
    template <class T> inline void Next(T value)
    {
        uint8_t * const next = _out + 16;
        vec_st((v128_u8)value, 0, next);
        _out = next;
    }

    // Nothing is pending in the aligned case.
    inline void Flush()
    {
    }

private:
    uint8_t * _out; // current output position
};
// Unaligned implementation: streams vectors to a destination of arbitrary
// alignment. Only First() and Flush() merge with existing memory contents;
// Next() combines the previous and the current vector with one vec_perm and
// one aligned store.
template <> struct Storer<false>
{
    // Captures the destination and derives, from its misalignment, the
    // right-shift permute pattern and the matching select mask (0x00 in the
    // leading misaligned bytes, 0xFF in the rest).
    template <class T> inline Storer(T * ptr)
        :_ptr((uint8_t*)ptr)
    {
        _perm = vec_lvsr(0, _ptr);
        _mask = vec_perm(K8_00, K8_FF, _perm);
    }

    // Writes the head: the aligned block containing _ptr keeps its leading
    // bytes and receives the beginning of `value`.
    template <class T> inline void First(T value)
    {
        _last = (v128_u8)value;
        v128_u8 background = vec_ld(0, _ptr);
        v128_u8 foreground = vec_perm(_last, _last, _perm);
        vec_st(vec_sel(background, foreground, _mask), 0, _ptr);
    }

    // Writes one full aligned block made of the tail of the previous vector
    // followed by the beginning of `value`.
    template <class T> inline void Next(T value)
    {
        _ptr += 16;
        vec_st(vec_perm(_last, (v128_u8)value, _perm), 0, _ptr);
        _last = (v128_u8)value;
    }

    // Writes the tail of the last vector into the following aligned block,
    // keeping that block's remaining bytes.
    inline void Flush()
    {
        // Offset 15 rather than 16: identical for a misaligned _ptr, but for
        // an aligned _ptr it addresses the block that was just written (and
        // rewrites it unchanged) instead of touching memory past the output.
        v128_u8 background = vec_ld(15, _ptr);
        v128_u8 foreground = vec_perm(_last, _last, _perm);
        vec_st(vec_sel(foreground, background, _mask), 15, _ptr);
    }

private:
    uint8_t * _ptr;  // position of the most recently started 16-byte slot
    v128_u8 _perm;   // vec_lvsr pattern for the destination misalignment
    v128_u8 _mask;   // select mask derived from _perm
    v128_u8 _last;   // most recent vector, whose tail is still unwritten
};
它的用法如下:
template<bool align> void SomeFunc(const unsigned char * src, size_t size, unsigned char * dst)
{
Storer<align> _dst(dst);
__vector unsigned char a = Load<align>(src);
//simple work
_dst.First(a);// save first block
for(size_t i = 16; i < size; i += 16)
{
__vector unsigned char a = Load<align>(src + i);
//simple work
_dst.Next(a);// save body
}
_dst.Flush(); // save tail
}
与对齐版本相比,性能损失将达到 30-40%。
这当然令人不快,但可以容忍。
额外的优势是减少代码 - 所有函数(对齐和未对齐)都具有相同的实现。
我从 tutorial 了解到,未对齐的加载和存储看起来像:
// Load a 16-byte vector from an address that need not be 16-byte aligned.
//
// vec_ld ignores the low four bits of the effective address, so two aligned
// loads fetch the blocks that straddle `src`; vec_perm with the shift pattern
// from vec_lvsl then extracts the 16 bytes that start at `src`.
__vector unsigned char LoadUnaligned(const unsigned char * src )
{
    __vector unsigned char permuteVector = vec_lvsl(0, src);
    __vector unsigned char low = vec_ld(0, src);
    // Offset 15 rather than 16: for a misaligned `src` this still addresses
    // the following aligned block, but for an aligned `src` it re-reads the
    // same block instead of touching 16 bytes beyond the requested data.
    __vector unsigned char high = vec_ld(15, src);
    return vec_perm(low, high, permuteVector);
}
// Store a 16-byte vector to an address that need not be 16-byte aligned.
//
// The aligned blocks that straddle `dst` are read, the bytes of `v` are merged
// into them, and the blocks are written back. Bytes of those blocks that lie
// outside [dst, dst + 16) are rewritten with the values that were read.
void StoreUnaligned(__vector unsigned char v, __vector unsigned char * dst)
{
    // Load the surrounding area. Offset 15 (not 16) makes `high` refer to the
    // same block as `low` when `dst` is already aligned, so nothing past the
    // 16 target bytes is accessed in that case.
    __vector unsigned char low = vec_ld(0, dst);
    __vector unsigned char high = vec_ld(15, dst);
    // Prepare the constants that we need (cast to unsigned so that the
    // vec_perm result below has the same type as `mask`).
    __vector unsigned char permuteVector = vec_lvsr(0, (int*) dst);
    __vector unsigned char oxFF = (__vector unsigned char) vec_splat_s8(-1);
    __vector unsigned char ox00 = (__vector unsigned char) vec_splat_s8(0);
    // Make a mask for which parts of the vectors to swap out: 0x00 in the
    // leading (dst & 15) bytes, 0xFF in the remaining bytes.
    __vector unsigned char mask = vec_perm(ox00, oxFF, permuteVector);
    // Right rotate our input data
    v = vec_perm(v, v, permuteVector);
    // Insert our data into the low and high vectors. vec_sel(a, b, m) takes
    // bits from b where m is set and from a where m is clear, so the low block
    // keeps its leading bytes and the high block keeps its trailing bytes.
    low = vec_sel(low, v, mask);
    high = vec_sel(v, high, mask);
    // Store the two aligned result vectors. `high` goes first: when `dst` is
    // aligned both stores hit the same block and `low` (== v) must win.
    vec_st(high, 15, dst);
    vec_st(low, 0, dst);
}
看起来很糟糕。为了存储一个向量需要做大量的工作!而且会带来相应的性能损失。
void SomeFuncA(const unsigned char * src, size_t size, unsigned char * dst)
{
for(size_t i = 0; i < size; i += 16)
{
__vector unsigned char a = vec_ld(0, src + i);
//simple work
vec_st(a, 0, dst + i);
}
}
void SomeFuncU(const unsigned char * src, size_t size, unsigned char * dst)
{
for(size_t i = 0; i < size; i += 16)
{
__vector unsigned char a = LoadUnaligned(src + i);
//simple work
StoreUnaligned(dst + i, a);
}
}
第二个函数比第一个慢3-4倍。 由于我无法控制输入和输出内存的对齐,因此我必须实现这两个版本。 如何最大程度地减少未对齐情况下的性能损失?
首先我想指出,如果你需要连续多次把 Altivec 向量保存到未对齐的内存中,那么在数组的中间部分无需保留内存中原有的内容,只需要在数组的开头和结尾处这样做。 所以在 Simd Library 中有一个有用的辅助类,它实现了这个功能:
// 128-bit AltiVec vector holding sixteen unsigned bytes.
typedef __vector uint8_t v128_u8;
// Byte vectors of all zeros / all ones, used to build select masks.
const v128_u8 K8_00 = vec_splat_u8(0x00);
// The splat-immediate intrinsics take a 5-bit signed literal (-16..15), so
// 0xFF is out of range; splat -1 as signed bytes and reinterpret instead.
const v128_u8 K8_FF = (v128_u8)vec_splat_s8(-1);
// Loads 16 bytes starting at p; the template argument selects the aligned
// (true) or unaligned (false) specialization.
template <bool align> inline v128_u8 Load(const uint8_t * p);
// Unaligned load: fetches the aligned blocks straddling p and uses the
// vec_lvsl shift pattern to extract the 16 bytes that start at p.
template <> inline v128_u8 Load<false>(const uint8_t * p)
{
    v128_u8 lo = vec_ld(0, p);
    // Offset 15 rather than 16: same block as offset 16 whenever p is
    // misaligned, but no access past the data when p happens to be aligned.
    v128_u8 hi = vec_ld(15, p);
    return vec_perm(lo, hi, vec_lvsl(0, p));
}
// Aligned load: a single vec_ld at offset 0 from p.
template <> inline v128_u8 Load<true>(const uint8_t * p)
{
    const v128_u8 block = vec_ld(0, p);
    return block;
}
// Writes a sequence of 16-byte vectors to consecutive 16-byte positions.
// The template argument selects the aligned or the unaligned implementation.
template <bool align> struct Storer;

// Aligned implementation: each vector is written with a single vec_st.
template <> struct Storer<true>
{
    // Remembers the output position as a byte pointer.
    template <class T> Storer(T * ptr)
        : _out((uint8_t*)ptr)
    {
    }

    // Writes the first vector at the current output position.
    template <class T> inline void First(T value)
    {
        vec_st((v128_u8)value, 0, _out);
    }

    // Moves the output position forward by 16 bytes and writes there.
    template <class T> inline void Next(T value)
    {
        uint8_t * const next = _out + 16;
        vec_st((v128_u8)value, 0, next);
        _out = next;
    }

    // Nothing is pending in the aligned case.
    inline void Flush()
    {
    }

private:
    uint8_t * _out; // current output position
};
// Unaligned implementation: streams vectors to a destination of arbitrary
// alignment. Only First() and Flush() merge with existing memory contents;
// Next() combines the previous and the current vector with one vec_perm and
// one aligned store.
template <> struct Storer<false>
{
    // Captures the destination and derives, from its misalignment, the
    // right-shift permute pattern and the matching select mask (0x00 in the
    // leading misaligned bytes, 0xFF in the rest).
    template <class T> inline Storer(T * ptr)
        :_ptr((uint8_t*)ptr)
    {
        _perm = vec_lvsr(0, _ptr);
        _mask = vec_perm(K8_00, K8_FF, _perm);
    }

    // Writes the head: the aligned block containing _ptr keeps its leading
    // bytes and receives the beginning of `value`.
    template <class T> inline void First(T value)
    {
        _last = (v128_u8)value;
        v128_u8 background = vec_ld(0, _ptr);
        v128_u8 foreground = vec_perm(_last, _last, _perm);
        vec_st(vec_sel(background, foreground, _mask), 0, _ptr);
    }

    // Writes one full aligned block made of the tail of the previous vector
    // followed by the beginning of `value`.
    template <class T> inline void Next(T value)
    {
        _ptr += 16;
        vec_st(vec_perm(_last, (v128_u8)value, _perm), 0, _ptr);
        _last = (v128_u8)value;
    }

    // Writes the tail of the last vector into the following aligned block,
    // keeping that block's remaining bytes.
    inline void Flush()
    {
        // Offset 15 rather than 16: identical for a misaligned _ptr, but for
        // an aligned _ptr it addresses the block that was just written (and
        // rewrites it unchanged) instead of touching memory past the output.
        v128_u8 background = vec_ld(15, _ptr);
        v128_u8 foreground = vec_perm(_last, _last, _perm);
        vec_st(vec_sel(foreground, background, _mask), 15, _ptr);
    }

private:
    uint8_t * _ptr;  // position of the most recently started 16-byte slot
    v128_u8 _perm;   // vec_lvsr pattern for the destination misalignment
    v128_u8 _mask;   // select mask derived from _perm
    v128_u8 _last;   // most recent vector, whose tail is still unwritten
};
它的用法如下:
template<bool align> void SomeFunc(const unsigned char * src, size_t size, unsigned char * dst)
{
Storer<align> _dst(dst);
__vector unsigned char a = Load<align>(src);
//simple work
_dst.First(a);// save first block
for(size_t i = 16; i < size; i += 16)
{
__vector unsigned char a = Load<align>(src + i);
//simple work
_dst.Next(a);// save body
}
_dst.Flush(); // save tail
}
与对齐版本相比,性能损失将达到 30-40%。 这当然令人不快,但可以容忍。
额外的优势是减少代码 - 所有函数(对齐和未对齐)都具有相同的实现。