(Vec4 x Mat4x4) product using SIMD and improvements
I am writing a complex simulation program, and it appears that the most time-consuming routine is the one multiplying a four-vector (float4) by a 4x4 matrix. I need to run this program on several more or less old computers. That is why I tried to check the SIMD capabilities for this kind of operation with the following code:
//#include <xmmintrin.h> // SSE
//#include <pmmintrin.h> // SSE3
//#include <nmmintrin.h> // SSE4.2
#include <immintrin.h> // AVX
#include <iostream>
#include <ctime>
#include <string>
using namespace std;
// 4-vector, 16-byte aligned so that the aligned SSE load/store intrinsics are legal on it.
struct alignas(16) float4
{
    float x;
    float y;
    float z;
    float w;
};
// typedef to simplify the function pointer notation.
typedef void (*Function)(float4&, const float4*, const float4&);
float dot( const float4& in_A, const float4& in_x )
{
    return in_A.x*in_x.x + in_A.y*in_x.y + in_A.z*in_x.z + in_A.w*in_x.w; // 7 FLOPS
}
void A_times_x( float4& out_y, const float4* in_A, const float4& in_x )
{
    out_y.x = dot(in_A[0], in_x); // 7 FLOPS
    out_y.y = dot(in_A[1], in_x); // 7 FLOPS
    out_y.z = dot(in_A[2], in_x); // 7 FLOPS
    out_y.w = dot(in_A[3], in_x); // 7 FLOPS
}
void A_times_x_SSE( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));
    // Transpose the matrix (rows become columns) and broadcast each component of x.
    _MM_TRANSPOSE4_PS( A0,A1,A2,A3 );
    __m128 u1 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,0,0,0));
    __m128 u2 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,1,1,1));
    __m128 u3 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,2,2,2));
    __m128 u4 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(3,3,3,3));
    // Scale each matrix column by the matching component of x
    __m128 m0 = _mm_mul_ps(A0, u1); // 4 FLOPS
    __m128 m1 = _mm_mul_ps(A1, u2); // 4 FLOPS
    __m128 m2 = _mm_mul_ps(A2, u3); // 4 FLOPS
    __m128 m3 = _mm_mul_ps(A3, u4); // 4 FLOPS
    // Vertical adds: sum the four scaled columns
    __m128 sum_01 = _mm_add_ps(m0, m1); // 4 FLOPS
    __m128 sum_23 = _mm_add_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_add_ps(sum_01, sum_23); // 4 FLOPS
    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}
void A_times_x_SSE3( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar.
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));
    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_mul_ps(A0, x); // 4 FLOPS
    __m128 m1 = _mm_mul_ps(A1, x); // 4 FLOPS
    __m128 m2 = _mm_mul_ps(A2, x); // 4 FLOPS
    __m128 m3 = _mm_mul_ps(A3, x); // 4 FLOPS
    // Using HADD, we add four floats at a time
    __m128 sum_01 = _mm_hadd_ps(m0, m1); // 4 FLOPS
    __m128 sum_23 = _mm_hadd_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_hadd_ps(sum_01, sum_23); // 4 FLOPS
    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}
void A_times_x_SSE4( float4& out_y, const float4* in_A, const float4& in_x ) // 28 FLOPS
{
    // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar.
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));
    // Dot product of each matrix row with the vector x
    __m128 m0 = _mm_dp_ps(A0, x, 0xFF); // 7 FLOPS
    __m128 m1 = _mm_dp_ps(A1, x, 0xFF); // 7 FLOPS
    __m128 m2 = _mm_dp_ps(A2, x, 0xFF); // 7 FLOPS
    __m128 m3 = _mm_dp_ps(A3, x, 0xFF); // 7 FLOPS
    // Gather the four dot products into one register (shuffles only, 0 FLOPS)
    __m128 mov_01 = _mm_movelh_ps(m0, m1);
    __m128 mov_23 = _mm_movelh_ps(m2, m3);
    __m128 result = _mm_shuffle_ps(mov_01, mov_23, _MM_SHUFFLE(2, 0, 2, 0));
    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}
void A_times_x_AVX( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Load vector x into an AVX register, duplicated into both 128-bit lanes.
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m256 xx = _mm256_castps128_ps256(x);
    xx = _mm256_insertf128_ps(xx,x,1);
    // Unaligned loads: float4 only guarantees 16-byte alignment, not the
    // 32 bytes that _mm256_load_ps would require.
    __m256 A0 = _mm256_loadu_ps((const float*)(in_A + 0));
    __m256 A2 = _mm256_loadu_ps((const float*)(in_A + 2));
    // Multiply two matrix rows at a time with the vector x
    __m256 m0 = _mm256_mul_ps(A0, xx); // 8 FLOPS
    __m256 m2 = _mm256_mul_ps(A2, xx); // 8 FLOPS
    // Using HADD, we add eight floats at a time
    __m256 sum_00 = _mm256_hadd_ps(m0, m2); // 8 FLOPS
    // Gather the dot products (no temp variable: the direct HADD avoids a copy
    // between the 128-bit and 256-bit units). The lane-wise HADDs leave the
    // results in the order {y0, y2, y1, y3}, so a final shuffle restores
    // {y0, y1, y2, y3} before the store.
    __m128 result = _mm_hadd_ps(_mm256_extractf128_ps(sum_00,0),
                                _mm256_extractf128_ps(sum_00,1)); // 4 FLOPS
    result = _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,1,2,0));
    _mm_store_ps((float*)&out_y, result);
}
void test_function ( Function f, string simd, unsigned int imax )
{
    float4 Y;
    float4 X1 = {0.5,1,0.2,0.7};
    float4 X2 = {0.7,1,0.2,0.5};
    float4 X3 = {0.5,0.2,1,0.7};
    float4 X4 = {1,0.7,0.2,0.5};
    float4 A[4] = {{0.5,1,0.2,0.7},
                   {0.6,0.4,0.1,0.8},
                   {0.3,0.8,0.2,0.5},
                   {1,0.4,0.6,0.9}};
    clock_t tstart = clock();
    for( unsigned int i=0 ; i<imax ; i++ )
        for( unsigned long int j=0 ; j<250000000 ; j++ )
        // Avoid a long long loop counter; it is 2 times slower!
        {
            // A call through the function pointer forces a real call, whereas
            // a direct call could be inlined and the results overestimated.
            f( Y,A,X1 );
            f( Y,A,X2 );
            f( Y,A,X3 );
            f( Y,A,X4 );
        }
    clock_t tend = clock();
    double diff = static_cast<double>(tend - tstart) / CLOCKS_PER_SEC;
    cout << "Time (" << simd << ") = " << diff << " s" << endl;
    cout << "Nops (" << simd << ") = " << (double) imax << ".10^9" << endl;
    cout << "Power (" << simd << ") = " << (double) imax * 28. / diff << " GFLOPS" << endl; // 28 FLOPS per call
    cout << endl;
}
int main ( int argc, char *argv[] )
{
    test_function ( &A_times_x     ,"std" , 1 );
    test_function ( &A_times_x_SSE ,"SSE" , 2 );
    test_function ( &A_times_x_SSE3,"SSE3", 3 );
    test_function ( &A_times_x_SSE4,"SSE4", 1 );
    test_function ( &A_times_x_AVX ,"AVX" , 3 );
    return 0;
}
I am having some trouble improving on this. When running the compiled code, I get the following results (Intel Core i5 4670K, 3.4 GHz, Haswell, Code::Blocks + MinGW compiler with -O2 -march=corei7-avx):
Time (std) = 6.287 s
Nops (std) = 1.10^9
Power (std) = 4.45363 GFLOPS
Time (SSE) = 6.661 s
Nops (SSE) = 2.10^9
Power (SSE) = 8.40715 GFLOPS
Time (SSE3) = 8.361 s
Nops (SSE3) = 3.10^9
Power (SSE3) = 10.0466 GFLOPS
Time (SSE4) = 6.131 s
Nops (SSE4) = 1.10^9
Power (SSE4) = 4.56695 GFLOPS
Time (AVX) = 8.767 s
Nops (AVX) = 3.10^9
Power (AVX) = 9.58138 GFLOPS
My questions are the following:
Is it possible to improve the performance/speed further? It should be x4 (maximum) for SSE and x8 for AVX.
Why is AVX not faster than SSE3?
To those who would say "stop using your stuff, use the Intel Math Kernel Library", I reply: "I would not, because I want a small executable file, and I only need SIMD for this specific case, not elsewhere" ;-)
Is it possible to improve the performance/speed further? It should be x4 (maximum) for SSE and x8 for AVX.
Yes, I explained this in detail in efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-product.
An efficient way to multiply a 4x4 matrix M with a column vector u, giving v = M u, is:
v = u1*col1 + u2*col2 + u3*col3 + u4*col4.
This requires you to store the column vectors. For example, suppose you have the following 4x4 matrix A:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
Then you store it as
0 4 8 c 1 5 9 d 2 6 a e 3 7 b f
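For concreteness, here is that column-major storage written out as a plain C array (same values as above, in decimal; the name Acols is mine, not from the original code):
float Acols[16] = { 0, 4,  8, 12,   // col1
                    1, 5,  9, 13,   // col2
                    2, 6, 10, 14,   // col3
                    3, 7, 11, 15 }; // col4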
Conversely, if you want a row vector uT times the matrix M, giving vT = uT*M, then you want
vT = uT1*row1 + uT2*row2 + uT3*row3 + uT4*row4.
In that case you should pack the rows instead of the columns.
So to optimize the code in the function A_times_x_SSE, comment out the line
_MM_TRANSPOSE4_PS( A0,A1,A2,A3 );
and this function will be faster than the other functions using horizontal operations; a sketch of the resulting column-based version follows below.
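For reference, here is a minimal sketch of that column-based version, using the same immintrin.h intrinsics and float4 type as the question's code (the function name is mine, and it assumes in_A now points to the four columns of A rather than its rows):
// Minimal sketch, assuming in_A holds the four COLUMNS of A.
void A_times_x_SSE_cols( float4& out_y, const float4* in_A, const float4& in_x )
{
    __m128 x  = _mm_load_ps((const float*)&in_x);
    __m128 c1 = _mm_load_ps((const float*)(in_A + 0)); // col1
    __m128 c2 = _mm_load_ps((const float*)(in_A + 1)); // col2
    __m128 c3 = _mm_load_ps((const float*)(in_A + 2)); // col3
    __m128 c4 = _mm_load_ps((const float*)(in_A + 3)); // col4
    // v = u1*col1 + u2*col2 + u3*col3 + u4*col4, with each u_i broadcast.
    __m128 m1 = _mm_mul_ps(c1, _mm_shuffle_ps(x,x,_MM_SHUFFLE(0,0,0,0)));
    __m128 m2 = _mm_mul_ps(c2, _mm_shuffle_ps(x,x,_MM_SHUFFLE(1,1,1,1)));
    __m128 m3 = _mm_mul_ps(c3, _mm_shuffle_ps(x,x,_MM_SHUFFLE(2,2,2,2)));
    __m128 m4 = _mm_mul_ps(c4, _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,3,3,3)));
    // Only vertical adds, no horizontal operations.
    _mm_store_ps((float*)&out_y, _mm_add_ps(_mm_add_ps(m1,m2), _mm_add_ps(m3,m4)));
}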
Horizontal operations in SIMD are not efficient. In some sense they are not SIMD at all: they are broken down into scalar micro-operations, so they are not parallel. They are only useful when it is inconvenient to pack your data in a SIMD-friendly form, for example when you cannot store the columns of M and only have the rows.
Why is AVX not faster than SSE3?
To do this efficiently with AVX you have to operate on two 4x4 matrices at a time, and pack the matrices so that they are AVX-friendly. Now assume that, in addition to the matrix A defined above, you have another matrix B:
16 17 18 19
20 21 22 23
24 25 26 27
28 29 30 31
The best way to pack A and B for AVX is
col1A col1B col2A col2B col3A col3B col4A col4B
0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31
Assume you have two vectors u = {0,1,2,3} and v = {4,5,6,7}, and that you want y = Au and z = Bv. Then with AVX you do:
c1 = col1A col1B = {0 4 8 12 16 20 24 28} = _mm256_load_ps
c2 = col2A col2B = {1 5 9 13 17 21 25 29}
c3 = col3A col3B = {2 6 10 14 18 22 26 30}
c4 = col4A col4B = {3 7 11 15 19 23 27 31}
broad1 = {0,0,0,0,4,4,4,4}
broad2 = {1,1,1,1,5,5,5,5}
broad3 = {2,2,2,2,6,6,6,6}
broad4 = {3,3,3,3,7,7,7,7}
w = broad1*c1 + broad2*c2 + broad3*c3 + broad4*c4;
//w = {y1, y2, y3, y4, z1, z2, z3, z4};
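As a hedged sketch in intrinsics (the names AB_times_uv_AVX, AB and uv are mine: AB is assumed to point to the 32 packed floats above, 32-byte aligned, and uv to hold {u1..u4, v1..v4}):
// Sketch: compute y = A*u and z = B*v in one pass over the packed data.
void AB_times_uv_AVX( float* out_yz, const float* AB, const float* uv )
{
    __m256 xv = _mm256_loadu_ps(uv);     // {u1,u2,u3,u4, v1,v2,v3,v4}
    __m256 c1 = _mm256_load_ps(AB +  0); // col1A col1B
    __m256 c2 = _mm256_load_ps(AB +  8); // col2A col2B
    __m256 c3 = _mm256_load_ps(AB + 16); // col3A col3B
    __m256 c4 = _mm256_load_ps(AB + 24); // col4A col4B
    // _mm256_shuffle_ps works per 128-bit lane, so one shuffle broadcasts
    // u_i in the low lane and v_i in the high lane simultaneously.
    __m256 b1 = _mm256_shuffle_ps(xv, xv, _MM_SHUFFLE(0,0,0,0)); // broad1
    __m256 b2 = _mm256_shuffle_ps(xv, xv, _MM_SHUFFLE(1,1,1,1)); // broad2
    __m256 b3 = _mm256_shuffle_ps(xv, xv, _MM_SHUFFLE(2,2,2,2)); // broad3
    __m256 b4 = _mm256_shuffle_ps(xv, xv, _MM_SHUFFLE(3,3,3,3)); // broad4
    // w = broad1*c1 + broad2*c2 + broad3*c3 + broad4*c4
    __m256 w = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(b1,c1), _mm256_mul_ps(b2,c2)),
                             _mm256_add_ps(_mm256_mul_ps(b3,c3), _mm256_mul_ps(b4,c4)));
    _mm256_storeu_ps(out_yz, w); // {y1,y2,y3,y4, z1,z2,z3,z4}
}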
The resulting 8-wide vector w thus contains the 4-vectors y and z. This is the most efficient way to use AVX. If you have fixed matrices and variable vectors that you run over in a loop, then packing at run time before the loop is negligible for a large loop; if you know the matrices are fixed at compile time, you can pack at compile time.
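If the matrices arrive row-major at run time, the packing step is just a gather of columns done once before the hot loop. A minimal sketch under that assumption (the helper name pack_AB is mine):
// Pack two row-major 4x4 matrices A and B (16 floats each) into the
// column-interleaved layout used above. Done once before the loop,
// so its cost is negligible for a large loop.
void pack_AB( float* AB, const float* A, const float* B )
{
    for (int c = 0; c < 4; ++c) {         // for each column...
        for (int r = 0; r < 4; ++r) {     // ...gather it from the rows
            AB[8*c + r]     = A[4*r + c]; // col c of A -> lower 4 floats
            AB[8*c + 4 + r] = B[4*r + c]; // col c of B -> upper 4 floats
        }
    }
}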