Accelerating matrix vector multiplication with ARM Neon Intrinsics on Raspberry Pi 4
I need to optimize a matrix-vector multiplication. The data looks like this:
- the vector has 81 columns
- the matrix has 90,000 rows and 81 columns and is already transposed, so a row-wise dot product can be used
- the output is therefore a vector with 90,000 rows
- everything is stored in flat 1D float arrays
The routine also has to satisfy some non-functional requirements:
- it should use as little of the standard library as possible (e.g. no std::vector)
- it must not use third-party libraries (so no Eigen or Blas either)
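For reference, this is what the plain scalar version ("normal code") that the NEON variant is measured against looks like. It is a minimal sketch using the same names as the code below, not the author's exact baseline:

```cpp
#include <cstdint>

// Plain scalar reference: each output row is the dot product of one
// (transposed) matrix row with the 81-element vector.
void matVecMulScalar(float *output, const float *input, const float *kernel,
                     uint32_t input_height /* 90000 */, uint32_t input_width /* 81 */) {
    for (uint32_t y = 0; y < input_height; ++y) {
        float sum = 0.0f;
        for (uint32_t x = 0; x < input_width; ++x)
            sum += input[y * input_width + x] * kernel[x];
        output[y] = sum;
    }
}
```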
Here is my code (simplified for readability; I assume the input is perfectly padded):
// input_height = 90000
// input_width = 81
for (uint32_t y = 0; y < input_height; y += 4) {
    float32x4_t sum0 = vmovq_n_f32(0);
    float32x4_t sum1 = vmovq_n_f32(0);
    float32x4_t sum2 = vmovq_n_f32(0);
    float32x4_t sum3 = vmovq_n_f32(0);
    for (uint32_t x = 0; x < input_width; x += 16) {
        float32x4x4_t A = load_matrix_transpose(kernel + x);
        float32x4x4_t B0 = load_matrix_transpose(input + y * input_width + x);
        float32x4x4_t B1 = load_matrix_transpose(input + (y + 1) * input_width + x);
        float32x4x4_t B2 = load_matrix_transpose(input + (y + 2) * input_width + x);
        float32x4x4_t B3 = load_matrix_transpose(input + (y + 3) * input_width + x);
        matrix_element_wise_multiplication(A, B0, sum0);
        matrix_element_wise_multiplication(A, B1, sum1);
        matrix_element_wise_multiplication(A, B2, sum2);
        matrix_element_wise_multiplication(A, B3, sum3);
    }
    output[y] = vaddvq_f32(sum0);
    output[y + 1] = vaddvq_f32(sum1);
    output[y + 2] = vaddvq_f32(sum2);
    output[y + 3] = vaddvq_f32(sum3);
}
where load_matrix_transpose and matrix_element_wise_multiplication are the following functions:
inline float32x4x4_t load_matrix_transpose(float *a) {
    float32x4x4_t ret;
    ret.val[0] = simd_load(a);      // simd_load is presumably a thin wrapper around vld1q_f32
    ret.val[1] = simd_load(a + 4);
    ret.val[2] = simd_load(a + 8);
    ret.val[3] = simd_load(a + 12);
    return ret;
}

// Accumulate the element-wise products of A and B into the running sum C.
inline void matrix_element_wise_multiplication(float32x4x4_t &A, float32x4x4_t &B, float32x4_t &C) {
    C = vmlaq_f32(C, A.val[0], B.val[0]);
    C = vmlaq_f32(C, A.val[1], B.val[1]);
    C = vmlaq_f32(C, A.val[2], B.val[2]);
    C = vmlaq_f32(C, A.val[3], B.val[3]);
}
On my Raspberry Pi 4 (ARMv8, 8 GB RAM, 4 cores), the code takes roughly 60 ms at optimization level -O3.
Over long runs (many iterations of the loop), the NEON register version is exactly twice as fast as the plain code.
My question is: is there a way to optimize the code further? I have tried many things, but could not improve on the code above.
As far as optimization goes, data locality is the top priority, and you should be aware of the register capacity, since registers are by far the fastest and scarcest resource.

- aarch64: 32 x 128-bit NEON registers (512 bytes)
- aarch32: 16 x 128-bit NEON registers (256 bytes)

An 81x90000 matrix, when transposed, would require keeping 90,000 intermediate values alive for the multiplication, and since those 360,000 bytes don't come close to fitting into the 512-byte register bank, there will be massive memory swapping, which translates into a huge performance hit.

The 81*4 = 324 bytes of the vector, on the other hand, fit comfortably into the 512 bytes: they occupy 20 full 128-bit registers plus a single lane of a 21st, leaving enough registers for the matrix data and the accumulator.
void matVecMult81x90000(float *pDst, float *pMat, float *pVec)
{
    register float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
    register float32x4_t mat0, mat1, mat2, mat3, mat4, rslt;
    register uint32_t nRows = 90000;

    vec80 = vdupq_n_f32(0.0f);
    mat4 = vdupq_n_f32(0.0f);

    // load the whole 81-element vector into 20 full registers plus one lane
    vec0_3 = vld1q_f32(pVec); pVec += 4;
    vec4_7 = vld1q_f32(pVec); pVec += 4;
    vec8_11 = vld1q_f32(pVec); pVec += 4;
    vec12_15 = vld1q_f32(pVec); pVec += 4;
    vec16_19 = vld1q_f32(pVec); pVec += 4;
    vec20_23 = vld1q_f32(pVec); pVec += 4;
    vec24_27 = vld1q_f32(pVec); pVec += 4;
    vec28_31 = vld1q_f32(pVec); pVec += 4;
    vec32_35 = vld1q_f32(pVec); pVec += 4;
    vec36_39 = vld1q_f32(pVec); pVec += 4;
    vec40_43 = vld1q_f32(pVec); pVec += 4;
    vec44_47 = vld1q_f32(pVec); pVec += 4;
    vec48_51 = vld1q_f32(pVec); pVec += 4;
    vec52_55 = vld1q_f32(pVec); pVec += 4;
    vec56_59 = vld1q_f32(pVec); pVec += 4;
    vec60_63 = vld1q_f32(pVec); pVec += 4;
    vec64_67 = vld1q_f32(pVec); pVec += 4;
    vec68_71 = vld1q_f32(pVec); pVec += 4;
    vec72_75 = vld1q_f32(pVec); pVec += 4;
    vec76_79 = vld1q_f32(pVec); pVec += 4;
    vec80 = vld1q_lane_f32(pVec, vec80, 0);

    do {
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        // note: operator += on NEON vector types relies on GCC/Clang vector extensions
        rslt = vmulq_f32(mat0, vec0_3);
        rslt += vmulq_f32(mat1, vec4_7);
        rslt += vmulq_f32(mat2, vec8_11);
        rslt += vmulq_f32(mat3, vec12_15);
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec16_19);
        rslt += vmulq_f32(mat1, vec20_23);
        rslt += vmulq_f32(mat2, vec24_27);
        rslt += vmulq_f32(mat3, vec28_31);
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec32_35);
        rslt += vmulq_f32(mat1, vec36_39);
        rslt += vmulq_f32(mat2, vec40_43);
        rslt += vmulq_f32(mat3, vec44_47);
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec48_51);
        rslt += vmulq_f32(mat1, vec52_55);
        rslt += vmulq_f32(mat2, vec56_59);
        rslt += vmulq_f32(mat3, vec60_63);
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        mat4 = vld1q_lane_f32(pMat, mat4, 0); pMat += 1;
        rslt += vmulq_f32(mat0, vec64_67);
        rslt += vmulq_f32(mat1, vec68_71);
        rslt += vmulq_f32(mat2, vec72_75);
        rslt += vmulq_f32(mat3, vec76_79);
        rslt += vmulq_f32(mat4, vec80);
        *pDst++ = vaddvq_f32(rslt);
    } while (--nRows);
}
Unfortunately the compilers (both GCC and Clang) don't cooperate very well here:
the generated code shows some stack spilling of the vector inside the loop.
Below is the same function in hand-written assembly, without any stack spilling:
        .arch   armv8-a
        .global matVecMult81x90000_asm
        .text
        .balign 64

        .func
matVecMult81x90000_asm:
        // init loop counter
        mov     w3, #90000 & 0xffff
        movk    w3, #90000>>16, lsl #16

        // preserve registers
        stp     d8, d9, [sp, #-48]!
        stp     d10, d11, [sp, #1*16]
        stp     d12, d13, [sp, #2*16]

        // load vectors
        ldp     q0, q1, [x2, #0*32]
        ldp     q2, q3, [x2, #1*32]
        ldp     q4, q5, [x2, #2*32]
        ldp     q6, q7, [x2, #3*32]
        ldp     q8, q9, [x2, #4*32]
        ldp     q10, q11, [x2, #5*32]
        ldp     q12, q13, [x2, #6*32]
        ldp     q16, q17, [x2, #7*32]
        ldp     q18, q19, [x2, #8*32]
        ldp     q20, q21, [x2, #9*32]
        ldr     s22, [x2, #10*32]

        // loop
        .balign 64
1:
        ldp     q24, q25, [x1, #0*32]
        ldp     q26, q27, [x1, #1*32]
        ldp     q28, q29, [x1, #2*32]
        ldp     q30, q31, [x1, #3*32]
        subs    w3, w3, #1
        fmul    v23.4s, v24.4s, v0.4s
        fmla    v23.4s, v25.4s, v1.4s
        fmla    v23.4s, v26.4s, v2.4s
        fmla    v23.4s, v27.4s, v3.4s
        fmla    v23.4s, v28.4s, v4.4s
        fmla    v23.4s, v29.4s, v5.4s
        fmla    v23.4s, v30.4s, v6.4s
        fmla    v23.4s, v31.4s, v7.4s
        ldp     q24, q25, [x1, #4*32]
        ldp     q26, q27, [x1, #5*32]
        ldp     q28, q29, [x1, #6*32]
        ldp     q30, q31, [x1, #7*32]
        fmla    v23.4s, v24.4s, v8.4s
        fmla    v23.4s, v25.4s, v9.4s
        fmla    v23.4s, v26.4s, v10.4s
        fmla    v23.4s, v27.4s, v11.4s
        fmla    v23.4s, v28.4s, v12.4s
        fmla    v23.4s, v29.4s, v13.4s
        fmla    v23.4s, v30.4s, v16.4s
        fmla    v23.4s, v31.4s, v17.4s
        ldp     q24, q25, [x1, #8*32]
        ldp     q26, q27, [x1, #9*32]
        ldr     s28, [x1, #10*32]
        fmla    v23.4s, v24.4s, v18.4s
        fmla    v23.4s, v25.4s, v19.4s
        fmla    v23.4s, v26.4s, v20.4s
        fmla    v23.4s, v27.4s, v21.4s
        fmla    v23.4s, v28.4s, v22.4s
        add     x1, x1, #81*4
        faddp   v23.4s, v23.4s, v23.4s
        faddp   v23.2s, v23.2s, v23.2s
        str     s23, [x0], #4
        b.ne    1b

        .balign 8
        // restore registers
        ldp     d10, d11, [sp, #1*16]
        ldp     d12, d13, [sp, #2*16]
        ldp     d8, d9, [sp], #48

        // return
        ret
        .endfunc
        .end
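To call the hand-written routine from C or C++, a declaration along these lines would be needed (a sketch; the signature mirrors the intrinsics version above and the x0/x1/x2 register usage in the assembly):

```cpp
// Hypothetical prototype for the assembly routine above.
// Per the AArch64 calling convention: x0 = pDst, x1 = pMat, x2 = pVec.
extern "C" void matVecMult81x90000_asm(float *pDst, const float *pMat, const float *pVec);
```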
Test results on an RK3368:
Clang intrinsics: 10.41 ms
Assembly: 9.59 ms
In this case the compilers didn't do that badly, but they are often unbelievably stupid. I strongly recommend learning assembly.
Here's an optimization of Jake's answer.

Using 4 accumulators instead of 1 helps because FMA instructions have latency much higher than their throughput. According to the Cortex-A72 optimization guide, the latency of the FMLA instruction is 7 cycles for the complete thing, or 3 cycles when the dependency is on the accumulator (in case you wonder what Q-form and D-form are: Q is for 16-byte vectors, D is for 8-byte vectors). The throughput is much higher, 1 cycle: the CPU can run one FMA every cycle. With a single accumulator each FMLA has to wait 3 cycles for the previous one, limiting the loop to one FMA every 3 cycles; with 4 independent accumulators the chains interleave and the loop can approach the 1 FMA/cycle throughput.

The following version uses 4 independent accumulators instead of a single one, which should improve throughput despite the 3 extra instructions needed at the end of each row to sum the accumulators.

I also used some macros to help with the repetitive code. Untested.
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
    // 30 vector registers in total; ARM64 has 32 of them, so we're good.
    float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
    float32x4_t mat0, mat1, mat2, mat3, mat4;
    float32x4_t res0, res1, res2, res3;

    vec80 = mat4 = vdupq_n_f32( 0.0f );

    // Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 )      \
    v0 = vld1q_f32( pVec ); pVec += 4;     \
    v1 = vld1q_f32( pVec ); pVec += 4;     \
    v2 = vld1q_f32( pVec ); pVec += 4;     \
    v3 = vld1q_f32( pVec ); pVec += 4

    // Load the complete vector into registers using the above macro
    LOAD_VEC_16( vec0_3, vec4_7, vec8_11, vec12_15 );
    LOAD_VEC_16( vec16_19, vec20_23, vec24_27, vec28_31 );
    LOAD_VEC_16( vec32_35, vec36_39, vec40_43, vec44_47 );
    LOAD_VEC_16( vec48_51, vec52_55, vec56_59, vec60_63 );
    LOAD_VEC_16( vec64_67, vec68_71, vec72_75, vec76_79 );
    // Load the final scalar of the vector
    vec80 = vld1q_lane_f32( pVec, vec80, 0 );
#undef LOAD_VEC_16

    // Load 16 numbers from pMat into mat0 - mat3, incrementing the source pointer
#define LOAD_MATRIX_16()                   \
    mat0 = vld1q_f32( pMat ); pMat += 4;   \
    mat1 = vld1q_f32( pMat ); pMat += 4;   \
    mat2 = vld1q_f32( pMat ); pMat += 4;   \
    mat3 = vld1q_f32( pMat ); pMat += 4

    // Multiply 16 numbers in mat0 - mat3 by the specified pieces of the vector, and accumulate into res0 - res3.
    // Multiple accumulators are critical for performance: the 4 instructions produced by this macro have no data dependencies between them.
#define HANDLE_BLOCK_16( v0, v1, v2, v3 )  \
    res0 = vfmaq_f32( res0, mat0, v0 );    \
    res1 = vfmaq_f32( res1, mat1, v1 );    \
    res2 = vfmaq_f32( res2, mat2, v2 );    \
    res3 = vfmaq_f32( res3, mat3, v3 )

    const float* const pMatEnd = pMat + nRows * 81;
    while( pMat < pMatEnd )
    {
        // Initial 16 elements only need multiplication.
        LOAD_MATRIX_16();
        res0 = vmulq_f32( mat0, vec0_3 );
        res1 = vmulq_f32( mat1, vec4_7 );
        res2 = vmulq_f32( mat2, vec8_11 );
        res3 = vmulq_f32( mat3, vec12_15 );

        // Handle the rest of the row using FMA instructions.
        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec16_19, vec20_23, vec24_27, vec28_31 );

        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec32_35, vec36_39, vec40_43, vec44_47 );

        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec48_51, vec52_55, vec56_59, vec60_63 );

        // The final block of the row has 17 scalars instead of 16
        LOAD_MATRIX_16();
        mat4 = vld1q_lane_f32( pMat, mat4, 0 ); pMat++;
        HANDLE_BLOCK_16( vec64_67, vec68_71, vec72_75, vec76_79 );
        res0 = vfmaq_f32( res0, mat4, vec80 );

        // Vertically add 4 accumulators into res0
        res1 = vaddq_f32( res1, res2 );
        res0 = vaddq_f32( res3, res0 );
        res0 = vaddq_f32( res1, res0 );
        // Store the horizontal sum of the accumulator
        *pDst = vaddvq_f32( res0 );
        pDst++;
    }
#undef LOAD_MATRIX_16
#undef HANDLE_BLOCK_16
}
The assembly GCC 10.1 generates from this source looks more or less OK.
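For completeness, here is a minimal, hypothetical driver showing how matVecMult81 might be called. Buffer names and the dummy fill are illustrative only, and plain malloc is used to stay within the question's no-std::vector constraint:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// matVecMult81 as defined above is assumed to be visible here.

int main() {
    const size_t rows = 90000, cols = 81;
    float *mat = static_cast<float *>(malloc(rows * cols * sizeof(float)));
    float *vec = static_cast<float *>(malloc(cols * sizeof(float)));
    float *dst = static_cast<float *>(malloc(rows * sizeof(float)));

    // Fill with dummy data so the result is easy to check.
    for (size_t i = 0; i < rows * cols; i++) mat[i] = 1.0f;
    for (size_t i = 0; i < cols; i++) vec[i] = 1.0f;

    matVecMult81(dst, mat, vec, rows);
    printf("dst[0] = %f\n", dst[0]);  // expect 81.0 with the all-ones data

    free(mat); free(vec); free(dst);
    return 0;
}
```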