What's the equivalent of _mm_hadd_ps in NEON?
I'm trying to convert the following code from SSE to NEON, targeting 64-bit Apple iOS devices:
void Matrix::TransformPoint( const float vec[ 4 ], const Matrix& matTrans, float out[ 4 ] )
{
alignas( 16 ) float v4[ 4 ] = { vec[ 0 ], vec[ 1 ], vec[ 2 ], vec[ 3 ] };
__m128 vec4 = _mm_load_ps( v4 );
__m128 row1 = _mm_load_ps( &matTrans.m[ 0 ] );
__m128 row2 = _mm_load_ps( &matTrans.m[ 4 ] );
__m128 row3 = _mm_load_ps( &matTrans.m[ 8 ] );
__m128 row4 = _mm_load_ps( &matTrans.m[ 12 ] );
__m128 r1 = _mm_mul_ps( row1, vec4 );
__m128 r2 = _mm_mul_ps( row2, vec4 );
__m128 r3 = _mm_mul_ps( row3, vec4 );
__m128 r4 = _mm_mul_ps( row4, vec4 );
__m128 sum_01 = _mm_hadd_ps( r1, r2 );
__m128 sum_23 = _mm_hadd_ps( r3, r4 );
__m128 result = _mm_hadd_ps( sum_01, sum_23 );
_mm_store_ps( out, result );
}
Here's what I have so far:
alignas( 16 ) float v4[ 4 ] = { vec[ 0 ], vec[ 1 ], vec[ 2 ], vec[ 3 ] };
float32x4_t vec4 = vld1q_f32( v4 );
float32x4_t row1 = vld1q_f32( &mat.m[ 0 ] );
float32x4_t row2 = vld1q_f32( &mat.m[ 4 ] );
float32x4_t row3 = vld1q_f32( &mat.m[ 8 ] );
float32x4_t row4 = vld1q_f32( &mat.m[ 12 ] );
float32x4_t r1 = vmulq_f32( row1, vec4 );
float32x4_t r2 = vmulq_f32( row2, vec4 );
float32x4_t r3 = vmulq_f32( row3, vec4 );
float32x4_t r4 = vmulq_f32( row4, vec4 );
float32x4_t sum_01 = ??? <-- How to write this?
float32x4_t sum_23 = ??? <-- How to write this?
float32x4_t result = ??? <-- How to write this?
vst1q_f32( out, result );
How do I replace _mm_hadd_ps?
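For reference, a minimal sketch of how the three missing lines could be filled in on AArch64 (which covers 64-bit Apple devices), using the vpaddq_f32 pairwise add that the answers below point to; variable names follow the snippet above:

float32x4_t sum_01 = vpaddq_f32( r1, r2 );     // { r1[0]+r1[1], r1[2]+r1[3], r2[0]+r2[1], r2[2]+r2[3] }
float32x4_t sum_23 = vpaddq_f32( r3, r4 );     // { r3[0]+r3[1], r3[2]+r3[3], r4[0]+r4[1], r4[2]+r4[3] }
float32x4_t result = vpaddq_f32( sum_01, sum_23 ); // { dot(row1,vec), dot(row2,vec), dot(row3,vec), dot(row4,vec) }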
This is a typical matrix-vector multiplication. It's best done by loading the matrix transposed and then doing a series of vector-by-scalar multiply-accumulates, which avoids the costly horizontal adds entirely:
float32x4x4_t mat;
float32x4_t vec4, result;
mat = vld4q_f32(pMat);
vec4 = vld1q_f32(pVec);
result = vmulq_lane_f32(mat.val[0], vget_low_f32(vec4), 0);
result = vmlaq_lane_f32(result, mat.val[1], vget_low_f32(vec4), 1);
result = vmlaq_lane_f32(result, mat.val[2], vget_high_f32(vec4), 0);
result = vmlaq_lane_f32(result, mat.val[3], vget_high_f32(vec4), 1);
vst1q_f32(pDst, result);
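Applied to the original function, a sketch could look like the following (an assumption here: Matrix::m is a row-major array of 16 floats, as the SSE version implies; vld4q_f32 de-interleaves on load, which is what effectively transposes the matrix):

void Matrix::TransformPoint( const float vec[ 4 ], const Matrix& matTrans, float out[ 4 ] )
{
    // De-interleaving load: cols.val[i] holds { m[i], m[4+i], m[8+i], m[12+i] },
    // i.e. the i-th column of the row-major matrix.
    float32x4x4_t cols = vld4q_f32( &matTrans.m[ 0 ] );
    // vld1q_f32 has no 16-byte alignment requirement, so vec can be loaded directly.
    float32x4_t vec4 = vld1q_f32( vec );
    // out[j] = dot( row j, vec ), accumulated one column at a time.
    float32x4_t result = vmulq_lane_f32( cols.val[0], vget_low_f32( vec4 ), 0 );
    result = vmlaq_lane_f32( result, cols.val[1], vget_low_f32( vec4 ), 1 );
    result = vmlaq_lane_f32( result, cols.val[2], vget_high_f32( vec4 ), 0 );
    result = vmlaq_lane_f32( result, cols.val[3], vget_high_f32( vec4 ), 1 );
    vst1q_f32( out, result );
}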
I agree with the other posters that you want to rewrite this function to avoid the horizontal sums. For an example, see DirectXMath:
inline XMVECTOR XM_CALLCONV XMVector4Transform
(
FXMVECTOR V,
FXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]);
float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]);
float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]);
float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]);
XMVECTORF32 vResult = { { { fX, fY, fZ, fW } } };
return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float32x2_t VL = vget_low_f32( V );
XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X
vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y
float32x2_t VH = vget_high_f32( V );
vResult = vmlaq_lane_f32( vResult, M.r[2], VH, 0 ); // Z
return vmlaq_lane_f32( vResult, M.r[3], VH, 1 ); // W
#elif defined(_XM_SSE_INTRINSICS_)
// Splat x,y,z and w
XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
// Mul by the matrix
vTempX = _mm_mul_ps(vTempX,M.r[0]);
vTempY = _mm_mul_ps(vTempY,M.r[1]);
vTempZ = _mm_mul_ps(vTempZ,M.r[2]);
vTempW = _mm_mul_ps(vTempW,M.r[3]);
// Add them all together
vTempX = _mm_add_ps(vTempX,vTempY);
vTempZ = _mm_add_ps(vTempZ,vTempW);
vTempX = _mm_add_ps(vTempX,vTempZ);
return vTempX;
#endif
}
To answer the explicit question about horizontal vector adds: I use two pairwise adds. For ARMv8 / ARM64 there is vpaddq_f32, which lets you sum all 4 values in just two instructions:
inline XMVECTOR XM_CALLCONV XMVectorSum
(
FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
XMVECTORF32 Result;
Result.f[0] =
Result.f[1] =
Result.f[2] =
Result.f[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3];
return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__
XMVECTOR vTemp = vpaddq_f32(V, V);
return vpaddq_f32(vTemp,vTemp);
#else
float32x2_t v1 = vget_low_f32(V);
float32x2_t v2 = vget_high_f32(V);
v1 = vadd_f32(v1, v2);
v1 = vpadd_f32(v1, v1);
return vcombine_f32(v1, v1);
#endif
#elif defined(_XM_SSE3_INTRINSICS_)
XMVECTOR vTemp = _mm_hadd_ps(V, V);
return _mm_hadd_ps(vTemp,vTemp);
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1));
XMVECTOR vTemp2 = _mm_add_ps(V, vTemp);
vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2));
return _mm_add_ps(vTemp, vTemp2);
#endif
}
To answer the actual question: NEON has pairwise adds, which do the same job as SSE's horizontal add. Look for vpadd_f32.
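As a sketch of a direct drop-in for _mm_hadd_ps(a, b) itself (the helper name neon_hadd_f32 is just illustrative): on 32-bit NEON it takes a vpadd_f32 per input plus a vcombine_f32, while on AArch64 a single vpaddq_f32 does the whole thing:

#include <arm_neon.h>

// Equivalent of _mm_hadd_ps(a, b): { a0+a1, a2+a3, b0+b1, b2+b3 }
static inline float32x4_t neon_hadd_f32( float32x4_t a, float32x4_t b )
{
#if defined(__aarch64__)
    return vpaddq_f32( a, b );                                             // single FADDP
#else
    float32x2_t a01 = vpadd_f32( vget_low_f32( a ), vget_high_f32( a ) );  // { a0+a1, a2+a3 }
    float32x2_t b01 = vpadd_f32( vget_low_f32( b ), vget_high_f32( b ) );  // { b0+b1, b2+b3 }
    return vcombine_f32( a01, b01 );
#endif
}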