当 index = 0 时,XMVectorSetByIndex 和 XMVectorSetX 有什么区别?
What's the difference between XMVectorSetByIndex when index = 0 and XMVectorSetX?
看了 DirectXMath 库的源码,发现 XMVectorSetByIndex 和 XMVectorSetX 的实现完全不同。为什么 XMVectorSetX 不直接返回 XMVectorSetByIndex(索引 = 0)的结果?
XMVectorSetX 实际上能够使用 SSE 或 ARM-NEON 内部函数(intrinsics);而 XMVectorSetByIndex 的索引要到运行时才确定,因此必须“溢出到内存”(spill to memory),也就是说它根本不是 SIMD 操作。
// Replaces one floating point component of a vector, chosen by a runtime index.
//   V - source vector; the components not selected by i are carried over as-is
//   f - value written into the selected component
//   i - component index; asserted (and hinted to static analysis) to be < 4
// Returns a modified copy of V. Every configuration below copies V and then
// writes the element through an array member of the vector type; none of
// them uses a lane-insert intrinsic.
inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
{
    assert(i < 4);
    _Analysis_assume_(i < 4);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR result = V;
    result.vector4_f32[i] = f;
    return result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR result = V;
    result.n128_f32[i] = f;
    return result;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR result = V;
    result.m128_f32[i] = f;
    return result;
#endif
}
对比
// Returns a copy of V whose X component (lane 0) is replaced by x.
//   V - source vector; Y, Z and W are carried over unchanged
//   x - new value for the X component
inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x)
{
#if defined(_XM_NO_INTRINSICS_)
    // Start from a full copy of V, then overwrite only lane 0.
    XMVECTOR result = V;
    result.vector4_f32[0] = x;
    return result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(x, V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Put x in the low lane of a scratch vector, then merge that low lane into V.
    return _mm_move_ss(V, _mm_set_ss(x));
#endif
}
再看一下 XMVectorSetY 的情况也很有帮助:使用 /arch:AVX 或 /arch:AVX2 编译时,它可以使用 SSE4 指令 _mm_insert_ps;否则它必须做相当多的工作,才能在不“溢出到内存”(spill to memory)的前提下生成 SIMD 代码。
// Returns a copy of V whose Y component (lane 1) is replaced by y.
//   V - source vector; X, Z and W are carried over unchanged
//   y - new value for the Y component
inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
{
#if defined(_XM_NO_INTRINSICS_)
    // Start from a full copy of V, then overwrite only lane 1.
    XMVECTOR result = V;
    result.vector4_f32[1] = y;
    return result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(y, V, 1);
#elif defined(_XM_SSE4_INTRINSICS_)
    // imm8 0x10: take lane 0 of the scalar operand and insert it into lane 1
    // of V, with no lanes zeroed.
    const XMVECTOR scalar = _mm_set_ss(y);
    return _mm_insert_ps(V, scalar, 0x10);
#elif defined(_XM_SSE_INTRINSICS_)
    // Without the SSE4 path only the low lane can be replaced directly, so:
    // 1) swap X and Y to bring the old Y into the low lane,
    const XMVECTOR swapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
    // 2) overwrite the low lane with y,
    const XMVECTOR scalar = _mm_set_ss(y);
    const XMVECTOR replaced = _mm_move_ss(swapped, scalar);
    // 3) swap X and Y back into their original positions.
    return XM_PERMUTE_PS(replaced, _MM_SHUFFLE(3, 2, 0, 1));
#endif
}
Note that DirectXMath is now available on GitHub.
看了 DirectXMath 库的源码,发现 XMVectorSetByIndex 和 XMVectorSetX 的实现完全不同。为什么 XMVectorSetX 不直接返回 XMVectorSetByIndex(索引 = 0)的结果?
XMVectorSetX 实际上能够使用 SSE 或 ARM-NEON 内部函数(intrinsics);而 XMVectorSetByIndex 的索引要到运行时才确定,因此必须“溢出到内存”(spill to memory),也就是说它根本不是 SIMD 操作。
// Replaces one floating point component of a vector, chosen by a runtime index.
//   V - source vector; the components not selected by i are carried over as-is
//   f - value written into the selected component
//   i - component index; asserted (and hinted to static analysis) to be < 4
// Returns a modified copy of V. Every configuration below copies V and then
// writes the element through an array member of the vector type; none of
// them uses a lane-insert intrinsic.
inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
{
    assert(i < 4);
    _Analysis_assume_(i < 4);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR result = V;
    result.vector4_f32[i] = f;
    return result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR result = V;
    result.n128_f32[i] = f;
    return result;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR result = V;
    result.m128_f32[i] = f;
    return result;
#endif
}
对比
// Returns a copy of V whose X component (lane 0) is replaced by x.
//   V - source vector; Y, Z and W are carried over unchanged
//   x - new value for the X component
inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x)
{
#if defined(_XM_NO_INTRINSICS_)
    // Start from a full copy of V, then overwrite only lane 0.
    XMVECTOR result = V;
    result.vector4_f32[0] = x;
    return result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(x, V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Put x in the low lane of a scratch vector, then merge that low lane into V.
    return _mm_move_ss(V, _mm_set_ss(x));
#endif
}
再看一下 XMVectorSetY 的情况也很有帮助:使用 /arch:AVX 或 /arch:AVX2 编译时,它可以使用 SSE4 指令 _mm_insert_ps;否则它必须做相当多的工作,才能在不“溢出到内存”(spill to memory)的前提下生成 SIMD 代码。
// Returns a copy of V whose Y component (lane 1) is replaced by y.
//   V - source vector; X, Z and W are carried over unchanged
//   y - new value for the Y component
inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
{
#if defined(_XM_NO_INTRINSICS_)
    // Start from a full copy of V, then overwrite only lane 1.
    XMVECTOR result = V;
    result.vector4_f32[1] = y;
    return result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(y, V, 1);
#elif defined(_XM_SSE4_INTRINSICS_)
    // imm8 0x10: take lane 0 of the scalar operand and insert it into lane 1
    // of V, with no lanes zeroed.
    const XMVECTOR scalar = _mm_set_ss(y);
    return _mm_insert_ps(V, scalar, 0x10);
#elif defined(_XM_SSE_INTRINSICS_)
    // Without the SSE4 path only the low lane can be replaced directly, so:
    // 1) swap X and Y to bring the old Y into the low lane,
    const XMVECTOR swapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
    // 2) overwrite the low lane with y,
    const XMVECTOR scalar = _mm_set_ss(y);
    const XMVECTOR replaced = _mm_move_ss(swapped, scalar);
    // 3) swap X and Y back into their original positions.
    return XM_PERMUTE_PS(replaced, _MM_SHUFFLE(3, 2, 0, 1));
#endif
}
Note that DirectXMath is now available on GitHub.