SIMD code for mesh normals calculation not working (Trying to convert C++ to SIMD)
C++ code
void Mesh::RecalculateNormals()
{
    for (int i = 0; i < indexCount; i += 3)
    {
        const int ia = indices[i];
        const int ib = indices[i + 1];
        const int ic = indices[i + 2];
        const glm::vec3 e1 = glm::vec3(vert[ia].position) - glm::vec3(vert[ib].position);
        const glm::vec3 e2 = glm::vec3(vert[ic].position) - glm::vec3(vert[ib].position);
        const glm::vec3 no = cross(e1, e2);
        vert[ia].normal += glm::vec4(no, 0.0);
        vert[ib].normal += glm::vec4(no, 0.0);
        vert[ic].normal += glm::vec4(no, 0.0);
    }
    for (int i = 0; i < vertexCount; i++)
        vert[i].normal = glm::vec4(glm::normalize(glm::vec3(vert[i].normal)), 0.0f);
}
New code with SIMD
// From : https://geometrian.com/programming/tutorials/cross-product/index.php
[[nodiscard]] inline static __m128 cross_product(__m128 const& vec0, __m128 const& vec1) {
    __m128 tmp0 = _mm_shuffle_ps(vec0, vec0, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 tmp1 = _mm_shuffle_ps(vec1, vec1, _MM_SHUFFLE(3, 1, 0, 2));
    __m128 tmp2 = _mm_mul_ps(tmp0, vec1);
    __m128 tmp3 = _mm_mul_ps(tmp0, tmp1);
    __m128 tmp4 = _mm_shuffle_ps(tmp2, tmp2, _MM_SHUFFLE(3, 0, 2, 1));
    return _mm_sub_ps(tmp3, tmp4);
}

void normalize(const glm::vec4& lpInput, glm::vec4& lpOutput) {
    const __m128& vInput = reinterpret_cast<const __m128&>(lpInput); // load input vector (x, y, z, a)
    __m128 vSquared = _mm_mul_ps(vInput, vInput); // square the input values
    __m128 vHalfSum = _mm_hadd_ps(vSquared, vSquared);
    __m128 vSum = _mm_hadd_ps(vHalfSum, vHalfSum); // compute the sum of values
    float fInvSqrt; _mm_store_ss(&fInvSqrt, _mm_rsqrt_ss(vSum)); // compute the inverse sqrt
    __m128 vNormalized = _mm_mul_ps(vInput, _mm_set1_ps(fInvSqrt)); // normalize the input vector
    lpOutput = reinterpret_cast<const glm::vec4&>(vNormalized); // store normalized vector (x, y, z, a)
}
void Mesh::RecalculateNormals()
{
    float result[4];
    glm::vec4 tmp;
    for (int i = 0; i < indexCount; i += 3)
    {
        const int ia = indices[i];
        const int ib = indices[i + 1];
        const int ic = indices[i + 2];
        __m128 iav = _mm_set_ps(vert[ia].position.x, vert[ia].position.y, vert[ia].position.z, 0.0f);
        __m128 ibv = _mm_set_ps(vert[ib].position.x, vert[ib].position.y, vert[ib].position.z, 0.0f);
        __m128 icv = _mm_set_ps(vert[ic].position.x, vert[ic].position.y, vert[ic].position.z, 0.0f);
        __m128 e1i = _mm_sub_ps(iav, ibv);
        __m128 e2i = _mm_sub_ps(icv, ibv);
        //const glm::vec3 e1 = glm::vec3(vert[ia].position) - glm::vec3(vert[ib].position);
        //const glm::vec3 e2 = glm::vec3(vert[ic].position) - glm::vec3(vert[ib].position);
        //const glm::vec3 no = cross(e1, e2);
        __m128 no = cross_product(e1i, e2i);
        //vert[ia].normal += glm::vec4(no, 0.0);
        //vert[ib].normal += glm::vec4(no, 0.0);
        //vert[ic].normal += glm::vec4(no, 0.0);
        _mm_storeu_ps(result, no);
        tmp = glm::make_vec4(result);
        vert[ia].normal += tmp;
        vert[ib].normal += tmp;
        vert[ic].normal += tmp;
    }
    for (int i = 0; i < vertexCount; i++)
        vert[i].normal = glm::vec4(glm::normalize(glm::vec3(vert[i].normal)), 0.0f);
}
But it does not work. Can anyone please help me spot the problem?
(I'm new to SIMD.)
Your correctness problem is likely caused by Intel messing up the argument order of _mm_set_ps and similar intrinsics. To build a vector with the x, y, z floats in the first 3 lanes, write _mm_set_ps( 0, z, y, x ) or _mm_setr_ps( x, y, z, 0 ).
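Applied to your loop, only the three loads need to change; the rest of the body stays the same. A minimal sketch of that fix:

__m128 iav = _mm_setr_ps( vert[ia].position.x, vert[ia].position.y, vert[ia].position.z, 0.0f );
__m128 ibv = _mm_setr_ps( vert[ib].position.x, vert[ib].position.y, vert[ib].position.z, 0.0f );
__m128 icv = _mm_setr_ps( vert[ic].position.x, vert[ic].position.y, vert[ic].position.z, 0.0f );
// Now lane 0 holds x, lane 1 holds y, lane 2 holds z, so cross_product sees the lanes it expects
__m128 e1i = _mm_sub_ps( iav, ibv );
__m128 e2i = _mm_sub_ps( icv, ibv );
__m128 no = cross_product( e1i, e2i );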
That said, here are better primitive functions for what you need. The code assumes you have at least SSE 4.1, which was introduced with Intel Core 2 in 2007 and currently has 98.89% market penetration according to the Steam survey.
// Load an FP32 3D vector from memory
inline __m128 loadFloat3( const glm::vec3& pos )
{
    static_assert( sizeof( glm::vec3 ) == 12, "Expected to be 12 bytes (3 floats)" );
    __m128 low = _mm_castpd_ps( _mm_load_sd( (const double*)&pos ) );
    // Modern compilers are optimizing the following 2 intrinsics into a single insertps with a memory operand
    __m128 high = _mm_load_ss( ( (const float*)&pos ) + 2 );
    // Insert Z into lane 2; lane 3 is already zeroed by _mm_load_sd
    return _mm_insert_ps( low, high, 0x20 );
}
// Store an FP32 3D vector to memory
inline void storeFloat3( glm::vec3& pos, __m128 vec )
{
    _mm_store_sd( (double*)&pos, _mm_castps_pd( vec ) );
    ( (int*)( &pos ) )[ 2 ] = _mm_extract_ps( vec, 2 );
}

// Zero out W lane, and store an FP32 vector to memory
inline void storeFloat4( glm::vec4& pos, __m128 vec )
{
    static_assert( sizeof( glm::vec4 ) == 16, "Expected to be 16 bytes" );
    vec = _mm_insert_ps( vec, vec, 0b1000 );
    _mm_storeu_ps( (float*)&pos, vec );
}
// Normalize a 3D vector; if the source is zero, will instead return a zero vector
inline __m128 vector3Normalize( __m128 vec )
{
    // Compute x^2 + y^2 + z^2, broadcast the result to all 4 lanes
    __m128 dp = _mm_dp_ps( vec, vec, 0b01111111 );
    // res = vec / sqrt( dp )
    __m128 res = _mm_div_ps( vec, _mm_sqrt_ps( dp ) );
    // Compare for dp > 0
    __m128 positiveLength = _mm_cmpgt_ps( dp, _mm_setzero_ps() );
    // Zero out the vector if the dot product was zero
    return _mm_and_ps( res, positiveLength );
}
// Copy-pasted from there: https://github.com/microsoft/DirectXMath/blob/jan2021/Inc/DirectXMathVector.inl#L9506-L9519
// MIT license
#ifdef __AVX__
#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
#else
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
#endif
#ifdef __AVX2__
#define XM_FNMADD_PS( a, b, c ) _mm_fnmadd_ps((a), (b), (c))
#else
#define XM_FNMADD_PS( a, b, c ) _mm_sub_ps((c), _mm_mul_ps((a), (b)))
#endif
inline __m128 vector3Cross( __m128 V1, __m128 V2 )
{
    // y1,z1,x1,w1
    __m128 vTemp1 = XM_PERMUTE_PS( V1, _MM_SHUFFLE( 3, 0, 2, 1 ) );
    // z2,x2,y2,w2
    __m128 vTemp2 = XM_PERMUTE_PS( V2, _MM_SHUFFLE( 3, 1, 0, 2 ) );
    // Perform the left operation
    __m128 vResult = _mm_mul_ps( vTemp1, vTemp2 );
    // z1, x1, y1, w1
    vTemp1 = XM_PERMUTE_PS( vTemp1, _MM_SHUFFLE( 3, 0, 2, 1 ) );
    // y2, z2, x2, w2
    vTemp2 = XM_PERMUTE_PS( vTemp2, _MM_SHUFFLE( 3, 1, 0, 2 ) );
    // Perform the right operation
    vResult = XM_FNMADD_PS( vTemp1, vTemp2, vResult );
    return vResult;
}
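Wired together, the accumulation loop could look roughly like this. This is only a sketch: it assumes vert[i].position is a glm::vec3 (if it is a glm::vec4, an _mm_loadu_ps on its address works instead of loadFloat3) and that vert[i].normal is a glm::vec4, as implied by your code.

void Mesh::RecalculateNormals()
{
    for( int i = 0; i < indexCount; i += 3 )
    {
        const int ia = indices[ i ];
        const int ib = indices[ i + 1 ];
        const int ic = indices[ i + 2 ];

        const __m128 a = loadFloat3( vert[ ia ].position );
        const __m128 b = loadFloat3( vert[ ib ].position );
        const __m128 c = loadFloat3( vert[ ic ].position );

        // Area-weighted triangle normal, same math as your scalar version: cross( a - b, c - b )
        const __m128 no = vector3Cross( _mm_sub_ps( a, b ), _mm_sub_ps( c, b ) );

        // Accumulate the triangle normal into the 3 vertices; storeFloat4 zeroes the W lane
        storeFloat4( vert[ ia ].normal, _mm_add_ps( _mm_loadu_ps( (const float*)&vert[ ia ].normal ), no ) );
        storeFloat4( vert[ ib ].normal, _mm_add_ps( _mm_loadu_ps( (const float*)&vert[ ib ].normal ), no ) );
        storeFloat4( vert[ ic ].normal, _mm_add_ps( _mm_loadu_ps( (const float*)&vert[ ic ].normal ), no ) );
    }

    for( int i = 0; i < vertexCount; i++ )
    {
        // Normalize the accumulated vector; vector3Normalize returns zero for zero-length input
        const __m128 acc = _mm_loadu_ps( (const float*)&vert[ i ].normal );
        storeFloat4( vert[ i ].normal, vector3Normalize( acc ) );
    }
}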
You should not expect high-quality results from that algorithm, though. It computes per-vertex normals by accumulating triangle normals weighted by triangle area. This means that if your mesh has both small and large triangles, the normals of the small triangles are effectively ignored, which is suboptimal. A better approach is to weight each normal by the angle of the triangle at that vertex. See this article for more info.
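For illustration, angle weighting only changes the accumulation step. A scalar sketch with plain glm (no SIMD); cornerAngle is a hypothetical helper, and a, b, c, n below stand for the three corner positions and the unit face normal of the current triangle:

#include <cmath>

// Angle of the triangle corner at `corner`, formed by the edges towards p1 and p2
static float cornerAngle( const glm::vec3& corner, const glm::vec3& p1, const glm::vec3& p2 )
{
    const glm::vec3 e1 = glm::normalize( p1 - corner );
    const glm::vec3 e2 = glm::normalize( p2 - corner );
    // Clamp the dot product so rounding errors cannot push acos outside its domain
    return std::acos( glm::clamp( glm::dot( e1, e2 ), -1.0f, 1.0f ) );
}

// Inside the triangle loop, instead of adding the raw cross product:
// vert[ia].normal += glm::vec4( n * cornerAngle( a, b, c ), 0.0f );
// vert[ib].normal += glm::vec4( n * cornerAngle( b, c, a ), 0.0f );
// vert[ic].normal += glm::vec4( n * cornerAngle( c, a, b ), 0.0f );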