对自动矢量化原因“1200”进行故障排除
Troubleshooting auto vectorize reason '1200'
MSVC 2013 Ultimate(Update 4)
不明白为什么我会在这个看似简单的示例中收到此错误
信息 C5002:由于“1200”原因,循环未矢量化
该原因代码的含义是:
1200:循环包含循环携带的数据依赖(loop-carried data dependence)
我看不出循环的迭代如何相互干扰。
// Physics state kept as struct-of-arrays: for each quantity (scratch,
// position, velocity) there is one contiguous double array per axis, each
// holding MaxEntities elements.
__declspec( align( 16 ) ) class PhysicsSystem
{
public:
    // Number of elements in every per-axis array.
    static const int32_t MaxEntities = 65535;

    // Three parallel per-axis arrays, zero-filled on construction.
    // NOTE(review): MaxEntities is odd, so sizeof( mX ) (65535 * 8 bytes) is
    // not a multiple of 16; mY therefore begins 8 bytes past a 16-byte
    // boundary even though the struct itself is declared 16-byte aligned.
    __declspec( align( 16 ) ) struct VectorizedXYZ
    {
        double mX[ MaxEntities ];
        double mY[ MaxEntities ];
        double mZ[ MaxEntities ];

        // Clears all three arrays to zero bytes.
        VectorizedXYZ()
        {
            memset( mX, 0, sizeof( mX ) );
            memset( mY, 0, sizeof( mY ) );
            memset( mZ, 0, sizeof( mZ ) );
        }
    };

    // Computes mPos + mVel * dt for every element of every axis and stores
    // the result in mTmp; mPos and mVel are only read.
    //
    // dt: factor the velocity is multiplied by before it is added.
    void Update( double dt )
    {
        // The marker below is the question author's annotation; it is kept as
        // a comment so that the snippet stays valid C++.
        for ( int32_t i = 0; i < MaxEntities; ++i ) // <== 1200
        {
            mTmp.mX[ i ] = mPos.mX[ i ] + mVel.mX[ i ] * dt;
            mTmp.mY[ i ] = mPos.mY[ i ] + mVel.mY[ i ] * dt;
            mTmp.mZ[ i ] = mPos.mZ[ i ] + mVel.mZ[ i ] * dt;
        }
    }

private:
    VectorizedXYZ mTmp; // destination of Update()
    VectorizedXYZ mPos; // read by Update()
    VectorizedXYZ mVel; // read by Update()
};
编辑:根据 http://blogs.msdn.com/b/nativeconcurrency/archive/2012/05/08/auto-vectorizer-in-visual-studio-11-rules-for-loop-body.aspx 判断,这似乎正是 "Example 1 – Embarrassingly Parallel" 那一类情形,但编译器的表现却像是认为这些数组之间可能存在别名(aliasing),这让我感到困惑。
Edit2:如果有人能解释自动矢量化为什么会在这样一个看似简单的例子中失败,那就太好了;不过在折腾了一段时间之后,我决定自己动手,改用 SSE2 内在函数(intrinsics)手写矢量化:
// Advances every position in place: mPos += mVel * dt for each axis,
// processing two doubles per SSE2 operation.
//
// dt: factor each velocity is multiplied by before it is added to the
//     corresponding position.
//
// Unaligned loads/stores are used deliberately. With the VectorizedXYZ layout
// shown in the class definition (three consecutive double[65535] arrays) the
// mY array starts 8 bytes past a 16-byte boundary, so the aligned
// _mm_load_pd/_mm_store_pd forms would fault on it.
void PhysicsSystem::Update( Real dt )
{
    // Broadcast dt into both lanes.
    const __m128d mdt = _mm_set1_pd( dt );
    const size_t count = static_cast< size_t >( MaxEntities );

    size_t i = 0;

    // advance by 2 since we can do 2 at a time at double precision in __m128d;
    // stop before a trailing odd element so a pair never runs past the arrays
    for ( ; i + 1 < count; i += 2 )
    {
        const __m128d posX = _mm_loadu_pd( &mPos.mX[ i ] );
        const __m128d posY = _mm_loadu_pd( &mPos.mY[ i ] );
        const __m128d posZ = _mm_loadu_pd( &mPos.mZ[ i ] );

        const __m128d velFrameX = _mm_mul_pd( _mm_loadu_pd( &mVel.mX[ i ] ), mdt );
        const __m128d velFrameY = _mm_mul_pd( _mm_loadu_pd( &mVel.mY[ i ] ), mdt );
        const __m128d velFrameZ = _mm_mul_pd( _mm_loadu_pd( &mVel.mZ[ i ] ), mdt );

        // Each axis adds its OWN position (the Y and Z stores previously
        // reused posX).
        _mm_storeu_pd( &mPos.mX[ i ], _mm_add_pd( posX, velFrameX ) );
        _mm_storeu_pd( &mPos.mY[ i ], _mm_add_pd( posY, velFrameY ) );
        _mm_storeu_pd( &mPos.mZ[ i ], _mm_add_pd( posZ, velFrameZ ) );
    }

    // Scalar tail: handles the last element when the count is odd.
    for ( ; i < count; ++i )
    {
        mPos.mX[ i ] += mVel.mX[ i ] * dt;
        mPos.mY[ i ] += mVel.mY[ i ] * dt;
        mPos.mZ[ i ] += mVel.mZ[ i ] * dt;
    }
}
不确定您的编译器是否支持它,但是为了强制执行一些适当的矢量化,您可以轻而易举地这样做:
// Computes mTmp = mPos + mVel * dt for every element of every axis, asking
// the compiler to vectorize the loop through an OpenMP SIMD directive.
//
// dt: factor each velocity is multiplied by before it is added.
//
// The directive only takes effect when the compiler supports the OpenMP
// `simd` construct (introduced in OpenMP 4.0) and OpenMP is enabled.
void PhysicsSystem::Update( double dt ) {
    // Plain local pointers: the loop body no longer goes through `this`.
    double *tx = mTmp.mX, *ty = mTmp.mY, *tz = mTmp.mZ;
    const double *px = mPos.mX, *py = mPos.mY, *pz = mPos.mZ;
    const double *vx = mVel.mX, *vy = mVel.mY, *vz = mVel.mZ;

    // No aligned( ... ) clause on purpose: with three consecutive
    // double[65535] arrays per struct, the mY arrays start 8 bytes past a
    // 16-byte boundary, so promising SIMD alignment for ty/py/vy would be
    // false and could produce faulting aligned accesses.
    #pragma omp simd
    for ( int i = 0; i < MaxEntities; ++i ) {
        tx[ i ] = px[ i ] + vx[ i ] * dt;
        ty[ i ] = py[ i ] + vy[ i ] * dt;
        tz[ i ] = pz[ i ] + vz[ i ] * dt;
    }
}
然后您需要在编译器中启用 OpenMP 支持,该指令才会生效。
MSVC 2013 Ultimate(Update 4)
不明白为什么我会在这个看似简单的示例中收到此错误
信息 C5002:由于“1200”原因,循环未矢量化
该原因代码的含义是:
1200:循环包含循环携带的数据依赖(loop-carried data dependence)
我看不出循环的迭代如何相互干扰。
// Physics state kept as struct-of-arrays: for each quantity (scratch,
// position, velocity) there is one contiguous double array per axis, each
// holding MaxEntities elements.
__declspec( align( 16 ) ) class PhysicsSystem
{
public:
    // Number of elements in every per-axis array.
    static const int32_t MaxEntities = 65535;

    // Three parallel per-axis arrays, zero-filled on construction.
    // NOTE(review): MaxEntities is odd, so sizeof( mX ) (65535 * 8 bytes) is
    // not a multiple of 16; mY therefore begins 8 bytes past a 16-byte
    // boundary even though the struct itself is declared 16-byte aligned.
    __declspec( align( 16 ) ) struct VectorizedXYZ
    {
        double mX[ MaxEntities ];
        double mY[ MaxEntities ];
        double mZ[ MaxEntities ];

        // Clears all three arrays to zero bytes.
        VectorizedXYZ()
        {
            memset( mX, 0, sizeof( mX ) );
            memset( mY, 0, sizeof( mY ) );
            memset( mZ, 0, sizeof( mZ ) );
        }
    };

    // Computes mPos + mVel * dt for every element of every axis and stores
    // the result in mTmp; mPos and mVel are only read.
    //
    // dt: factor the velocity is multiplied by before it is added.
    void Update( double dt )
    {
        // The marker below is the question author's annotation; it is kept as
        // a comment so that the snippet stays valid C++.
        for ( int32_t i = 0; i < MaxEntities; ++i ) // <== 1200
        {
            mTmp.mX[ i ] = mPos.mX[ i ] + mVel.mX[ i ] * dt;
            mTmp.mY[ i ] = mPos.mY[ i ] + mVel.mY[ i ] * dt;
            mTmp.mZ[ i ] = mPos.mZ[ i ] + mVel.mZ[ i ] * dt;
        }
    }

private:
    VectorizedXYZ mTmp; // destination of Update()
    VectorizedXYZ mPos; // read by Update()
    VectorizedXYZ mVel; // read by Update()
};
编辑:根据 http://blogs.msdn.com/b/nativeconcurrency/archive/2012/05/08/auto-vectorizer-in-visual-studio-11-rules-for-loop-body.aspx 判断,这似乎正是 "Example 1 – Embarrassingly Parallel" 那一类情形,但编译器的表现却像是认为这些数组之间可能存在别名(aliasing),这让我感到困惑。
Edit2:如果有人能解释自动矢量化为什么会在这样一个看似简单的例子中失败,那就太好了;不过在折腾了一段时间之后,我决定自己动手,改用 SSE2 内在函数(intrinsics)手写矢量化:
// Advances every position in place: mPos += mVel * dt for each axis,
// processing two doubles per SSE2 operation.
//
// dt: factor each velocity is multiplied by before it is added to the
//     corresponding position.
//
// Unaligned loads/stores are used deliberately. With the VectorizedXYZ layout
// shown in the class definition (three consecutive double[65535] arrays) the
// mY array starts 8 bytes past a 16-byte boundary, so the aligned
// _mm_load_pd/_mm_store_pd forms would fault on it.
void PhysicsSystem::Update( Real dt )
{
    // Broadcast dt into both lanes.
    const __m128d mdt = _mm_set1_pd( dt );
    const size_t count = static_cast< size_t >( MaxEntities );

    size_t i = 0;

    // advance by 2 since we can do 2 at a time at double precision in __m128d;
    // stop before a trailing odd element so a pair never runs past the arrays
    for ( ; i + 1 < count; i += 2 )
    {
        const __m128d posX = _mm_loadu_pd( &mPos.mX[ i ] );
        const __m128d posY = _mm_loadu_pd( &mPos.mY[ i ] );
        const __m128d posZ = _mm_loadu_pd( &mPos.mZ[ i ] );

        const __m128d velFrameX = _mm_mul_pd( _mm_loadu_pd( &mVel.mX[ i ] ), mdt );
        const __m128d velFrameY = _mm_mul_pd( _mm_loadu_pd( &mVel.mY[ i ] ), mdt );
        const __m128d velFrameZ = _mm_mul_pd( _mm_loadu_pd( &mVel.mZ[ i ] ), mdt );

        // Each axis adds its OWN position (the Y and Z stores previously
        // reused posX).
        _mm_storeu_pd( &mPos.mX[ i ], _mm_add_pd( posX, velFrameX ) );
        _mm_storeu_pd( &mPos.mY[ i ], _mm_add_pd( posY, velFrameY ) );
        _mm_storeu_pd( &mPos.mZ[ i ], _mm_add_pd( posZ, velFrameZ ) );
    }

    // Scalar tail: handles the last element when the count is odd.
    for ( ; i < count; ++i )
    {
        mPos.mX[ i ] += mVel.mX[ i ] * dt;
        mPos.mY[ i ] += mVel.mY[ i ] * dt;
        mPos.mZ[ i ] += mVel.mZ[ i ] * dt;
    }
}
不确定您的编译器是否支持它,但是为了强制执行一些适当的矢量化,您可以轻而易举地这样做:
// Computes mTmp = mPos + mVel * dt for every element of every axis, asking
// the compiler to vectorize the loop through an OpenMP SIMD directive.
//
// dt: factor each velocity is multiplied by before it is added.
//
// The directive only takes effect when the compiler supports the OpenMP
// `simd` construct (introduced in OpenMP 4.0) and OpenMP is enabled.
void PhysicsSystem::Update( double dt ) {
    // Plain local pointers: the loop body no longer goes through `this`.
    double *tx = mTmp.mX, *ty = mTmp.mY, *tz = mTmp.mZ;
    const double *px = mPos.mX, *py = mPos.mY, *pz = mPos.mZ;
    const double *vx = mVel.mX, *vy = mVel.mY, *vz = mVel.mZ;

    // No aligned( ... ) clause on purpose: with three consecutive
    // double[65535] arrays per struct, the mY arrays start 8 bytes past a
    // 16-byte boundary, so promising SIMD alignment for ty/py/vy would be
    // false and could produce faulting aligned accesses.
    #pragma omp simd
    for ( int i = 0; i < MaxEntities; ++i ) {
        tx[ i ] = px[ i ] + vx[ i ] * dt;
        ty[ i ] = py[ i ] + vy[ i ] * dt;
        tz[ i ] = pz[ i ] + vz[ i ] * dt;
    }
}
然后您需要在编译器中启用 OpenMP 支持,该指令才会生效。