__m256d TRANSPOSE4 等效?

__m256d TRANSPOSE4 Equivalent?

英特尔提供了 _MM_TRANSPOSE4_PS 宏,用于转置由四个 __m128 向量组成的 4x4 矩阵。我想用 __m256d 做同样的事情。但是,我弄不清楚如何让 _mm256_shuffle_pd 以同样的方式工作。

_MM_TRANSPOSE4_PS代码

/*
 * _MM_TRANSPOSE4_PS(row0, row1, row2, row3)
 *
 * Transposes, in place, the 4x4 float matrix whose rows are held in four
 * __m128 values. Every argument is read and then assigned to, so each one
 * must be a modifiable __m128 lvalue. Afterwards rowN holds element N of
 * each of the four original rows.
 */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) {                                      \
            /* Collect the low and the high float pairs of rows 0/1 and rows 2/3. */     \
            const __m128 lo01 = _mm_shuffle_ps((row0), (row1), _MM_SHUFFLE(1, 0, 1, 0)); \
            const __m128 hi01 = _mm_shuffle_ps((row0), (row1), _MM_SHUFFLE(3, 2, 3, 2)); \
            const __m128 lo23 = _mm_shuffle_ps((row2), (row3), _MM_SHUFFLE(1, 0, 1, 0)); \
            const __m128 hi23 = _mm_shuffle_ps((row2), (row3), _MM_SHUFFLE(3, 2, 3, 2)); \
                                                                                         \
            /* Even lanes of a pair give one output row, odd lanes give the next. */     \
            (row0) = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(2, 0, 2, 0));                \
            (row1) = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 1, 3, 1));                \
            (row2) = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(2, 0, 2, 0));                \
            (row3) = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 1, 3, 1));                \
        }

下面是我在需要它的循环中尝试实现 _MM_TRANSPOSE4_PD 的代码:

// Starting at element offset i, repeatedly load four __m256d rows from m2data,
// transpose that 4x4 tile of doubles in registers, and append the four
// transposed rows to buffer.
for (int copy = i; copy < m2.size();)
{
    // Successive loads are m2.col() doubles apart (presumably the row stride
    // of m2 -- confirm against the matrix class). _mm256_load_pd is the
    // aligned load, so each address must be 32-byte aligned.
    __m256d row0 = _mm256_load_pd(m2data + copy);
    copy += m2.col();
    __m256d row1 = _mm256_load_pd(m2data + copy);
    copy += m2.col();
    __m256d row2 = _mm256_load_pd(m2data + copy);
    copy += m2.col();
    __m256d row3 = _mm256_load_pd(m2data + copy);
    copy += m2.col();

    // The 0x44/0xEE/0x88/0xDD masks of _MM_TRANSPOSE4_PS do not carry over:
    // _mm256_shuffle_pd takes one selector bit per element and never moves
    // data across the two 128-bit halves. So the transpose is done in two
    // steps.
    //
    // Step 1: interleave element pairs inside each 128-bit half.
    //   tmp0 = [r0[0] r1[0] r0[2] r1[2]]   tmp2 = [r0[1] r1[1] r0[3] r1[3]]
    //   tmp1 = [r2[0] r3[0] r2[2] r3[2]]   tmp3 = [r2[1] r3[1] r2[3] r3[3]]
    __m256d tmp0 = _mm256_shuffle_pd(row0, row1, 0x0);
    __m256d tmp2 = _mm256_shuffle_pd(row0, row1, 0xF);
    __m256d tmp1 = _mm256_shuffle_pd(row2, row3, 0x0);
    __m256d tmp3 = _mm256_shuffle_pd(row2, row3, 0xF);

    // Step 2: recombine 128-bit halves across registers.
    //   0x20 -> low half of the first operand, low half of the second
    //   0x31 -> high half of the first operand, high half of the second
    row0 = _mm256_permute2f128_pd(tmp0, tmp1, 0x20);
    row1 = _mm256_permute2f128_pd(tmp2, tmp3, 0x20);
    row2 = _mm256_permute2f128_pd(tmp0, tmp1, 0x31);
    row3 = _mm256_permute2f128_pd(tmp2, tmp3, 0x31);

    // Append the transposed rows; counter advances by one slot per store.
    _mm256_store_pd(reinterpret_cast<double*>(buffer + counter++),row0);
    _mm256_store_pd(reinterpret_cast<double*>(buffer + counter++),row1);
    _mm256_store_pd(reinterpret_cast<double*>(buffer + counter++),row2);
    _mm256_store_pd(reinterpret_cast<double*>(buffer + counter++),row3);
}

这是我找到的解决方案的宏等价物。

  /*
   * _MM_TRANSPOSE4_PD(row0, row1, row2, row3)
   *
   * Transposes, in place, the 4x4 double matrix whose rows are held in four
   * __m256d values. Every argument is read and then assigned to, so each one
   * must be a modifiable __m256d lvalue.
   *
   * _mm256_shuffle_pd only selects within each 128-bit half, so elements are
   * first interleaved per half and the halves are then recombined with
   * _mm256_permute2f128_pd.
   */
  #define _MM_TRANSPOSE4_PD(row0,row1,row2,row3)                                          \
                {                                                                         \
                    /* even01 = [r0[0] r1[0] r0[2] r1[2]], odd01 = [r0[1] r1[1] r0[3] r1[3]] */ \
                    const __m256d even01 = _mm256_shuffle_pd((row0), (row1), 0x0);        \
                    const __m256d odd01  = _mm256_shuffle_pd((row0), (row1), 0xF);        \
                    const __m256d even23 = _mm256_shuffle_pd((row2), (row3), 0x0);        \
                    const __m256d odd23  = _mm256_shuffle_pd((row2), (row3), 0xF);        \
                                                                                          \
                    /* 0x20 joins the two low halves, 0x31 joins the two high halves. */  \
                    (row0) = _mm256_permute2f128_pd(even01, even23, 0x20);                \
                    (row1) = _mm256_permute2f128_pd(odd01, odd23, 0x20);                  \
                    (row2) = _mm256_permute2f128_pd(even01, even23, 0x31);                \
                    (row3) = _mm256_permute2f128_pd(odd01, odd23, 0x31);                  \
                }