相同的 AVX2 程序在 gcc 和 msvc 中产生不同的结果

Same AVX2 program yields different result in gcc & msvc

我正在尝试使用 AVX2 增加 md5 哈希的吞吐量。我使用了 simd_md5 用户提供的 simd_md5 库..

在 msvc2013 上,我得到了所有 8 个缓冲区的期望结果,但在 linux 上,当我 运行 相同的代码仅对前 4 个缓冲区匹配时,接下来的四个缓冲区结果以某种方式移动。

我在此处附上了与示例相同的代码

#ifndef MD5_AVX2_H
#define MD5_AVX2_H

#include <immintrin.h>
#include "md5_common.h"

typedef struct {
    __m256i state[8]; /* state (ABCD) */
    unsigned long int count[2]; /* number of bits, modulo 2^64 (lsb first) */
    unsigned char buffer1[64];
    unsigned char buffer2[64];
    unsigned char buffer3[64];
    unsigned char buffer4[64];
    unsigned char buffer5[64];
    unsigned char buffer6[64];
    unsigned char buffer7[64];
    unsigned char buffer8[64];
} MD5_AVX2_CTX;

#define AVX2_ROTATE_LEFT(x, n) _mm256_or_si256(_mm256_slli_epi32((x), (n)), _mm256_srli_epi32((x), (32-(n))))

#define AVX2_F(x, y, z) _mm256_or_si256(_mm256_and_si256((x), (y)), _mm256_andnot_si256((x), (z)))
#define AVX2_G(x, y, z) _mm256_or_si256(_mm256_and_si256((x), (z)), _mm256_andnot_si256((z), (y)))
#define AVX2_H(x, y, z) _mm256_xor_si256((x), _mm256_xor_si256((y), (z)))
#define AVX2_I(x, y, z) _mm256_xor_si256((y), _mm256_or_si256((x), _mm256_andnot_si256((z), _mm256_cmpeq_epi32((z), (z)))))

#define AVX2_STEP(f, a, b, c, d, x, s, ac) \
{ \
    (a) = _mm256_add_epi32((a), _mm256_add_epi32(_mm256_add_epi32(f((b), (c), (d)), (x)),(ac))); \
    (a) = AVX2_ROTATE_LEFT((a), (s)); \
    (a) = _mm256_add_epi32((a), (b)); \
}

void md5_avx2_init(MD5_AVX2_CTX *context);
void md5_avx2_update1(MD5_AVX2_CTX *context, unsigned char *input, unsigned int inputLen);
void md5_avx2_update8(MD5_AVX2_CTX *context, unsigned char *input1, unsigned char *input2, unsigned char *input3, unsigned char *input4, unsigned char *input5, unsigned char *input6, unsigned char *input7, unsigned char *input8, unsigned int inputLen);
void md5_avx2_final(unsigned char digests[8][16], MD5_AVX2_CTX *context);

#endif

c文件


#include "md5_common.h"
#include "md5_avx2.h"

static void avx2_decode(__m256i *output, unsigned char *input1, unsigned char *input2, unsigned char *input3, unsigned char *input4, unsigned char *input5, unsigned char *input6, unsigned char *input7, unsigned char *input8, unsigned int len)
{
    unsigned int i, j;
    for (i = 0, j = 0; j < len; i++, j += 4)
    {
        output[i] = _mm256_set_epi32(
            ((unsigned long int)input8[j]) | (((unsigned long int)input8[j+1]) << 8) | (((unsigned long int)input8[j+2]) << 16) | (((unsigned long int)input8[j+3]) << 24),
            ((unsigned long int)input7[j]) | (((unsigned long int)input7[j+1]) << 8) | (((unsigned long int)input7[j+2]) << 16) | (((unsigned long int)input7[j+3]) << 24),
            ((unsigned long int)input6[j]) | (((unsigned long int)input6[j+1]) << 8) | (((unsigned long int)input6[j+2]) << 16) | (((unsigned long int)input6[j+3]) << 24),
            ((unsigned long int)input5[j]) | (((unsigned long int)input5[j+1]) << 8) | (((unsigned long int)input5[j+2]) << 16) | (((unsigned long int)input5[j+3]) << 24),
            ((unsigned long int)input4[j]) | (((unsigned long int)input4[j+1]) << 8) | (((unsigned long int)input4[j+2]) << 16) | (((unsigned long int)input4[j+3]) << 24),
            ((unsigned long int)input3[j]) | (((unsigned long int)input3[j+1]) << 8) | (((unsigned long int)input3[j+2]) << 16) | (((unsigned long int)input3[j+3]) << 24),
            ((unsigned long int)input2[j]) | (((unsigned long int)input2[j+1]) << 8) | (((unsigned long int)input2[j+2]) << 16) | (((unsigned long int)input2[j+3]) << 24),
            ((unsigned long int)input1[j]) | (((unsigned long int)input1[j+1]) << 8) | (((unsigned long int)input1[j+2]) << 16) | (((unsigned long int)input1[j+3]) << 24)
            );
    }
}

static void md5_avx2_transform(__m256i *state, unsigned char block1[64], unsigned char block2[64], unsigned char block3[64], unsigned char block4[64], unsigned char block5[64], unsigned char block6[64], unsigned char block7[64], unsigned char block8[64])
{
    __m256i a = state[0], b = state[1], c = state[2], d = state[3], x[16];

    avx2_decode(x, block1, block2, block3, block4, block5, block6, block7, block8, 64);

    /* Round 1 */
    AVX2_STEP(AVX2_F, a, b, c, d, x[ 0], S11, _mm256_set1_epi32(0xd76aa478)); /* 1 */
    AVX2_STEP(AVX2_F, d, a, b, c, x[ 1], S12, _mm256_set1_epi32(0xe8c7b756)); /* 2 */
    AVX2_STEP(AVX2_F, c, d, a, b, x[ 2], S13, _mm256_set1_epi32(0x242070db)); /* 3 */
    AVX2_STEP(AVX2_F, b, c, d, a, x[ 3], S14, _mm256_set1_epi32(0xc1bdceee)); /* 4 */
    AVX2_STEP(AVX2_F, a, b, c, d, x[ 4], S11, _mm256_set1_epi32(0xf57c0faf)); /* 5 */
    AVX2_STEP(AVX2_F, d, a, b, c, x[ 5], S12, _mm256_set1_epi32(0x4787c62a)); /* 6 */
    AVX2_STEP(AVX2_F, c, d, a, b, x[ 6], S13, _mm256_set1_epi32(0xa8304613)); /* 7 */
    AVX2_STEP(AVX2_F, b, c, d, a, x[ 7], S14, _mm256_set1_epi32(0xfd469501)); /* 8 */
    AVX2_STEP(AVX2_F, a, b, c, d, x[ 8], S11, _mm256_set1_epi32(0x698098d8)); /* 9 */
    AVX2_STEP(AVX2_F, d, a, b, c, x[ 9], S12, _mm256_set1_epi32(0x8b44f7af)); /* 10 */
    AVX2_STEP(AVX2_F, c, d, a, b, x[10], S13, _mm256_set1_epi32(0xffff5bb1)); /* 11 */
    AVX2_STEP(AVX2_F, b, c, d, a, x[11], S14, _mm256_set1_epi32(0x895cd7be)); /* 12 */
    AVX2_STEP(AVX2_F, a, b, c, d, x[12], S11, _mm256_set1_epi32(0x6b901122)); /* 13 */
    AVX2_STEP(AVX2_F, d, a, b, c, x[13], S12, _mm256_set1_epi32(0xfd987193)); /* 14 */
    AVX2_STEP(AVX2_F, c, d, a, b, x[14], S13, _mm256_set1_epi32(0xa679438e)); /* 15 */
    AVX2_STEP(AVX2_F, b, c, d, a, x[15], S14, _mm256_set1_epi32(0x49b40821)); /* 16 */

    /* Round 2 */
    AVX2_STEP(AVX2_G, a, b, c, d, x[ 1], S21, _mm256_set1_epi32(0xf61e2562)); /* 17 */
    AVX2_STEP(AVX2_G, d, a, b, c, x[ 6], S22, _mm256_set1_epi32(0xc040b340)); /* 18 */
    AVX2_STEP(AVX2_G, c, d, a, b, x[11], S23, _mm256_set1_epi32(0x265e5a51)); /* 19 */
    AVX2_STEP(AVX2_G, b, c, d, a, x[ 0], S24, _mm256_set1_epi32(0xe9b6c7aa)); /* 20 */
    AVX2_STEP(AVX2_G, a, b, c, d, x[ 5], S21, _mm256_set1_epi32(0xd62f105d)); /* 21 */
    AVX2_STEP(AVX2_G, d, a, b, c, x[10], S22, _mm256_set1_epi32( 0x2441453)); /* 22 */
    AVX2_STEP(AVX2_G, c, d, a, b, x[15], S23, _mm256_set1_epi32(0xd8a1e681)); /* 23 */
    AVX2_STEP(AVX2_G, b, c, d, a, x[ 4], S24, _mm256_set1_epi32(0xe7d3fbc8)); /* 24 */
    AVX2_STEP(AVX2_G, a, b, c, d, x[ 9], S21, _mm256_set1_epi32(0x21e1cde6)); /* 25 */
    AVX2_STEP(AVX2_G, d, a, b, c, x[14], S22, _mm256_set1_epi32(0xc33707d6)); /* 26 */
    AVX2_STEP(AVX2_G, c, d, a, b, x[ 3], S23, _mm256_set1_epi32(0xf4d50d87)); /* 27 */
    AVX2_STEP(AVX2_G, b, c, d, a, x[ 8], S24, _mm256_set1_epi32(0x455a14ed)); /* 28 */
    AVX2_STEP(AVX2_G, a, b, c, d, x[13], S21, _mm256_set1_epi32(0xa9e3e905)); /* 29 */
    AVX2_STEP(AVX2_G, d, a, b, c, x[ 2], S22, _mm256_set1_epi32(0xfcefa3f8)); /* 30 */
    AVX2_STEP(AVX2_G, c, d, a, b, x[ 7], S23, _mm256_set1_epi32(0x676f02d9)); /* 31 */
    AVX2_STEP(AVX2_G, b, c, d, a, x[12], S24, _mm256_set1_epi32(0x8d2a4c8a)); /* 32 */

    /* Round 3 */
    AVX2_STEP(AVX2_H, a, b, c, d, x[ 5], S31, _mm256_set1_epi32(0xfffa3942)); /* 33 */
    AVX2_STEP(AVX2_H, d, a, b, c, x[ 8], S32, _mm256_set1_epi32(0x8771f681)); /* 34 */
    AVX2_STEP(AVX2_H, c, d, a, b, x[11], S33, _mm256_set1_epi32(0x6d9d6122)); /* 35 */
    AVX2_STEP(AVX2_H, b, c, d, a, x[14], S34, _mm256_set1_epi32(0xfde5380c)); /* 36 */
    AVX2_STEP(AVX2_H, a, b, c, d, x[ 1], S31, _mm256_set1_epi32(0xa4beea44)); /* 37 */
    AVX2_STEP(AVX2_H, d, a, b, c, x[ 4], S32, _mm256_set1_epi32(0x4bdecfa9)); /* 38 */
    AVX2_STEP(AVX2_H, c, d, a, b, x[ 7], S33, _mm256_set1_epi32(0xf6bb4b60)); /* 39 */
    AVX2_STEP(AVX2_H, b, c, d, a, x[10], S34, _mm256_set1_epi32(0xbebfbc70)); /* 40 */
    AVX2_STEP(AVX2_H, a, b, c, d, x[13], S31, _mm256_set1_epi32(0x289b7ec6)); /* 41 */
    AVX2_STEP(AVX2_H, d, a, b, c, x[ 0], S32, _mm256_set1_epi32(0xeaa127fa)); /* 42 */
    AVX2_STEP(AVX2_H, c, d, a, b, x[ 3], S33, _mm256_set1_epi32(0xd4ef3085)); /* 43 */
    AVX2_STEP(AVX2_H, b, c, d, a, x[ 6], S34, _mm256_set1_epi32( 0x4881d05)); /* 44 */
    AVX2_STEP(AVX2_H, a, b, c, d, x[ 9], S31, _mm256_set1_epi32(0xd9d4d039)); /* 45 */
    AVX2_STEP(AVX2_H, d, a, b, c, x[12], S32, _mm256_set1_epi32(0xe6db99e5)); /* 46 */
    AVX2_STEP(AVX2_H, c, d, a, b, x[15], S33, _mm256_set1_epi32(0x1fa27cf8)); /* 47 */
    AVX2_STEP(AVX2_H, b, c, d, a, x[ 2], S34, _mm256_set1_epi32(0xc4ac5665)); /* 48 */

    /* Round 4 */
    AVX2_STEP(AVX2_I, a, b, c, d, x[ 0], S41, _mm256_set1_epi32(0xf4292244)); /* 49 */
    AVX2_STEP(AVX2_I, d, a, b, c, x[ 7], S42, _mm256_set1_epi32(0x432aff97)); /* 50 */
    AVX2_STEP(AVX2_I, c, d, a, b, x[14], S43, _mm256_set1_epi32(0xab9423a7)); /* 51 */
    AVX2_STEP(AVX2_I, b, c, d, a, x[ 5], S44, _mm256_set1_epi32(0xfc93a039)); /* 52 */
    AVX2_STEP(AVX2_I, a, b, c, d, x[12], S41, _mm256_set1_epi32(0x655b59c3)); /* 53 */
    AVX2_STEP(AVX2_I, d, a, b, c, x[ 3], S42, _mm256_set1_epi32(0x8f0ccc92)); /* 54 */
    AVX2_STEP(AVX2_I, c, d, a, b, x[10], S43, _mm256_set1_epi32(0xffeff47d)); /* 55 */
    AVX2_STEP(AVX2_I, b, c, d, a, x[ 1], S44, _mm256_set1_epi32(0x85845dd1)); /* 56 */
    AVX2_STEP(AVX2_I, a, b, c, d, x[ 8], S41, _mm256_set1_epi32(0x6fa87e4f)); /* 57 */
    AVX2_STEP(AVX2_I, d, a, b, c, x[15], S42, _mm256_set1_epi32(0xfe2ce6e0)); /* 58 */
    AVX2_STEP(AVX2_I, c, d, a, b, x[ 6], S43, _mm256_set1_epi32(0xa3014314)); /* 59 */
    AVX2_STEP(AVX2_I, b, c, d, a, x[13], S44, _mm256_set1_epi32(0x4e0811a1)); /* 60 */
    AVX2_STEP(AVX2_I, a, b, c, d, x[ 4], S41, _mm256_set1_epi32(0xf7537e82)); /* 61 */
    AVX2_STEP(AVX2_I, d, a, b, c, x[11], S42, _mm256_set1_epi32(0xbd3af235)); /* 62 */
    AVX2_STEP(AVX2_I, c, d, a, b, x[ 2], S43, _mm256_set1_epi32(0x2ad7d2bb)); /* 63 */
    AVX2_STEP(AVX2_I, b, c, d, a, x[ 9], S44, _mm256_set1_epi32(0xeb86d391)); /* 64 */

    state[0] = _mm256_add_epi32(state[0], a);
    state[1] = _mm256_add_epi32(state[1], b);
    state[2] = _mm256_add_epi32(state[2], c);
    state[3] = _mm256_add_epi32(state[3], d);

    /* Zeroize sensitive information.
     */
    //memset((unsigned char *)x, 0, sizeof (x));
}

void md5_avx2_init(MD5_AVX2_CTX *context)
{
    context->count[0] = context->count[1] = 0;
    /* Load magic initialization constants. */
    context->state[0] = _mm256_set1_epi32(0x67452301);
    context->state[1] = _mm256_set1_epi32(0xefcdab89);
    context->state[2] = _mm256_set1_epi32(0x98badcfe);
    context->state[3] = _mm256_set1_epi32(0x10325476);
}

void md5_avx2_update1(MD5_AVX2_CTX *context, unsigned char *input, unsigned int inputLen)
{
    md5_avx2_update8(context, input, input, input, input, input, input, input, input, inputLen);
}

void md5_avx2_update8(MD5_AVX2_CTX *context, unsigned char *input1, unsigned char *input2, unsigned char *input3, unsigned char *input4, unsigned char *input5, unsigned char *input6, unsigned char *input7, unsigned char *input8, unsigned int inputLen)
{
    unsigned int i, index, partLen;

    /* Compute number of bytes mod 64 */
    index = (unsigned int)((context->count[0] >> 3) & 0x3F);

    /* Update number of bits */
    if ((context->count[0] += ((unsigned long int)inputLen << 3)) < ((unsigned long int)inputLen << 3))
        context->count[1]++;

    context->count[1] += ((unsigned long int)inputLen >> 29);

    partLen = 64 - index;

    /* Transform as many times as possible. */
    if (inputLen >= partLen)
    {
        memcpy((unsigned char *)&context->buffer1[index], (unsigned char *)input1, partLen);
        memcpy((unsigned char *)&context->buffer2[index], (unsigned char *)input2, partLen);
        memcpy((unsigned char *)&context->buffer3[index], (unsigned char *)input3, partLen);
        memcpy((unsigned char *)&context->buffer4[index], (unsigned char *)input4, partLen);
        memcpy((unsigned char *)&context->buffer5[index], (unsigned char *)input5, partLen);
        memcpy((unsigned char *)&context->buffer6[index], (unsigned char *)input6, partLen);
        memcpy((unsigned char *)&context->buffer7[index], (unsigned char *)input7, partLen);
        memcpy((unsigned char *)&context->buffer8[index], (unsigned char *)input8, partLen);

        md5_avx2_transform(context->state, context->buffer1, context->buffer2, context->buffer3, context->buffer4, context->buffer5, context->buffer6, context->buffer7, context->buffer8);

        for (i = partLen; i + 63 < inputLen; i += 64)
            md5_avx2_transform(context->state, &input1[i], &input2[i], &input3[i], &input4[i], &input5[i], &input6[i], &input7[i], &input8[i]);

        index = 0;
    }
    else
    {
        i = 0;
    }

    /* Buffer remaining input */
    memcpy((unsigned char *)&context->buffer1[index], (unsigned char *)&input1[i], inputLen-i);
    memcpy((unsigned char *)&context->buffer2[index], (unsigned char *)&input2[i], inputLen-i);
    memcpy((unsigned char *)&context->buffer3[index], (unsigned char *)&input3[i], inputLen-i);
    memcpy((unsigned char *)&context->buffer4[index], (unsigned char *)&input4[i], inputLen-i);
    memcpy((unsigned char *)&context->buffer5[index], (unsigned char *)&input5[i], inputLen-i);
    memcpy((unsigned char *)&context->buffer6[index], (unsigned char *)&input6[i], inputLen-i);
    memcpy((unsigned char *)&context->buffer7[index], (unsigned char *)&input7[i], inputLen-i);
    memcpy((unsigned char *)&context->buffer8[index], (unsigned char *)&input8[i], inputLen-i);
}

void md5_avx2_final(unsigned char digests[8][16], MD5_AVX2_CTX *context)
{
    unsigned char bits[8];
    unsigned int index, padLen;
    int i, j;

    /* Save number of bits */
    encode(bits, context->count, 8);

    /* Pad out to 56 mod 64. */
    index = (unsigned int)((context->count[0] >> 3) & 0x3f);
    padLen = (index < 56) ? (56 - index) : (120 - index);
    md5_avx2_update1(context, PADDING, padLen);

    /* Append length (before padding) */
    md5_avx2_update1(context, bits, 8);

    /* Store state in digest */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 4; j++)
            encode(&digests[i][j*4], &((unsigned long int *) &context->state[j])[i], 4);

    /* Zeroize sensitive information. */
    //memset((unsigned char *)context, 0, sizeof (*context));
}

md5_common.h

#ifndef MD5_COMMON_H
#define MD5_COMMON_H

#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21

static unsigned char PADDING[64] =
{
    0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

void encode(unsigned char *output, unsigned long int *input, unsigned int len);

#endif

主文件

int main(int argc, char *argv[])
{
    char *string = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
    int len = strlen(string);

    /* Test strings. */
    printf("TEST STRINGS\n");

        unsigned char digest[8][16];
    int i, j;

    MD5_AVX2_CTX context;       
    md5_avx2_init(&context);
    md5_avx2_update1(&context, string, len);
    md5_avx2_final(digest, &context);

    printf("AVX2 (\"%s\") = ", string);
    for (i = 0; i < 8; i++)
    {
        for (j = 0; j < 16; j++)
            printf ("%02x", digest[i][j]);
        printf("\n");
    }
}

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

这与 MSVC 2013 v120 c 编译器一起正常工作

但是使用 gcc 7.2 编译器我得到

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

44077f4856c7d6c519fce5dfc0fb1fcf

56c7d6c519fce5dfc0fb1fcf00000000

56c7d6c519fce5dfc0fb1fcf00000000

56c7d6c519fce5dfc0fb1fcf00000000

56c7d6c519fce5dfc0fb1fcf00000000

有专家能帮我解释为什么相同的代码在不同编译器下的行为不同吗..没有启用优化..

我敢打赌 sizeof(long int) 是不同的。最好使用 uint64_t、uint32_t、uint8_t 等代替 long int 或 unsigned char。特别是如果你用它来操纵地址。我认为错误在这里:

encode(&digests[i][j*4], &((unsigned long int *) &context->state[j])[i], 4);

在Windows上是

encode(&digests[i][j*4], &((uint32_t*) &context->state[j])[i], 4);

在 Linux

encode(&digests[i][j*4], &((uint64_t*) &context->state[j])[i], 4);

所以你为 i >= 2 寻址过去的源数组

编辑:

我也不会直接转换为 uint64_t*。 参见:gcc, strict-aliasing, and horror stories

相反,我会使用 memcpy 复制 context->state[j]

union
{
   uint32_t elem[4];
   uint8_t buff[sizeof(elem)];
} u;
memcpy (u.buff, &context->state[j], sizeof(u.buff));

然后使用

encode(&digests[i][j*4], &u.elem[i], 4);

别忘了

#include <stdint.h>