有没有快速的方法让IBM的vncipher指令结果和Intel的一样mm_aesdec_si128?

Is there a fast way to make IBM's vncipher instruction result the same as Intels mm_aesdec_si128?

我正在移植一个应用程序,该应用程序使用 AES 加密和解密指令将一些数据从 x86 随机化到 POWER8。我用 _mm_aesdec_si128 指令撞墙了,它似乎做了一些与等效的 IBM __builtin_crypto_vncipher 不同的事情。 https://link.springer.com/content/pdf/10.1007/978-3-642-03317-9_4.pdf 的文档第 52-54 页提到它遵循 FIPS 197。 https://ibm.ent.box.com/s/jd5w15gz301s5b5dt375mshpq9c3lh4u 处的 IBM 文档第 305 页也说它遵循 FIPS197,唯一的区别是 InvMixColumns 的顺序和带轮密钥的 xor 被翻转了,但这会改变结果吗?

如果结果不同,他们怎么能说他们都遵循规范?

以下 C 程序在 x86 中运行良好,但在 ppc64 中将输出错误的 aesdec 结果。幸运的是,ppc64 中的 aesenc 按预期工作。

现在我通过使用 aesdec 的软件实现解决了这个问题,但我想在硬件中完成所有事情。

C程序:

//compile with "gcc -maes aestest.c -o aestest" in x86
//compile with "gcc -mcrypto -flax-vector-conversions aestest.c -o aestest" in power8

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#ifdef __x86_64__
#include <x86intrin.h>
__m128i aesenc(__m128i d,__m128i k){
  return _mm_aesenc_si128(d,k);
}
__m128i aesdec(__m128i d,__m128i k){
  return _mm_aesdec_si128(d,k);
}
#endif

#ifdef __PPC64__
#include <endian.h>
#include <altivec.h>
#undef vector
#undef pixel
#undef bool
typedef __vector uint8_t __m128i;

//flip vector to BE order
__m128i vrev(__m128i v){
  #if __BYTE_ORDER == __BIG_ENDIAN
  return v;
  #else
  return vec_perm(v,(__m128i){0},(__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0});
  #endif
}
__m128i aesenc(__m128i d,__m128i k){
  return vrev(__builtin_crypto_vcipher(vrev(d),vrev(k)));
}
__m128i aesdec(__m128i d,__m128i k){
  return vrev(__builtin_crypto_vncipher(vrev(d),vrev(k)));
}
#endif

void print_m128(char* msg,  __m128i v){
   uint8_t* t = (uint8_t*)&v;
   printf("%s: %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",msg,t[0],t[1],t[2],t[3], t[4],t[5],t[6],t[7], t[8],t[9],t[10],t[11], t[12],t[13],t[14],t[15]) ;
}


int main(int argc,char* argv[]){
  uint8_t msg[] = "0123456789abcde"; 
  uint8_t key1[] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255};
  uint8_t key2[] = {0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff};
  uint8_t* c;
  __m128i xmm1 = (__m128i){0};
  __m128i xmm2 = (__m128i){0};
  __m128i encR = (__m128i){0};
  __m128i decR = (__m128i){0};
//zero test
  printf("zero test\n");
  print_m128("xmm1",xmm1);
  print_m128("xmm2",xmm2);
  encR = aesenc(xmm1,xmm2);
  decR = aesdec(xmm1,xmm2);
  print_m128("enc ",encR);
  print_m128("dec ",decR);
//zero key test
  printf("zero key test\n");
  c = (uint8_t*)&xmm1;
  memcpy(c,msg,16);
  print_m128("xmm1",xmm1);
  print_m128("xmm2",xmm2);
  encR = aesenc(xmm1,xmm2);
  decR = aesdec(xmm1,xmm2);
  print_m128("enc ",encR);
  print_m128("dec ",decR);
//ff key test
  printf("ff key test\n");
  c = (uint8_t*)&xmm1;
  memcpy(c,msg,16);
  c = (uint8_t*)&xmm2;
  memcpy(c,key1,16);
  print_m128("xmm1",xmm1);
  print_m128("xmm2",xmm2);
  encR = aesenc(xmm1,xmm2);
  decR = aesdec(xmm1,xmm2);
  print_m128("enc ",encR);
  print_m128("dec ",decR);
//key test
  printf("key test\n");
  c = (uint8_t*)&xmm1;
  memcpy(c,msg,16);
  c = (uint8_t*)&xmm2;
  memcpy(c,key2,16);
  print_m128("xmm1",xmm1);
  print_m128("xmm2",xmm2);
  encR = aesenc(xmm1,xmm2);
  decR = aesdec(xmm1,xmm2);
  print_m128("enc ",encR);
  print_m128("dec ",decR);
}
#Results in x86:

zero test
xmm1: 00000000000000000000000000000000
xmm2: 00000000000000000000000000000000
enc : 63636363636363636363636363636363
dec : 52525252525252525252525252525252
zero key test
xmm1: 30313233343536373839616263646500
xmm2: 00000000000000000000000000000000
enc : 257af2b38828ceea727eb74610cbd39b
dec : a903befadbaa6d0dc8b9a78af780e18f
ff key test
xmm1: 30313233343536373839616263646500
xmm2: ffffffffffffffffffffffffffffffff
enc : da850d4c77d731158d8148b9ef342c64
dec : 56fc4105245592f237465875087f1e70
key test
xmm1: 30313233343536373839616263646500
xmm2: 00112233445566778899aabbccddeeff
enc : 256bd080cc7da89dfae71dfddc163d64
dec : a9129cc99fff0b7a40200d313b5d0f70

#Results in ppc64:

zero test
xmm1: 00000000000000000000000000000000
xmm2: 00000000000000000000000000000000
enc : 63636363636363636363636363636363
dec : 52525252525252525252525252525252
zero key test
xmm1: 30313233343536373839616263646500
xmm2: 00000000000000000000000000000000
enc : 257af2b38828ceea727eb74610cbd39b
dec : a903befadbaa6d0dc8b9a78af780e18f
ff key test
xmm1: 30313233343536373839616263646500
xmm2: ffffffffffffffffffffffffffffffff
enc : da850d4c77d731158d8148b9ef342c64
dec : 56fc4105245592f237465875087f1e70
key test
xmm1: 30313233343536373839616263646500
xmm2: 00112233445566778899aabbccddeeff
enc : 256bd080cc7da89dfae71dfddc163d64
dec : 03fc36273511a194eacea7df91b3a59e

如上所示,最后一个测试失败了。

解决方案是使用零密钥在中间进行异或步骤 return 身份,然后在最后使用真实密钥进行异或。

__m128i aesd(__m128i d,__m128i k){
  __m128i out = vrev(__builtin_crypto_vncipher(vrev(d),(__m128i){0}));
  return vec_xor(out,k);
}