C++ 的抗侧信道数学函数
Side-Channel-Resistant Math Functions for C++
我正在处理一个处理秘密数据的 SGX 项目,在某些时候,我需要计算一个浮点数的自然对数。评估过程应该是抗侧信道的,这意味着它的 运行 时间和内存访问模式应该独立于它的输入和输出。
有这样的实现吗?文献中是否解决了该问题?
标签sgx suggests that your hardware platform is a very recent x86_64 Intel CPU that also has support for AVX2 and the FMA operation. The key to an implementation that is invariant in run time and memory access pattern is the avoidance of branches. If the compiler cooperates and converts simple conditional assignments into appropriate conditional move or blend instructions, the implementation of logf()
below should work fine. However, relying on compiler code generation is brittle, and out of the various compilers offered by the Compiler Explorer, I could only get clang to deliver something close to the desired result,除了一个分支(a = t
在非规范输入处理中的条件赋值外,所有分支都已转换。
因此,您可能需要通过适当的指令而不是分支代码(例如使用内在函数)来手动执行结果选择。
正如 EOF 在评论中指出的那样,消除分支是一个必要但不充分的条件,因为单个浮点运算也可以有变量 运行time,即使它们只是加法,乘法和 FMA。这在快速处理特殊操作数(例如次正规(通常称为非正规))的体系结构上不是问题,例如GPU。但是,这是我使用和处理过的 x86 处理器上的一个问题。通常,最严重的可变性是由于非规范化结果而产生的,而来自非规范化源操作数的影响要小得多。
下面显示的代码包含使用原始参数 a
作为源操作数的多个操作,这使其面临由于非正规输入导致的 运行 时间变化的风险。是否应针对打算部署代码的特定平台仔细测试 运行 时间上升到噪声水平之上的潜在变化(例如,由于调用函数时管道状态的变化) .
#include <cstdint>
#include <cstring>
#include <cmath>
int __float_as_int (float a)
{
int r;
memcpy (&r, &a, sizeof(r));
return r;
}
float __int_as_float (int a)
{
float r;
memcpy (&r, &a, sizeof(r));
return r;
}
/* maximum error 0.85417 ulp */
float my_logf (float a)
{
float m, r, s, t, i, f, u;
int32_t e;
/* result for exceptional cases */
u = a + a; // silence NaNs if necessary
if (a < 0.0f) u = 0.0f / 0.0f; // NaN
if (a == 0.0f) u = -1.0f / 0.0f; // -Inf
/* result for non-exceptional cases */
i = 0.0f;
/* fix up denormal input if needed */
t = a * 8388608.0f;
if (a < 1.17549435e-38f) {
a = t;
i = -23.0f;
}
/* split argument into exponent and mantissa parts */
e = (__float_as_int (a) - 0x3f2aaaab) & 0xff800000;
m = __int_as_float (__float_as_int (a) - e);
i = fmaf ((float)e, 1.19209290e-7f, i);
/* m in [2/3, 4/3] */
f = m - 1.0f;
s = f * f;
/* Compute log1p(f) for f in [-1/3, 1/3] */
r = -0.130310059f;
t = 0.140869141f;
r = fmaf (r, s, -0.121489234f);
t = fmaf (t, s, 0.139809728f);
r = fmaf (r, s, -0.166844666f);
t = fmaf (t, s, 0.200121239f);
r = fmaf (r, s, -0.249996305f);
r = fmaf (t, f, r);
r = fmaf (r, f, 0.333331943f);
r = fmaf (r, f, -0.500000000f);
r = fmaf (r, s, f);
r = fmaf (i, 0.693147182f, r); // log(2)
/* late selection between exceptional and non-exceptional result */
if (!((a > 0.0f) && (a <= 3.40282347e+38f))) r = u;
return r;
}
上面确定的潜在问题可以通过在对数计算中执行特殊情况处理和基于可移植整数的代码选择结果来解决。明显的权衡是性能损失。非规范参数的处理需要基于前导零 (CLZ) 计数的规范化。虽然 x86 处理器有这方面的说明,但它们可能无法从 C++ 以可移植的方式访问。但是具有不变 运行time 的可移植实现可以直接构建。这导致了一个无分支的实现,我希望它能很好地与大多数编译器一起工作,但仔细检查生成的机器代码将是必不可少的。我使用 Compiler Explorer 来验证它是否按需要编译 gcc 11.1 and clang 12.0.1
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
/* reinterpret bit pattern of IEEE-754 binary32 as a 32-bit unsigned integer */
int32_t float_as_int32 (float a)
{
uint32_t r;
memcpy (&r, &a, sizeof r);
return r;
}
/* reinterpret bit pattern of a 32-bit unsigned integer as IEEE-754 binary32 */
float int32_as_float (int32_t a)
{
float r;
memcpy (&r, &a, sizeof r);
return r;
}
/* branch free implementation of ((cond) ? a : b). cond must be in {0,1} */
int32_t mux (int cond, int32_t a, int32_t b)
{
return (1 - cond) * b + cond * a;
}
/* leading zero count with invariant runtime */
int clz (uint32_t a)
{
// Algorithm by aqrit,
int n = 158 - (((uint32_t)float_as_int32 ((float)(int32_t)(a & ~(a >> 1)))) >> 23);
n = mux (n < 0, 0, n); // clamp below
n = mux (n > 32, 32, n); // clamp above
return n;
}
/* Compute natural logarithm with a maximum error of 0.85089 ulp */
float my_logf (float a)
{
float m, r, s, t, i, f;
int32_t e, ia, ii, it, iu, im, shift, excp;
const int32_t abs_mask = 0x7fffffffu;
const int32_t qnan_bit = 0x00400000u;
const int32_t pos_infty = 0x7f800000u;
const int32_t neg_infty = 0xff800000u;
const int32_t indefinite = 0xffc00000u;
const int32_t zero_float = 0x00000000u; // 0.0f
const int32_t one_float = 0x3f800000u; // 1.0f
const int32_t tiny_float = 0x00800000u; // 1.17549435e-38f
const int32_t huge_float = 0x7f7fffffu; // 3.40282347e+38f
ia = float_as_int32 (a);
/* result for exceptional cases */
iu = mux ((int32_t)ia < 0, indefinite, ia); // return QNaN INDEFINITE
iu = mux ((ia & abs_mask) == 0, neg_infty, iu); // return -Inf
iu = mux ((ia & abs_mask) > pos_infty, ia | qnan_bit, iu); // convert to QNaN
/* result for non-exceptional cases */
shift = clz (ia) - 8;
it = (ia << shift) + ((23 - shift) << 23);
ii = mux (ia < tiny_float, -23, 0);
it = mux (ia < tiny_float, it, ia);
/* split argument into exponent and mantissa parts */
e = (it - 0x3f2aaaab) & 0xff800000;
m = int32_as_float (it - e);
i = fmaf ((float)e, 1.19209290e-7f, (float)ii);
/* m in [2/3, 4/3] */
f = m - 1.0f;
s = f * f;
/* Compute log1p(f) for f in [-1/3, 1/3] */
r = -0.130310059f;
t = 0.140869141f;
r = fmaf (r, s, -0.121483363f);
t = fmaf (t, s, 0.139814854f);
r = fmaf (r, s, -0.166846141f);
t = fmaf (t, s, 0.200120345f);
r = fmaf (r, s, -0.249996200f);
r = fmaf (t, f, r);
r = fmaf (r, f, 0.333331972f);
r = fmaf (r, f, -0.500000000f);
r = fmaf (r, s, f);
r = fmaf (i, 0.693147182f, r); // log(2)
/* late selection between exceptional and non-exceptional result */
excp = ((uint32_t)ia - 1) > ((uint32_t)huge_float - 1);
iu = mux (excp, iu, zero_float);
im = mux (excp, zero_float, one_float);
r = fmaf (int32_as_float (im), r, int32_as_float (iu));
return r;
}
/* reinterpret bit pattern of IEEE-754 binary64 as a 64-bit unsigned integer */
uint64_t double_as_uint64 (double a)
{
uint64_t r;
memcpy (&r, &a, sizeof r);
return r;
}
double floatUlpErr (float res, double ref)
{
uint64_t i, j, err;
int expoRef;
/* ulp error cannot be computed if either operand is NaN, infinity, zero */
if (isnan (res) || isnan (ref) || isinf (res) || isinf (ref) ||
(res == 0.0f) || (ref == 0.0f)) {
return 0.0;
}
/* Convert the float result to an "extended float". This is like a float
with 56 instead of 24 effective mantissa bits
*/
i = ((uint64_t)(uint32_t)float_as_int32 (res)) << 32;
/* Convert the double reference to an "extended float". If the reference is
>= 2^129, we need to clamp yo the maximum "extended float". If reference
is < 2^-126, we need to denormalize because of float's limited exponent
range.
*/
expoRef = (int)(((double_as_uint64(ref) >> 52) & 0x7ff) - 1023);
if (expoRef >= 129) {
j = (double_as_uint64(ref) & 0x8000000000000000ULL) |
0x7fffffffffffffffULL;
} else if (expoRef < -126) {
j = ((double_as_uint64(ref) << 11) | 0x8000000000000000ULL) >> 8;
j = j >> (-(expoRef + 126));
j = j | (double_as_uint64(ref) & 0x8000000000000000ULL);
} else {
j = ((double_as_uint64(ref) << 11) & 0x7fffffffffffffffULL) >> 8;
j = j | ((uint64_t)(expoRef + 127) << 55);
j = j | (double_as_uint64(ref) & 0x8000000000000000ULL);
}
err = (i < j) ? (j - i) : (i - j);
return err / 4294967296.0;
}
int main (void)
{
uint32_t diff, refi, resi, argi = 0;
float reff, res, arg;
double ref, ulp, maxulp = 0;
do {
arg = int32_as_float (argi);
ref = log ((double)arg);
reff = (float)ref;
res = my_logf (arg);
ulp = floatUlpErr (res, ref);
if (ulp > maxulp) maxulp = ulp;
resi = float_as_int32 (res);
refi = float_as_int32 (reff);
diff = (resi > refi) ? (resi - refi) : (refi - resi);
if (diff > 1) {
printf ("error: arg=%15.6a res=%15.6a ref=%15.6a\n", arg, res, ref);
return EXIT_FAILURE;
}
argi++;
} while (argi);
printf ("maximum ulp error: %.5f\n", maxulp);
return EXIT_SUCCESS;
}
我正在处理一个处理秘密数据的 SGX 项目,在某些时候,我需要计算一个浮点数的自然对数。评估过程应该是抗侧信道的,这意味着它的 运行 时间和内存访问模式应该独立于它的输入和输出。
有这样的实现吗?文献中是否解决了该问题?
标签sgx suggests that your hardware platform is a very recent x86_64 Intel CPU that also has support for AVX2 and the FMA operation. The key to an implementation that is invariant in run time and memory access pattern is the avoidance of branches. If the compiler cooperates and converts simple conditional assignments into appropriate conditional move or blend instructions, the implementation of logf()
below should work fine. However, relying on compiler code generation is brittle, and out of the various compilers offered by the Compiler Explorer, I could only get clang to deliver something close to the desired result,除了一个分支(a = t
在非规范输入处理中的条件赋值外,所有分支都已转换。
因此,您可能需要通过适当的指令而不是分支代码(例如使用内在函数)来手动执行结果选择。
正如 EOF 在评论中指出的那样,消除分支是一个必要但不充分的条件,因为单个浮点运算也可以有变量 运行time,即使它们只是加法,乘法和 FMA。这在快速处理特殊操作数(例如次正规(通常称为非正规))的体系结构上不是问题,例如GPU。但是,这是我使用和处理过的 x86 处理器上的一个问题。通常,最严重的可变性是由于非规范化结果而产生的,而来自非规范化源操作数的影响要小得多。
下面显示的代码包含使用原始参数 a
作为源操作数的多个操作,这使其面临由于非正规输入导致的 运行 时间变化的风险。是否应针对打算部署代码的特定平台仔细测试 运行 时间上升到噪声水平之上的潜在变化(例如,由于调用函数时管道状态的变化) .
#include <cstdint>
#include <cstring>
#include <cmath>
int __float_as_int (float a)
{
int r;
memcpy (&r, &a, sizeof(r));
return r;
}
float __int_as_float (int a)
{
float r;
memcpy (&r, &a, sizeof(r));
return r;
}
/* maximum error 0.85417 ulp */
float my_logf (float a)
{
float m, r, s, t, i, f, u;
int32_t e;
/* result for exceptional cases */
u = a + a; // silence NaNs if necessary
if (a < 0.0f) u = 0.0f / 0.0f; // NaN
if (a == 0.0f) u = -1.0f / 0.0f; // -Inf
/* result for non-exceptional cases */
i = 0.0f;
/* fix up denormal input if needed */
t = a * 8388608.0f;
if (a < 1.17549435e-38f) {
a = t;
i = -23.0f;
}
/* split argument into exponent and mantissa parts */
e = (__float_as_int (a) - 0x3f2aaaab) & 0xff800000;
m = __int_as_float (__float_as_int (a) - e);
i = fmaf ((float)e, 1.19209290e-7f, i);
/* m in [2/3, 4/3] */
f = m - 1.0f;
s = f * f;
/* Compute log1p(f) for f in [-1/3, 1/3] */
r = -0.130310059f;
t = 0.140869141f;
r = fmaf (r, s, -0.121489234f);
t = fmaf (t, s, 0.139809728f);
r = fmaf (r, s, -0.166844666f);
t = fmaf (t, s, 0.200121239f);
r = fmaf (r, s, -0.249996305f);
r = fmaf (t, f, r);
r = fmaf (r, f, 0.333331943f);
r = fmaf (r, f, -0.500000000f);
r = fmaf (r, s, f);
r = fmaf (i, 0.693147182f, r); // log(2)
/* late selection between exceptional and non-exceptional result */
if (!((a > 0.0f) && (a <= 3.40282347e+38f))) r = u;
return r;
}
上面确定的潜在问题可以通过在对数计算中执行特殊情况处理和基于可移植整数的代码选择结果来解决。明显的权衡是性能损失。非规范参数的处理需要基于前导零 (CLZ) 计数的规范化。虽然 x86 处理器有这方面的说明,但它们可能无法从 C++ 以可移植的方式访问。但是具有不变 运行time 的可移植实现可以直接构建。这导致了一个无分支的实现,我希望它能很好地与大多数编译器一起工作,但仔细检查生成的机器代码将是必不可少的。我使用 Compiler Explorer 来验证它是否按需要编译 gcc 11.1 and clang 12.0.1
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
/* reinterpret bit pattern of IEEE-754 binary32 as a 32-bit unsigned integer */
int32_t float_as_int32 (float a)
{
uint32_t r;
memcpy (&r, &a, sizeof r);
return r;
}
/* reinterpret bit pattern of a 32-bit unsigned integer as IEEE-754 binary32 */
float int32_as_float (int32_t a)
{
float r;
memcpy (&r, &a, sizeof r);
return r;
}
/* branch free implementation of ((cond) ? a : b). cond must be in {0,1} */
int32_t mux (int cond, int32_t a, int32_t b)
{
return (1 - cond) * b + cond * a;
}
/* leading zero count with invariant runtime */
int clz (uint32_t a)
{
// Algorithm by aqrit,
int n = 158 - (((uint32_t)float_as_int32 ((float)(int32_t)(a & ~(a >> 1)))) >> 23);
n = mux (n < 0, 0, n); // clamp below
n = mux (n > 32, 32, n); // clamp above
return n;
}
/* Compute natural logarithm with a maximum error of 0.85089 ulp */
float my_logf (float a)
{
float m, r, s, t, i, f;
int32_t e, ia, ii, it, iu, im, shift, excp;
const int32_t abs_mask = 0x7fffffffu;
const int32_t qnan_bit = 0x00400000u;
const int32_t pos_infty = 0x7f800000u;
const int32_t neg_infty = 0xff800000u;
const int32_t indefinite = 0xffc00000u;
const int32_t zero_float = 0x00000000u; // 0.0f
const int32_t one_float = 0x3f800000u; // 1.0f
const int32_t tiny_float = 0x00800000u; // 1.17549435e-38f
const int32_t huge_float = 0x7f7fffffu; // 3.40282347e+38f
ia = float_as_int32 (a);
/* result for exceptional cases */
iu = mux ((int32_t)ia < 0, indefinite, ia); // return QNaN INDEFINITE
iu = mux ((ia & abs_mask) == 0, neg_infty, iu); // return -Inf
iu = mux ((ia & abs_mask) > pos_infty, ia | qnan_bit, iu); // convert to QNaN
/* result for non-exceptional cases */
shift = clz (ia) - 8;
it = (ia << shift) + ((23 - shift) << 23);
ii = mux (ia < tiny_float, -23, 0);
it = mux (ia < tiny_float, it, ia);
/* split argument into exponent and mantissa parts */
e = (it - 0x3f2aaaab) & 0xff800000;
m = int32_as_float (it - e);
i = fmaf ((float)e, 1.19209290e-7f, (float)ii);
/* m in [2/3, 4/3] */
f = m - 1.0f;
s = f * f;
/* Compute log1p(f) for f in [-1/3, 1/3] */
r = -0.130310059f;
t = 0.140869141f;
r = fmaf (r, s, -0.121483363f);
t = fmaf (t, s, 0.139814854f);
r = fmaf (r, s, -0.166846141f);
t = fmaf (t, s, 0.200120345f);
r = fmaf (r, s, -0.249996200f);
r = fmaf (t, f, r);
r = fmaf (r, f, 0.333331972f);
r = fmaf (r, f, -0.500000000f);
r = fmaf (r, s, f);
r = fmaf (i, 0.693147182f, r); // log(2)
/* late selection between exceptional and non-exceptional result */
excp = ((uint32_t)ia - 1) > ((uint32_t)huge_float - 1);
iu = mux (excp, iu, zero_float);
im = mux (excp, zero_float, one_float);
r = fmaf (int32_as_float (im), r, int32_as_float (iu));
return r;
}
/* reinterpret bit pattern of IEEE-754 binary64 as a 64-bit unsigned integer */
uint64_t double_as_uint64 (double a)
{
uint64_t r;
memcpy (&r, &a, sizeof r);
return r;
}
double floatUlpErr (float res, double ref)
{
uint64_t i, j, err;
int expoRef;
/* ulp error cannot be computed if either operand is NaN, infinity, zero */
if (isnan (res) || isnan (ref) || isinf (res) || isinf (ref) ||
(res == 0.0f) || (ref == 0.0f)) {
return 0.0;
}
/* Convert the float result to an "extended float". This is like a float
with 56 instead of 24 effective mantissa bits
*/
i = ((uint64_t)(uint32_t)float_as_int32 (res)) << 32;
/* Convert the double reference to an "extended float". If the reference is
>= 2^129, we need to clamp yo the maximum "extended float". If reference
is < 2^-126, we need to denormalize because of float's limited exponent
range.
*/
expoRef = (int)(((double_as_uint64(ref) >> 52) & 0x7ff) - 1023);
if (expoRef >= 129) {
j = (double_as_uint64(ref) & 0x8000000000000000ULL) |
0x7fffffffffffffffULL;
} else if (expoRef < -126) {
j = ((double_as_uint64(ref) << 11) | 0x8000000000000000ULL) >> 8;
j = j >> (-(expoRef + 126));
j = j | (double_as_uint64(ref) & 0x8000000000000000ULL);
} else {
j = ((double_as_uint64(ref) << 11) & 0x7fffffffffffffffULL) >> 8;
j = j | ((uint64_t)(expoRef + 127) << 55);
j = j | (double_as_uint64(ref) & 0x8000000000000000ULL);
}
err = (i < j) ? (j - i) : (i - j);
return err / 4294967296.0;
}
int main (void)
{
uint32_t diff, refi, resi, argi = 0;
float reff, res, arg;
double ref, ulp, maxulp = 0;
do {
arg = int32_as_float (argi);
ref = log ((double)arg);
reff = (float)ref;
res = my_logf (arg);
ulp = floatUlpErr (res, ref);
if (ulp > maxulp) maxulp = ulp;
resi = float_as_int32 (res);
refi = float_as_int32 (reff);
diff = (resi > refi) ? (resi - refi) : (refi - resi);
if (diff > 1) {
printf ("error: arg=%15.6a res=%15.6a ref=%15.6a\n", arg, res, ref);
return EXIT_FAILURE;
}
argi++;
} while (argi);
printf ("maximum ulp error: %.5f\n", maxulp);
return EXIT_SUCCESS;
}