如何将 128 位被除数除以 64 位除数,其中被除数的位均为 1,而我只需要商的 64 LSB?

How to divide a 128-bit dividend by a 64-bit divisor, where the dividend's bits are all 1's, and where I only need the 64 LSBs of the quotient?

我需要计算 ((2^128) - 1) / x。除数 x 是一个无符号的 64 位数字。被除数由两个无符号的 64 位数字(高位和低位)组成,其中两个数字都是 UINT64_MAX。我只能使用 64 位算法并且需要它是可移植的(不使用 GNU 的 __int128、MSCV 的 _udiv128、汇编或类似的东西)。我不需要商的高位,我只需要低64位。

如何进行这个操作?

另外:x >= 3x 不是 2 的幂。

编辑:我创建了自己的解决方案(在下面回答)。但我欢迎任何其他性能更好的解决方案:)

我不知道有任何优化适用于具有恒定被除数的整数除法。为了仔细检查,我尝试了一个带有 Compiler Explorer 的全一红利的测试用例。使用指定了最高优化级别的 gcc、icc 和 clang,生成的代码显示没有对除法应用优化。

当然可以构建高性能的 128 位除法例程,但根据个人经验,我知道这很容易出错,需要非常复杂的测试才能实现良好的测试覆盖率,包括极端情况,因为详尽无遗在此操作数大小下无法进行测试。设计和测试的工作量很容易超出 Whosebug 上的答案看似合理的数量级小数点后两位。

执行整数除法的一种简单方法是使用我们在小学时都学过的算法,但只能使用二进制。这使得关于下一个商位的决定变得特别容易:当当前部分余数大于或等于除数时为 1,否则为 0。使用普通二进制除法,我们唯一需要的整数运算是加法和减法。

我们可以构建可移植原语,通过模仿处理器的机器指令用于对多字整数进行操作的方式来对任何位长度的操作数执行这些操作:ADD 带进位,ADD 带进位, ADD 带入带出;类似于 SUB。在下面的代码中,我为此使用了简单的 C 宏;当然更复杂的方法是可能的。

由于我现在正在使用的系统不支持 128 位整数,因此我针对 64 位整数设计并测试了这种方法。当时的 128 位版本是简单的机械重命名练习。在现代 64 位处理器上,我希望这个 128 位除法函数能够在大约 3000 个周期内执行。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>

#define SUBCcc(a,b,cy,t0,t1,t2) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)

#define SUBcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), cy=t1<t0, t1-t0)

#define SUBC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t1-t0)

#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

typedef struct {
    uint64_t l;
    uint64_t h;
} my_uint128;

my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
    my_uint128 quot, rem, tmp;
    uint64_t cy, t0, t1, t2;
    int bits_left = CHAR_BIT * sizeof (my_uint128);
    
    quot.h = dvnd.h;
    quot.l = dvnd.l;
    rem.h = 0;
    rem.l = 0;
    do {
        quot.l = ADDcc  (quot.l, quot.l, cy, t0, t1);
        quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
        rem.l  = ADDCcc (rem.l,  rem.l,  cy, t0, t1);
        rem.h  = ADDC   (rem.h,  rem.h,  cy, t0, t1);
        tmp.l  = SUBcc  (rem.l,  dvsr.l, cy, t0, t1);
        tmp.h  = SUBCcc (rem.h,  dvsr.h, cy, t0, t1, t2);
        if (!cy) { // remainder >= divisor
            rem.l = tmp.l;
            rem.h = tmp.h;
            quot.l = quot.l | 1;
        }
        bits_left--;
    } while (bits_left);
    return quot;
}

typedef struct {
    uint32_t l;
    uint32_t h;
} my_uint64;

my_uint64 bitwise_division_64 (my_uint64 dvnd, my_uint64 dvsr)
{
    my_uint64 quot, rem, tmp;
    uint32_t cy, t0, t1, t2;
    int bits_left = CHAR_BIT * sizeof (my_uint64);
    
    quot.h = dvnd.h;
    quot.l = dvnd.l;
    rem.h = 0;
    rem.l = 0;
    do {
        quot.l = ADDcc  (quot.l, quot.l, cy, t0, t1);
        quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
        rem.l  = ADDCcc (rem.l,  rem.l,  cy, t0, t1);
        rem.h  = ADDC   (rem.h,  rem.h,  cy, t0, t1);
        tmp.l  = SUBcc  (rem.l,  dvsr.l, cy, t0, t1);
        tmp.h  = SUBCcc (rem.h,  dvsr.h, cy, t0, t1, t2);
        if (!cy) { // remainder >= divisor
            rem.l = tmp.l;
            rem.h = tmp.h;
            quot.l = quot.l | 1;
        }
        bits_left--;
    } while (bits_left);
    return quot;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

int main (void)
{
    uint64_t a, b, res, ref;
    my_uint64 aa, bb, rr;
    do {
        a = KISS64;
        b = KISS64;
        ref = a / b;

        aa.l = (uint32_t)a;
        aa.h = (uint32_t)(a >> 32);
        bb.l = (uint32_t)b;
        bb.h = (uint32_t)(b >> 32);
        rr = bitwise_division_64 (aa, bb);
        res = (((uint64_t)rr.h) << 32) + rr.l;

        if (ref != res) {
            printf ("a=%016llx b=%016llx res=%016llx ref=%016llx\n", a, b, res, ref);
            return EXIT_FAILURE;
        }
    } while (a);
    return EXIT_SUCCESS;
}

比按位计算更快的方法是计算除数的倒数,乘以被除数得到初步商,然后计算余数以精确调整商。整个计算可以在定点运算中完成。然而,在具有快速浮点单元的现代处理器上,使用双精度除法生成倒数的起始近似值更为方便。具有三次收敛的单个哈雷迭代然后导致全精度倒数。

倒数的 Halley 迭代是非常密集的整数乘法运算,64x64 位乘法与 128 位结果(下面代码中的umul64wide())是对性能至关重要的构建块。在现代 64 位架构上,这通常是在几个周期内执行的单个机器指令,但是可移植代码无法访问它。根据体系结构和编译器,模拟指令的可移植代码需要大约 15 到 20 条指令。

整个 128 位除法大约需要 300 个周期,或比简单的逐位计算快十倍。由于代码相当复杂,因此需要进行大量测试以确保正确运行。在下面的框架中,我使用基于模式的随机测试进行适度密集的测试,使用简单的按位实现作为参考。

下面 udiv128() 的实现假定编程环境使用符合 IEEE-754 的浮点算法,double 类型映射到 IEEE-754 的 binary64 类型,并且 double 个操作数的除法是正确舍入的。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>

typedef struct {
    uint64_t l;
    uint64_t h;
} my_uint128;

my_uint128 make_my_uint128 (uint64_t h, uint64_t l);
my_uint128 add128 (my_uint128 a, my_uint128 b);
my_uint128 sub128 (my_uint128 a, my_uint128 b);
my_uint128 lsl128 (my_uint128 a, int sh);
my_uint128 lsr128 (my_uint128 a, int sh);
my_uint128 not128 (my_uint128 a);
my_uint128 umul128lo (my_uint128 a, my_uint128 b);
my_uint128 umul128hi (my_uint128 a, my_uint128 b);
double my_uint128_to_double (my_uint128 a);
int lt128 (my_uint128 a, my_uint128 b);
int eq128 (my_uint128 a, my_uint128 b);
uint64_t double_as_uint64 (double a);
double uint64_as_double (uint64_t a);

#define FP64_EXPO_BIAS   (1023)
#define FP64_MANT_BITS   (53)
#define FP64_MANT_IBIT   (0x0010000000000000ULL)
#define FP64_MANT_MASK   (0x000fffffffffffffULL)
#define FP64_INC_EXP_128 (0x0800000000000000ULL)
#define FP64_MANT_ADJ    (2)  // adjustment to ensure underestimate

my_uint128 udiv128 (my_uint128 dividend, my_uint128 divisor)
{
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    const my_uint128 one  = make_my_uint128 (0ULL, 1ULL);
    const my_uint128 two  = make_my_uint128 (0ULL, 2ULL);
    my_uint128 recip, temp, quo, rem;
    my_uint128 neg_divisor = sub128 (zero, divisor);
    double r;

    /* compute initial approximation for reciprocal; must be underestimate! */
    r = 1.0 / my_uint128_to_double (divisor);
    uint64_t i = double_as_uint64 (r) - FP64_MANT_ADJ + FP64_INC_EXP_128;
    temp = make_my_uint128 (0ULL, (i & FP64_MANT_MASK) | FP64_MANT_IBIT);
    int sh = (i >> (FP64_MANT_BITS-1)) - FP64_EXPO_BIAS - (FP64_MANT_BITS-1);
    recip = (sh < 0) ? lsr128 (temp, -sh) : lsl128 (temp, sh);

    /* perform Halley iteration with cubic convergence to refine reciprocal */
    temp = umul128lo (neg_divisor, recip);
    temp = add128 (umul128hi (temp, temp), temp);
    recip = add128 (umul128hi (recip, temp), recip);

    /* compute preliminary quotient and remainder */
    quo = umul128hi (dividend, recip); 
    rem = sub128 (dividend, umul128lo (divisor, quo));

    /* adjust quotient if too small; quotient off by 2 at most */
    if (! lt128 (rem, divisor)) {
        quo = add128 (quo, lt128 (sub128 (rem, divisor), divisor) ? one : two);
    }

    /* handle division by zero */
    if (eq128 (divisor, zero)) quo = not128 (zero);

    return quo;
}

#define SUBCcc(a,b,cy,t0,t1,t2) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)

#define SUBcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), cy=t1<t0, t1-t0)

#define SUBC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t1-t0)

#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

uint64_t double_as_uint64 (double a) 
{ 
    uint64_t r; 
    memcpy (&r, &a, sizeof r); 
    return r; 
}

double uint64_as_double (uint64_t a) 
{ 
    double r; 
    memcpy (&r, &a, sizeof r); 
    return r; 
}

my_uint128 add128 (my_uint128 a, my_uint128 b)
{
    uint64_t cy, t0, t1;
    a.l = ADDcc (a.l, b.l, cy, t0, t1);
    a.h = ADDC  (a.h, b.h, cy, t0, t1);
    return a;
}

my_uint128 sub128 (my_uint128 a, my_uint128 b)
{
    uint64_t cy, t0, t1;
    a.l = SUBcc (a.l, b.l, cy, t0, t1);
    a.h = SUBC  (a.h, b.h, cy, t0, t1);
    return a;
}

my_uint128 lsl128 (my_uint128 a, int sh)
{
    if (sh >= 64) {
        a.h = a.l << (sh - 64);
        a.l = 0ULL;
    } else if (sh) {
        a.h = (a.h << sh) + (a.l >> (64 - sh));
        a.l = a.l << sh;
    }
    return a;
}

my_uint128 lsr128 (my_uint128 a, int sh)
{
    if (sh >= 64) {
        a.l = a.h >> (sh - 64);
        a.h = 0ULL;
    } else if (sh) {
        a.l = (a.l >> sh) + (a.h << (64 - sh));
        a.h = a.h >> sh;
    } 
    return a;
}

my_uint128 not128 (my_uint128 a)
{
    a.l = ~a.l;
    a.h = ~a.h;
    return a;
}

int lt128 (my_uint128 a, my_uint128 b)
{
    uint64_t cy, t0, t1, t2;
    a.l = SUBcc  (a.l, b.l, cy, t0, t1);
    a.h = SUBCcc (a.h, b.h, cy, t0, t1, t2);
    return cy;
}

int eq128 (my_uint128 a, my_uint128 b)
{
    return (a.l == b.l) && (a.h == b.h);
}

// derived from Hacker's Delight 2nd ed. figure 8-2
my_uint128 umul64wide (uint64_t u, uint64_t v)
{
    my_uint128 r;
    uint64_t u0, v0, u1, v1, w0, w1, w2, t;
    u0 = (uint32_t)u;  u1 = u >> 32;
    v0 = (uint32_t)v;  v1 = v >> 32;
    w0 = u0 * v0;
    t  = u1 * v0 + (w0 >> 32);
    w1 = (uint32_t)t;
    w2 = t >> 32;
    w1 = u0 * v1 + w1;
    r.h = u1 * v1 + w2 + (w1 >> 32);
    r.l = (w1 << 32) + (uint32_t)w0;
    return r;
}

my_uint128 make_my_uint128 (uint64_t h, uint64_t l)
{
    my_uint128 r;
    r.h = h;
    r.l = l;
    return r;
}

my_uint128 umul128lo (my_uint128 a, my_uint128 b)
{
    my_uint128 r;
    r = umul64wide (a.l, b.l);
    r.h = r.h + a.l * b.h + a.h * b.l;
    return r;
}

my_uint128 umul128hi (my_uint128 a, my_uint128 b)
{
    my_uint128 t0, t1, t2, t3;
    t0 = umul64wide (a.l, b.l);
    t3 = add128 (umul64wide (a.h, b.l), make_my_uint128 (0ULL, t0.h));
    t1 = make_my_uint128 (0ULL, t3.l);
    t2 = make_my_uint128 (0ULL, t3.h);
    t1 = add128 (umul64wide (a.l, b.h), t1);
    return add128 (add128 (umul64wide (a.h, b.h), t2), make_my_uint128 (0ULL, t1.h));
}

double my_uint128_to_double (my_uint128 a)
{
    const int intbits = sizeof (a) * CHAR_BIT;
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    my_uint128 rnd, i = a;
    uint64_t j;
    int sh = 0;
    double r;

    // normalize integer so MSB is set
    if (lt128 (i, make_my_uint128(0x0000000000000001ULL, 0))) {i = lsl128 (i,64); sh += 64; }
    if (lt128 (i, make_my_uint128(0x0000000100000000ULL, 0))) {i = lsl128 (i,32); sh += 32; }
    if (lt128 (i, make_my_uint128(0x0001000000000000ULL, 0))) {i = lsl128 (i,16); sh += 16; }
    if (lt128 (i, make_my_uint128(0x0100000000000000ULL, 0))) {i = lsl128 (i, 8); sh +=  8; } 
    if (lt128 (i, make_my_uint128(0x1000000000000000ULL, 0))) {i = lsl128 (i, 4); sh +=  4; }
    if (lt128 (i, make_my_uint128(0x4000000000000000ULL, 0))) {i = lsl128 (i, 2); sh +=  2; }
    if (lt128 (i, make_my_uint128(0x8000000000000000ULL, 0))) {i = lsl128 (i, 1); sh +=  1; }
    // form mantissa with explicit integer bit 
    rnd = lsl128 (i, FP64_MANT_BITS);
    i = lsr128 (i, intbits - FP64_MANT_BITS);
    j = i.l;
    // add in exponent, taking into account integer bit of mantissa
    if (! eq128 (a, zero)) {
        j += (uint64_t)(FP64_EXPO_BIAS + (intbits-1) - 1 - sh) << (FP64_MANT_BITS-1);
    }
    // round to nearest or even
    rnd.h = rnd.h | (rnd.l != 0);
    if ((rnd.h > 0x8000000000000000ULL) || 
        ((rnd.h == 0x8000000000000000ULL) && (j & 1))) j++;
    // reinterpret bit pattern as IEEE-754 'binary64'
    r = uint64_as_double (j);
    return r;
}

my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
    my_uint128 quot, rem, tmp;
    uint64_t cy, t0, t1, t2;
    int bits_left = CHAR_BIT * sizeof (dvsr);
    
    quot.h = dvnd.h;
    quot.l = dvnd.l;
    rem.h = 0;
    rem.l = 0;
    do {
        quot.l = ADDcc  (quot.l, quot.l, cy, t0, t1);
        quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
        rem.l  = ADDCcc (rem.l,  rem.l,  cy, t0, t1);
        rem.h  = ADDC   (rem.h,  rem.h,  cy, t0, t1);
        tmp.l  = SUBcc  (rem.l,  dvsr.l, cy, t0, t1);
        tmp.h  = SUBCcc (rem.h,  dvsr.h, cy, t0, t1, t2);
        if (!cy) { // remainder >= divisor
            rem.l = tmp.l;
            rem.h = tmp.h;
            quot.l = quot.l | 1;
        }
        bits_left--;
    } while (bits_left);
    return quot;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

my_uint128 v[100000]; /* FIXME: size appropriately */

int main (void)
{
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    const my_uint128 one = make_my_uint128 (0ULL, 1ULL);
    my_uint128 dividend, divisor, quot, ref;
    int i, j, patterns, idx = 0, nbrBits = sizeof (v[0]) * CHAR_BIT;
    int patterns_done = 0;

    /* pattern class 1: 2**i */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = lsl128 (one, i);
        idx++;
    }
    /* pattern class 2: 2**i-1 */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = sub128 (lsl128 (one, i), one);
        idx++;
    }
    /* pattern class 3: 2**i+1 */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = add128 (lsl128 (one, i), one); 
        idx++;
    }
    /* pattern class 4: 2**i + 2**j */
    for (i = 0; i < nbrBits; i++) {
        for (j = 0; j < nbrBits; j++) {
            v [idx] = add128 (lsl128 (one, i), lsl128 (one, j));
            idx++;
        }
    }
    /* pattern class 5: 2**i - 2**j */
    for (i = 0; i < nbrBits; i++) {
        for (j = 0; j < nbrBits; j++) {
            v [idx] = sub128 (lsl128 (one, i), lsl128 (one, j));
            idx++;
        }
    }
    patterns = idx;
    /* pattern class 6: one's complement of pattern classes 1 through 5 */
    for (i = 0; i < patterns; i++) {
        v [idx] = not128 (v [i]);
        idx++;
    }
    /* pattern class 7: two's complement of pattern classes 1 through 5 */
    for (i = 0; i < patterns; i++) {
        v [idx] = sub128 (zero, v[i]);
        idx++;
    }
    patterns = idx;
    printf ("Starting pattern-based tests. Number of patterns: %d\n", patterns);

    for (long long int k = 0; k < 100000000000LL; k++) {
        if (k < patterns * patterns) {
            dividend = v [k / patterns];
            divisor  = v [k % patterns];
        } else {
            if (!patterns_done) {
                printf ("Starting random tests\n");
                patterns_done = 1;
            }
            dividend.l = KISS64;
            dividend.h = KISS64;
            divisor.h  = KISS64;
            divisor.l  = KISS64;
        }
        /* exclude cases with undefined results: division by zero */
        if (! eq128 (divisor, zero)) {
            quot = udiv128 (dividend, divisor);
            ref = bitwise_division_128 (dividend, divisor);
            if (! eq128 (quot, ref)) {
                printf ("@ (%016llx_%016llx, %016llx_%016llx): quot = %016llx_%016llx  ref=%016llx_%016llx\n", 
                        dividend.h, dividend.l, divisor.h, divisor.l, 
                        quot.h, quot.l, ref.h, ref.l);
                return EXIT_FAILURE;
            }
        }
    }
    printf ("unsigned 128-bit division: tests passed\n");
    return EXIT_SUCCESS;
}

这就是我最终编写的代码。我确信有更快的替代方案,但至少这是实用的。

基于:https://en.wikipedia.org/wiki/Division_algorithm#Integer_division_(unsigned)_with_remainder。适用于此特定用例。

// q = (2^128 - 1) / d, where q is the 64 LSBs of the quotient
uint64_t two_pow_128_minus_1_div_d(uint64_t d) {
    uint64_t q = 0, r_hi = 0, r_lo = 0;

    for (int i = 127; i >= 0; --i) {
        r_hi = (r_hi << 1) | (r_lo >> 63);
        r_lo <<= 1;

        r_lo |= 1UL;

        if (r_hi || r_lo >= d) {
            const uint64_t borrow = d > r_lo;
            r_lo -= d;
            r_hi -= borrow;

            if (i < 64)
                q |= 1UL << i;
        }
    }
    return q;
}