在 Sparc 32 位上处理值 > 2^32 的整数

Question

我编写了一个小程序来测量花在循环中的时间（通过内联 Sparc 汇编代码片段）。

一切正常，直到我将迭代次数设置为大约 4.0+9（高于 2^32）。

这是代码片段：

#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>
#include <stdint.h>

int main (int argc, char *argv[])
{
  // For indices
  int i;
  // Set the number of executions
  int nRunning = atoi(argv[1]);
  // Set the sums
  double avgSum = 0.0;
  double stdSum = 0.0;
  // Average of execution time
  double averageRuntime = 0.0;
  // Standard deviation of execution time
  double deviationRuntime = 0.0;

  // Init sum
  unsigned long long int sum = 0;
  // Number of iterations
  unsigned long long int nLoop = 4000000000ULL;
  //uint64_t nLoop = 4000000000;

  // DEBUG
  printf("sizeof(unsigned long long int) = %zu\n",sizeof(unsigned long long int));
  printf("sizeof(unsigned long int) = %zu\n",sizeof(unsigned long int));

  // Time intervals
  struct timeval tv1, tv2;
  double diff;

  // Loop for multiple executions
  for (i=0; i<nRunning; i++)
  {
   // Start time
   gettimeofday (&tv1, NULL);

   // Loop with Sparc assembly into C source
   asm volatile ("clr %%g1\n\t" 
                 "clr %%g2\n\t" 
                 "mov %1, %%g1\n" // %1 = input parameter
                 "loop:\n\t" 
                 "add %%g2, 1, %%g2\n\t" 
                 "subcc %%g1, 1, %%g1\n\t" 
                 "bne loop\n\t" 
                 "nop\n\t" 
                 "mov %%g2, %0\n" // %0 = output parameter
                 : "=r" (sum)     // output
                 : "r" (nLoop)    // input
                 : "g1", "g2");   // clobbers

   // End time
   gettimeofday (&tv2, NULL);

   // Compute runtime for loop
   diff = (tv2.tv_sec - tv1.tv_sec) * 1000000ULL + (tv2.tv_usec - tv1.tv_usec);

   // Summing diff time
   avgSum += diff;
   stdSum += (diff*diff);

   // DEBUG
   printf("diff = %e\n", diff);
   printf("avgSum = %e\n", avgSum);

  }
  // Compute final averageRuntime   
  averageRuntime = avgSum/nRunning;

  // Compute standard deviation
  deviationRuntime = sqrt(stdSum/nRunning-averageRuntime*averageRuntime);

  // Print results
  printf("(Average Elapsed time, Standard deviation) = %e usec  %e usec\n", averageRuntime, deviationRuntime);
  // Print sum from assembly loop
  printf("Sum = %llu\n", sum);

例如，nLoop < 2^32，我得到 diff、avgSum 和 stdSum 的正确值。事实上， printf 和 nLoop = 4.0e+9 给出：

sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 9.617167e+06
avgSum = 9.617167e+06
diff = 9.499878e+06
avgSum = 1.911704e+07
(Average Elapsed time, Standard deviation) = 9.558522e+06 usec  5.864450e+04 usec
Sum = 4000000000

代码是在 Debian Sparc 32 bits Etch 上用 gcc 4.1.2 编译的。

不幸的是，如果我以 nLoop = 5.0e+9 为例，我得到的测量时间值很小且不正确；这是本例中的 printf 输出：

sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 5.800000e+01
avgSum = 5.800000e+01
diff = 4.000000e+00
avgSum = 6.200000e+01
(Average Elapsed time, Standard deviation) = 3.100000e+01 usec  2.700000e+01 usec
Sum = 5000000000

我不知道问题出在哪里，我使用 uint64_t 进行了其他测试但没有成功。

可能问题是我用 32 位处理 large integers (> 2^32) OS 或者它可能是不支持 8 字节整数的汇编内联代码。

如果有人能给我一些线索来修复这个错误，

此致

更新 1 :

按照 @Andrew Henle 的建议，我采用了相同的代码，但没有使用内联 Sparc 程序集片段，而是放置了一个简单的循环。

这是带有简单循环的程序，它得到了 nLoop = 5.0e+9（参见行“unsigned long long int nLoop = 5000000000ULL;”，因此在 limit 2^32-1 上方：

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>
#include <stdint.h>

int main (int argc, char *argv[])
{
  // For indices of nRunning
  int i;
  // For indices of nRunning
  unsigned long long int j;
  // Set the number of executions
  int nRunning = atoi(argv[1]);
  // Set the sums
  unsigned long long int avgSum = 0;
  unsigned long long int stdSum = 0;
  // Average of execution time
  double averageRuntime = 0.0;
  // Standard deviation of execution time
  double deviationRuntime = 0.0;

  // Init sum
  unsigned long long int sum;
  // Number of iterations
  unsigned long long int nLoop = 5000000000ULL;

  // DEBUG
  printf("sizeof(unsigned long long int) = %zu\n",sizeof(unsigned long long int));
  printf("sizeof(unsigned long int) = %zu\n",sizeof(unsigned long int));

  // Time intervals
  struct timeval tv1, tv2;
  unsigned long long int diff;

  // Loop for multiple executions
  for (i=0; i<nRunning; i++)
  {
   // Reset sum
   sum = 0;

   // Start time
   gettimeofday (&tv1, NULL);

   // Loop with Sparc assembly into C source
   /* asm volatile ("clr %%g1\n\t" 
                 "clr %%g2\n\t" 
                 "mov %1, %%g1\n" // %1 = input parameter
         "loop:\n\t" 
         "add %%g2, 1, %%g2\n\t" 
         "subcc %%g1, 1, %%g1\n\t" 
         "bne loop\n\t" 
         "nop\n\t" 
         "mov %%g2, %0\n" // %0 = output parameter
         : "=r" (sum)     // output
         : "r" (nLoop)    // input
         : "g1", "g2");   // clobbers
   */

   // Classic loop
   for (j=0; j<nLoop; j++)
      sum ++;

   // End time
   gettimeofday (&tv2, NULL);

   // Compute runtime for loop
   diff = (unsigned long long int) ((tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec));

   // Summing diff time
   avgSum += diff;
   stdSum += (diff*diff);

   // DEBUG
   printf("diff = %llu\n", diff);
   printf("avgSum = %llu\n", avgSum);
   printf("stdSum = %llu\n", stdSum);
   // Print sum from assembly loop
   printf("Sum = %llu\n", sum);

  }
  // Compute final averageRuntime   
  averageRuntime = avgSum/nRunning;

  // Compute standard deviation
  deviationRuntime = sqrt(stdSum/nRunning-averageRuntime*averageRuntime);

  // Print results
  printf("(Average Elapsed time, Standard deviation) = %e usec  %e usec\n", averageRuntime, deviationRuntime);

  return 0;

}

此代码片段运行良好，即变量 sum 打印为（参见“printf("Sum = %llu\n", sum)”；）：

Sum = 5000000000

所以问题出在带有 Sparc Assembly 块的版本上。

我怀疑，在这个汇编代码中，行 "mov %1, %%g1\n" // %1 = input parameter 错误地将 nLoop 存储到 %g1 register 中（我认为 %g1 是一个 32 位寄存器，所以无法存储高于 2^32-1) 的值。

但是，输出参数（变量 sum）位于 :

"mov %%g2, %0\n" // %0 = output parameter

高于限制，因为它等于 5000000000。

我在有和没有汇编循环的版本之间附加了 vimdiff :

左边是 With Assembly 编程，右边是 Without Assembly（只是一个简单的循环代替）

我提醒你我的问题是，对于 nLoop > 2^32-1 和汇编循环，我在执行结束时得到一个有效的 sum 参数但无效（太短）average 和 standard deviation 次（进入循环）；这是 nLoop = 5000000000ULL 的输出示例：

sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 17
avgSum = 17
stdSum = 289
Sum = 5000000000
diff = 4
avgSum = 21
stdSum = 305
Sum = 5000000000
(Average Elapsed time, Standard deviation) = 1.000000e+01 usec  7.211103e+00 usec

取nLoop = 4.0e+9，即nLoop = 4000000000ULL，没有问题，时间值有效。

更新 2：

我正在通过生成汇编代码进行更深入的搜索。带nLoop = 4000000000 (4.0e+9)的版本如下：

    .file   "loop-WITH-asm-inline-4-Billions.c"
    .section    ".rodata"
    .align 8
.LLC1:
    .asciz  "sizeof(unsigned long long int) = %zu\n"
    .align 8
.LLC2:
    .asciz  "sizeof(unsigned long int) = %zu\n"
    .align 8
.LLC3:
    .asciz  "diff = %llu\n"
    .align 8
.LLC4:
    .asciz  "avgSum = %llu\n"
    .align 8
.LLC5:
    .asciz  "stdSum = %llu\n"
    .align 8
.LLC6:
    .asciz  "Sum = %llu\n"
    .global __udivdi3
    .global __cmpdi2
    .global __floatdidf
    .align 8
.LLC7:
    .asciz  "(Average Elapsed time, Standard deviation) = %e usec  %e usec\n"
    .align 8
.LLC0:
    .long   0
    .long   0
    .section    ".text"
    .align 4
    .global main
    .type   main, #function
    .proc   04
main:
    save    %sp, -248, %sp
    st  %i0, [%fp+68]
    st  %i1, [%fp+72]
    ld  [%fp+72], %g1
    add %g1, 4, %g1
    ld  [%g1], %g1
    mov %g1, %o0
    call    atoi, 0
     nop
    mov %o0, %g1
    st  %g1, [%fp-68]
    st  %g0, [%fp-64]
    st  %g0, [%fp-60]
    st  %g0, [%fp-56]
    st  %g0, [%fp-52]
    sethi   %hi(.LLC0), %g1
    or  %g1, %lo(.LLC0), %g1
    ldd [%g1], %f8
    std %f8, [%fp-48]
    sethi   %hi(.LLC0), %g1
    or  %g1, %lo(.LLC0), %g1
    ldd [%g1], %f8
    std %f8, [%fp-40]
    mov 0, %g2
    sethi   %hi(4000000000), %g3
    std %g2, [%fp-24]
    sethi   %hi(.LLC1), %g1
    or  %g1, %lo(.LLC1), %o0
    mov 8, %o1
    call    printf, 0
     nop
    sethi   %hi(.LLC2), %g1
    or  %g1, %lo(.LLC2), %o0
    mov 4, %o1
    call    printf, 0
     nop
    st  %g0, [%fp-84]
    b   .LL2
     nop
.LL3:
    st  %g0, [%fp-32]
    st  %g0, [%fp-28]
    add %fp, -92, %g1
    mov %g1, %o0
    mov 0, %o1
    call    gettimeofday, 0
     nop
    ldd [%fp-24], %o4
    clr %g1
    clr %g2
    mov %o4, %g1
loop:
    add %g2, 1, %g2
    subcc %g1, 1, %g1
    bne loop
    nop
    mov %g2, %o4

    std %o4, [%fp-32]
    add %fp, -100, %g1
    mov %g1, %o0
    mov 0, %o1
    call    gettimeofday, 0
     nop
    ld  [%fp-100], %g2
    ld  [%fp-92], %g1
    sub %g2, %g1, %g2
    sethi   %hi(999424), %g1
    or  %g1, 576, %g1
    smul    %g2, %g1, %g3
    ld  [%fp-96], %g2
    ld  [%fp-88], %g1
    sub %g2, %g1, %g1
    add %g3, %g1, %g1
    st  %g1, [%fp-12]
    sra %g1, 31, %g1
    st  %g1, [%fp-16]
    ldd [%fp-64], %o4
    ldd [%fp-16], %g2
    addcc   %o5, %g3, %g3
    addx    %o4, %g2, %g2
    std %g2, [%fp-64]
    ld  [%fp-16], %g2
    ld  [%fp-12], %g1
    smul    %g2, %g1, %g4
    ld  [%fp-16], %g2
    ld  [%fp-12], %g1
    smul    %g2, %g1, %g1
    add %g4, %g1, %g4
    ld  [%fp-12], %g2
    ld  [%fp-12], %g1
    umul    %g2, %g1, %g3
    rd  %y, %g2
    add %g4, %g2, %g4
    mov %g4, %g2
    ldd [%fp-56], %o4
    addcc   %o5, %g3, %g3
    addx    %o4, %g2, %g2
    std %g2, [%fp-56]
    sethi   %hi(.LLC3), %g1
    or  %g1, %lo(.LLC3), %o0
    ld  [%fp-16], %o1
    ld  [%fp-12], %o2
    call    printf, 0
     nop
    sethi   %hi(.LLC4), %g1
    or  %g1, %lo(.LLC4), %o0
    ld  [%fp-64], %o1
    ld  [%fp-60], %o2
    call    printf, 0
     nop
    sethi   %hi(.LLC5), %g1
    or  %g1, %lo(.LLC5), %o0
    ld  [%fp-56], %o1
    ld  [%fp-52], %o2
    call    printf, 0
     nop
    sethi   %hi(.LLC6), %g1
    or  %g1, %lo(.LLC6), %o0
    ld  [%fp-32], %o1
    ld  [%fp-28], %o2
    call    printf, 0
     nop
    ld  [%fp-84], %g1
    add %g1, 1, %g1
    st  %g1, [%fp-84]
.LL2:
    ld  [%fp-84], %g2
    ld  [%fp-68], %g1
    cmp %g2, %g1
    bl  .LL3
     nop
    ld  [%fp-68], %g1
    sra %g1, 31, %g1
    ld  [%fp-68], %g3
    mov %g1, %g2
    ldd [%fp-64], %o0
    mov %g2, %o2
    mov %g3, %o3
    call    __udivdi3, 0
     nop
    mov %o0, %g2
    mov %o1, %g3
    std %g2, [%fp-136]
    ldd [%fp-136], %o0
    mov 0, %o2
    mov 0, %o3
    call    __cmpdi2, 0
     nop
    mov %o0, %g1
    cmp %g1, 1
    bl  .LL6
     nop
    ldd [%fp-136], %o0
    call    __floatdidf, 0
     nop
    std %f0, [%fp-144]
    b   .LL5
     nop
.LL6:
    ldd [%fp-136], %o4
    and %o4, 0, %g2
    and %o5, 1, %g3
    ld  [%fp-136], %o5
    sll %o5, 31, %g1
    ld  [%fp-132], %g4
    srl %g4, 1, %o5
    or  %o5, %g1, %o5
    ld  [%fp-136], %g1
    srl %g1, 1, %o4
    or  %g2, %o4, %g2
    or  %g3, %o5, %g3
    mov %g2, %o0
    mov %g3, %o1
    call    __floatdidf, 0
     nop
    std %f0, [%fp-144]
    ldd [%fp-144], %f8
    ldd [%fp-144], %f10
    faddd   %f8, %f10, %f8
    std %f8, [%fp-144]
.LL5:
    ldd [%fp-144], %f8
    std %f8, [%fp-48]
    ld  [%fp-68], %g1
    sra %g1, 31, %g1
    ld  [%fp-68], %g3
    mov %g1, %g2
    ldd [%fp-56], %o0
    mov %g2, %o2
    mov %g3, %o3
    call    __udivdi3, 0
     nop
    mov %o0, %g2
    mov %o1, %g3
    std %g2, [%fp-128]
    ldd [%fp-128], %o0
    mov 0, %o2
    mov 0, %o3
    call    __cmpdi2, 0
     nop
    mov %o0, %g1
    cmp %g1, 1
    bl  .LL8
     nop
    ldd [%fp-128], %o0
    call    __floatdidf, 0
     nop
    std %f0, [%fp-120]
    b   .LL7
     nop
.LL8:
    ldd [%fp-128], %o4
    and %o4, 0, %g2
    and %o5, 1, %g3
    ld  [%fp-128], %o5
    sll %o5, 31, %g1
    ld  [%fp-124], %g4
    srl %g4, 1, %o5
    or  %o5, %g1, %o5
    ld  [%fp-128], %g1
    srl %g1, 1, %o4
    or  %g2, %o4, %g2
    or  %g3, %o5, %g3
    mov %g2, %o0
    mov %g3, %o1
    call    __floatdidf, 0
     nop
    std %f0, [%fp-120]
    ldd [%fp-120], %f8
    ldd [%fp-120], %f10
    faddd   %f8, %f10, %f8
    std %f8, [%fp-120]
.LL7:
    ldd [%fp-48], %f8
    ldd [%fp-48], %f10
    fmuld   %f8, %f10, %f8
    ldd [%fp-120], %f10
    fsubd   %f10, %f8, %f8
    std %f8, [%fp-112]
    ldd [%fp-112], %f8
    fsqrtd  %f8, %f8
    std %f8, [%fp-152]
    ldd [%fp-152], %f10
    ldd [%fp-152], %f8
    fcmpd   %f10, %f8
    nop
    fbe .LL9
     nop
    ldd [%fp-112], %o0
    call    sqrt, 0
     nop
    std %f0, [%fp-152]
.LL9:
    ldd [%fp-152], %f8
    std %f8, [%fp-40]
    sethi   %hi(.LLC7), %g1
    or  %g1, %lo(.LLC7), %o0
    ld  [%fp-48], %o1
    ld  [%fp-44], %o2
    ld  [%fp-40], %o3
    ld  [%fp-36], %o4
    call    printf, 0
     nop
    mov 0, %g1
    mov %g1, %i0
    restore
    jmp %o7+8
     nop
    .size   main, .-main
    .ident  "GCC: (GNU) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)"
    .section    ".note.GNU-stack"

当我用nLoop = 5000000000 (5.0e+9)生成汇编代码版本时，差异如下图所示（vimdiff）：

“40亿”版本的区块：

mov     0, %g2                                                                                                                           
sethi   %hi(4000000000), %g3

在“50 亿”版本中被替换为：

 mov     1, %g2
 sethi   %hi(705032192), %g3                                                   
 or      %g3, 512, %g3

我可以看到 5.0+e9 不能在 32 位上编码，因为指令

sethi   %hi(705032192), %g3

奇怪的是，当我编译版本“5 Billions”汇编代码时，输出参数 sum 计算得很好，即等于 5 Billions，我无法解释它。

欢迎任何帮助或评论，谢谢。

Answer 1

您似乎在对 64 位值的二分之一进行 32 位运算

从生成的代码中，此处 nLoop 是 double-load 到 %o4 和 %o5（因为它是 64 -bit long long 值):

    ldd [%fp-24], %o4
    clr %g1
    clr %g2

然后您只需使用 %o4:

    mov %o4, %g1                ; <---- what about %o5????
loop:
    add %g2, 1, %g2
    subcc %g1, 1, %g1
    bne loop
    nop
    mov %g2, %o4

要完成此工作，请重新编写汇编代码以将 %o4 + %o5 一起视为 64 位值。

Answer 2

很大程度上取决于您使用的 sparc 版本和 ABI。如果您使用的是 sparc v8 或更早版本，则只有 32 位寄存器的 32 位 mode。在这种情况下，当您尝试将 5000000000 加载到 32 位寄存器时，它会失败并加载 5000000000 mod 2³²（即 705032704）。这似乎正在发生。

另一方面，如果您在 32 位 mode（通常称为 v8plus）中有一个 64 位 sparc 处理器运行，那么您可以使用 64 位寄存器，所以这个会工作。

在 Sparc 32 位上处理值 > 2^32 的整数

Handling of integer with values > 2^32 on Sparc 32 bits

c

assembly

sparc

long-integer