在 Sparc 32 位上处理值 > 2^32 的整数
Handling of integer with values > 2^32 on Sparc 32 bits
我编写了一个小程序来测量花在循环中的时间(通过内联 Sparc 汇编代码片段)。
一切正常,直到我将迭代次数设置为大约 4.0+9(高于 2^32)。
这是代码片段:
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>
#include <stdint.h>
int main (int argc, char *argv[])
{
// For indices
int i;
// Set the number of executions
int nRunning = atoi(argv[1]);
// Set the sums
double avgSum = 0.0;
double stdSum = 0.0;
// Average of execution time
double averageRuntime = 0.0;
// Standard deviation of execution time
double deviationRuntime = 0.0;
// Init sum
unsigned long long int sum = 0;
// Number of iterations
unsigned long long int nLoop = 4000000000ULL;
//uint64_t nLoop = 4000000000;
// DEBUG
printf("sizeof(unsigned long long int) = %zu\n",sizeof(unsigned long long int));
printf("sizeof(unsigned long int) = %zu\n",sizeof(unsigned long int));
// Time intervals
struct timeval tv1, tv2;
double diff;
// Loop for multiple executions
for (i=0; i<nRunning; i++)
{
// Start time
gettimeofday (&tv1, NULL);
// Loop with Sparc assembly into C source
asm volatile ("clr %%g1\n\t"
"clr %%g2\n\t"
"mov %1, %%g1\n" // %1 = input parameter
"loop:\n\t"
"add %%g2, 1, %%g2\n\t"
"subcc %%g1, 1, %%g1\n\t"
"bne loop\n\t"
"nop\n\t"
"mov %%g2, %0\n" // %0 = output parameter
: "=r" (sum) // output
: "r" (nLoop) // input
: "g1", "g2"); // clobbers
// End time
gettimeofday (&tv2, NULL);
// Compute runtime for loop
diff = (tv2.tv_sec - tv1.tv_sec) * 1000000ULL + (tv2.tv_usec - tv1.tv_usec);
// Summing diff time
avgSum += diff;
stdSum += (diff*diff);
// DEBUG
printf("diff = %e\n", diff);
printf("avgSum = %e\n", avgSum);
}
// Compute final averageRuntime
averageRuntime = avgSum/nRunning;
// Compute standard deviation
deviationRuntime = sqrt(stdSum/nRunning-averageRuntime*averageRuntime);
// Print results
printf("(Average Elapsed time, Standard deviation) = %e usec %e usec\n", averageRuntime, deviationRuntime);
// Print sum from assembly loop
printf("Sum = %llu\n", sum);
例如,nLoop
< 2^32,我得到 diff
、avgSum
和 stdSum
的正确值。事实上, printf
和 nLoop = 4.0e+9
给出:
sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 9.617167e+06
avgSum = 9.617167e+06
diff = 9.499878e+06
avgSum = 1.911704e+07
(Average Elapsed time, Standard deviation) = 9.558522e+06 usec 5.864450e+04 usec
Sum = 4000000000
代码是在 Debian Sparc 32 bits Etch
上用 gcc 4.1.2
编译的。
不幸的是,如果我以 nLoop = 5.0e+9
为例,我得到的测量时间值很小且不正确;这是本例中的 printf 输出:
sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 5.800000e+01
avgSum = 5.800000e+01
diff = 4.000000e+00
avgSum = 6.200000e+01
(Average Elapsed time, Standard deviation) = 3.100000e+01 usec 2.700000e+01 usec
Sum = 5000000000
我不知道问题出在哪里,我使用 uint64_t
进行了其他测试但没有成功。
可能问题是我用 32 位处理 large integers (> 2^32)
OS 或者它可能是不支持 8 字节整数的汇编内联代码。
如果有人能给我一些线索来修复这个错误,
此致
更新 1 :
按照 @Andrew Henle
的建议,我采用了相同的代码,但没有使用内联 Sparc 程序集片段,而是放置了一个简单的循环。
这是带有简单循环的程序,它得到了 nLoop = 5.0e+9
(参见行“unsigned long long int nLoop = 5000000000ULL;
”,因此在 limit 2^32-1
上方:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>
#include <stdint.h>
int main (int argc, char *argv[])
{
// For indices of nRunning
int i;
// For indices of nRunning
unsigned long long int j;
// Set the number of executions
int nRunning = atoi(argv[1]);
// Set the sums
unsigned long long int avgSum = 0;
unsigned long long int stdSum = 0;
// Average of execution time
double averageRuntime = 0.0;
// Standard deviation of execution time
double deviationRuntime = 0.0;
// Init sum
unsigned long long int sum;
// Number of iterations
unsigned long long int nLoop = 5000000000ULL;
// DEBUG
printf("sizeof(unsigned long long int) = %zu\n",sizeof(unsigned long long int));
printf("sizeof(unsigned long int) = %zu\n",sizeof(unsigned long int));
// Time intervals
struct timeval tv1, tv2;
unsigned long long int diff;
// Loop for multiple executions
for (i=0; i<nRunning; i++)
{
// Reset sum
sum = 0;
// Start time
gettimeofday (&tv1, NULL);
// Loop with Sparc assembly into C source
/* asm volatile ("clr %%g1\n\t"
"clr %%g2\n\t"
"mov %1, %%g1\n" // %1 = input parameter
"loop:\n\t"
"add %%g2, 1, %%g2\n\t"
"subcc %%g1, 1, %%g1\n\t"
"bne loop\n\t"
"nop\n\t"
"mov %%g2, %0\n" // %0 = output parameter
: "=r" (sum) // output
: "r" (nLoop) // input
: "g1", "g2"); // clobbers
*/
// Classic loop
for (j=0; j<nLoop; j++)
sum ++;
// End time
gettimeofday (&tv2, NULL);
// Compute runtime for loop
diff = (unsigned long long int) ((tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec));
// Summing diff time
avgSum += diff;
stdSum += (diff*diff);
// DEBUG
printf("diff = %llu\n", diff);
printf("avgSum = %llu\n", avgSum);
printf("stdSum = %llu\n", stdSum);
// Print sum from assembly loop
printf("Sum = %llu\n", sum);
}
// Compute final averageRuntime
averageRuntime = avgSum/nRunning;
// Compute standard deviation
deviationRuntime = sqrt(stdSum/nRunning-averageRuntime*averageRuntime);
// Print results
printf("(Average Elapsed time, Standard deviation) = %e usec %e usec\n", averageRuntime, deviationRuntime);
return 0;
}
此代码片段运行良好,即变量 sum
打印为
(参见“printf("Sum = %llu\n", sum)
”;):
Sum = 5000000000
所以问题出在带有 Sparc Assembly 块的版本上。
我怀疑,在这个汇编代码中,行 "mov %1, %%g1\n" // %1 = input parameter
错误地将 nLoop
存储到 %g1 register
中(我认为 %g1
是一个 32 位寄存器,所以无法存储高于 2^32-1
) 的值。
但是,输出参数(变量 sum
)位于 :
"mov %%g2, %0\n" // %0 = output parameter
高于限制,因为它等于 5000000000。
我在有和没有汇编循环的版本之间附加了 vimdiff :
左边是 With Assembly 编程,右边是 Without Assembly(只是一个简单的循环代替)
我提醒你我的问题是,对于 nLoop > 2^32-1 和汇编循环,我在执行结束时得到一个有效的 sum
参数但无效(太短)average
和 standard deviation
次(进入循环);这是 nLoop = 5000000000ULL
的输出示例:
sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 17
avgSum = 17
stdSum = 289
Sum = 5000000000
diff = 4
avgSum = 21
stdSum = 305
Sum = 5000000000
(Average Elapsed time, Standard deviation) = 1.000000e+01 usec 7.211103e+00 usec
取nLoop = 4.0e+9
,即nLoop = 4000000000ULL
,没有问题,时间值有效。
更新 2:
我正在通过生成汇编代码进行更深入的搜索。带nLoop = 4000000000 (4.0e+9)
的版本如下:
.file "loop-WITH-asm-inline-4-Billions.c"
.section ".rodata"
.align 8
.LLC1:
.asciz "sizeof(unsigned long long int) = %zu\n"
.align 8
.LLC2:
.asciz "sizeof(unsigned long int) = %zu\n"
.align 8
.LLC3:
.asciz "diff = %llu\n"
.align 8
.LLC4:
.asciz "avgSum = %llu\n"
.align 8
.LLC5:
.asciz "stdSum = %llu\n"
.align 8
.LLC6:
.asciz "Sum = %llu\n"
.global __udivdi3
.global __cmpdi2
.global __floatdidf
.align 8
.LLC7:
.asciz "(Average Elapsed time, Standard deviation) = %e usec %e usec\n"
.align 8
.LLC0:
.long 0
.long 0
.section ".text"
.align 4
.global main
.type main, #function
.proc 04
main:
save %sp, -248, %sp
st %i0, [%fp+68]
st %i1, [%fp+72]
ld [%fp+72], %g1
add %g1, 4, %g1
ld [%g1], %g1
mov %g1, %o0
call atoi, 0
nop
mov %o0, %g1
st %g1, [%fp-68]
st %g0, [%fp-64]
st %g0, [%fp-60]
st %g0, [%fp-56]
st %g0, [%fp-52]
sethi %hi(.LLC0), %g1
or %g1, %lo(.LLC0), %g1
ldd [%g1], %f8
std %f8, [%fp-48]
sethi %hi(.LLC0), %g1
or %g1, %lo(.LLC0), %g1
ldd [%g1], %f8
std %f8, [%fp-40]
mov 0, %g2
sethi %hi(4000000000), %g3
std %g2, [%fp-24]
sethi %hi(.LLC1), %g1
or %g1, %lo(.LLC1), %o0
mov 8, %o1
call printf, 0
nop
sethi %hi(.LLC2), %g1
or %g1, %lo(.LLC2), %o0
mov 4, %o1
call printf, 0
nop
st %g0, [%fp-84]
b .LL2
nop
.LL3:
st %g0, [%fp-32]
st %g0, [%fp-28]
add %fp, -92, %g1
mov %g1, %o0
mov 0, %o1
call gettimeofday, 0
nop
ldd [%fp-24], %o4
clr %g1
clr %g2
mov %o4, %g1
loop:
add %g2, 1, %g2
subcc %g1, 1, %g1
bne loop
nop
mov %g2, %o4
std %o4, [%fp-32]
add %fp, -100, %g1
mov %g1, %o0
mov 0, %o1
call gettimeofday, 0
nop
ld [%fp-100], %g2
ld [%fp-92], %g1
sub %g2, %g1, %g2
sethi %hi(999424), %g1
or %g1, 576, %g1
smul %g2, %g1, %g3
ld [%fp-96], %g2
ld [%fp-88], %g1
sub %g2, %g1, %g1
add %g3, %g1, %g1
st %g1, [%fp-12]
sra %g1, 31, %g1
st %g1, [%fp-16]
ldd [%fp-64], %o4
ldd [%fp-16], %g2
addcc %o5, %g3, %g3
addx %o4, %g2, %g2
std %g2, [%fp-64]
ld [%fp-16], %g2
ld [%fp-12], %g1
smul %g2, %g1, %g4
ld [%fp-16], %g2
ld [%fp-12], %g1
smul %g2, %g1, %g1
add %g4, %g1, %g4
ld [%fp-12], %g2
ld [%fp-12], %g1
umul %g2, %g1, %g3
rd %y, %g2
add %g4, %g2, %g4
mov %g4, %g2
ldd [%fp-56], %o4
addcc %o5, %g3, %g3
addx %o4, %g2, %g2
std %g2, [%fp-56]
sethi %hi(.LLC3), %g1
or %g1, %lo(.LLC3), %o0
ld [%fp-16], %o1
ld [%fp-12], %o2
call printf, 0
nop
sethi %hi(.LLC4), %g1
or %g1, %lo(.LLC4), %o0
ld [%fp-64], %o1
ld [%fp-60], %o2
call printf, 0
nop
sethi %hi(.LLC5), %g1
or %g1, %lo(.LLC5), %o0
ld [%fp-56], %o1
ld [%fp-52], %o2
call printf, 0
nop
sethi %hi(.LLC6), %g1
or %g1, %lo(.LLC6), %o0
ld [%fp-32], %o1
ld [%fp-28], %o2
call printf, 0
nop
ld [%fp-84], %g1
add %g1, 1, %g1
st %g1, [%fp-84]
.LL2:
ld [%fp-84], %g2
ld [%fp-68], %g1
cmp %g2, %g1
bl .LL3
nop
ld [%fp-68], %g1
sra %g1, 31, %g1
ld [%fp-68], %g3
mov %g1, %g2
ldd [%fp-64], %o0
mov %g2, %o2
mov %g3, %o3
call __udivdi3, 0
nop
mov %o0, %g2
mov %o1, %g3
std %g2, [%fp-136]
ldd [%fp-136], %o0
mov 0, %o2
mov 0, %o3
call __cmpdi2, 0
nop
mov %o0, %g1
cmp %g1, 1
bl .LL6
nop
ldd [%fp-136], %o0
call __floatdidf, 0
nop
std %f0, [%fp-144]
b .LL5
nop
.LL6:
ldd [%fp-136], %o4
and %o4, 0, %g2
and %o5, 1, %g3
ld [%fp-136], %o5
sll %o5, 31, %g1
ld [%fp-132], %g4
srl %g4, 1, %o5
or %o5, %g1, %o5
ld [%fp-136], %g1
srl %g1, 1, %o4
or %g2, %o4, %g2
or %g3, %o5, %g3
mov %g2, %o0
mov %g3, %o1
call __floatdidf, 0
nop
std %f0, [%fp-144]
ldd [%fp-144], %f8
ldd [%fp-144], %f10
faddd %f8, %f10, %f8
std %f8, [%fp-144]
.LL5:
ldd [%fp-144], %f8
std %f8, [%fp-48]
ld [%fp-68], %g1
sra %g1, 31, %g1
ld [%fp-68], %g3
mov %g1, %g2
ldd [%fp-56], %o0
mov %g2, %o2
mov %g3, %o3
call __udivdi3, 0
nop
mov %o0, %g2
mov %o1, %g3
std %g2, [%fp-128]
ldd [%fp-128], %o0
mov 0, %o2
mov 0, %o3
call __cmpdi2, 0
nop
mov %o0, %g1
cmp %g1, 1
bl .LL8
nop
ldd [%fp-128], %o0
call __floatdidf, 0
nop
std %f0, [%fp-120]
b .LL7
nop
.LL8:
ldd [%fp-128], %o4
and %o4, 0, %g2
and %o5, 1, %g3
ld [%fp-128], %o5
sll %o5, 31, %g1
ld [%fp-124], %g4
srl %g4, 1, %o5
or %o5, %g1, %o5
ld [%fp-128], %g1
srl %g1, 1, %o4
or %g2, %o4, %g2
or %g3, %o5, %g3
mov %g2, %o0
mov %g3, %o1
call __floatdidf, 0
nop
std %f0, [%fp-120]
ldd [%fp-120], %f8
ldd [%fp-120], %f10
faddd %f8, %f10, %f8
std %f8, [%fp-120]
.LL7:
ldd [%fp-48], %f8
ldd [%fp-48], %f10
fmuld %f8, %f10, %f8
ldd [%fp-120], %f10
fsubd %f10, %f8, %f8
std %f8, [%fp-112]
ldd [%fp-112], %f8
fsqrtd %f8, %f8
std %f8, [%fp-152]
ldd [%fp-152], %f10
ldd [%fp-152], %f8
fcmpd %f10, %f8
nop
fbe .LL9
nop
ldd [%fp-112], %o0
call sqrt, 0
nop
std %f0, [%fp-152]
.LL9:
ldd [%fp-152], %f8
std %f8, [%fp-40]
sethi %hi(.LLC7), %g1
or %g1, %lo(.LLC7), %o0
ld [%fp-48], %o1
ld [%fp-44], %o2
ld [%fp-40], %o3
ld [%fp-36], %o4
call printf, 0
nop
mov 0, %g1
mov %g1, %i0
restore
jmp %o7+8
nop
.size main, .-main
.ident "GCC: (GNU) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)"
.section ".note.GNU-stack"
当我用nLoop = 5000000000 (5.0e+9)
生成汇编代码版本时,差异如下图所示(vimdiff
):
“40亿”版本的区块:
mov 0, %g2
sethi %hi(4000000000), %g3
在“50 亿”版本中被替换为:
mov 1, %g2
sethi %hi(705032192), %g3
or %g3, 512, %g3
我可以看到 5.0+e9
不能在 32 位上编码,因为指令
sethi %hi(705032192), %g3
奇怪的是,当我编译版本“5 Billions”汇编代码时,输出参数 sum
计算得很好,即等于 5 Billions
,我无法解释它。
欢迎任何帮助或评论,谢谢。
您似乎在对 64 位值的二分之一进行 32 位运算
从生成的代码中,此处 nLoop
是 double-load 到 %o4
和 %o5
(因为它是 64 -bit long long
值):
ldd [%fp-24], %o4
clr %g1
clr %g2
然后您只需使用 %o4
:
mov %o4, %g1 ; <---- what about %o5????
loop:
add %g2, 1, %g2
subcc %g1, 1, %g1
bne loop
nop
mov %g2, %o4
要完成此工作,请重新编写汇编代码以将 %o4
+ %o5
一起视为 64 位值。
很大程度上取决于您使用的 sparc 版本和 ABI。如果您使用的是 sparc v8 或更早版本,则只有 32 位寄存器的 32 位 mode。在这种情况下,当您尝试将 5000000000 加载到 32 位寄存器时,它会失败并加载 5000000000 mod 232(即 705032704)。这似乎正在发生。
另一方面,如果您在 32 位 mode(通常称为 v8plus)中有一个 64 位 sparc 处理器 运行,那么您可以使用 64 位寄存器,所以这个会工作。
我编写了一个小程序来测量花在循环中的时间(通过内联 Sparc 汇编代码片段)。
一切正常,直到我将迭代次数设置为大约 4.0+9(高于 2^32)。
这是代码片段:
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>
#include <stdint.h>
int main (int argc, char *argv[])
{
// For indices
int i;
// Set the number of executions
int nRunning = atoi(argv[1]);
// Set the sums
double avgSum = 0.0;
double stdSum = 0.0;
// Average of execution time
double averageRuntime = 0.0;
// Standard deviation of execution time
double deviationRuntime = 0.0;
// Init sum
unsigned long long int sum = 0;
// Number of iterations
unsigned long long int nLoop = 4000000000ULL;
//uint64_t nLoop = 4000000000;
// DEBUG
printf("sizeof(unsigned long long int) = %zu\n",sizeof(unsigned long long int));
printf("sizeof(unsigned long int) = %zu\n",sizeof(unsigned long int));
// Time intervals
struct timeval tv1, tv2;
double diff;
// Loop for multiple executions
for (i=0; i<nRunning; i++)
{
// Start time
gettimeofday (&tv1, NULL);
// Loop with Sparc assembly into C source
asm volatile ("clr %%g1\n\t"
"clr %%g2\n\t"
"mov %1, %%g1\n" // %1 = input parameter
"loop:\n\t"
"add %%g2, 1, %%g2\n\t"
"subcc %%g1, 1, %%g1\n\t"
"bne loop\n\t"
"nop\n\t"
"mov %%g2, %0\n" // %0 = output parameter
: "=r" (sum) // output
: "r" (nLoop) // input
: "g1", "g2"); // clobbers
// End time
gettimeofday (&tv2, NULL);
// Compute runtime for loop
diff = (tv2.tv_sec - tv1.tv_sec) * 1000000ULL + (tv2.tv_usec - tv1.tv_usec);
// Summing diff time
avgSum += diff;
stdSum += (diff*diff);
// DEBUG
printf("diff = %e\n", diff);
printf("avgSum = %e\n", avgSum);
}
// Compute final averageRuntime
averageRuntime = avgSum/nRunning;
// Compute standard deviation
deviationRuntime = sqrt(stdSum/nRunning-averageRuntime*averageRuntime);
// Print results
printf("(Average Elapsed time, Standard deviation) = %e usec %e usec\n", averageRuntime, deviationRuntime);
// Print sum from assembly loop
printf("Sum = %llu\n", sum);
例如,nLoop
< 2^32,我得到 diff
、avgSum
和 stdSum
的正确值。事实上, printf
和 nLoop = 4.0e+9
给出:
sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 9.617167e+06
avgSum = 9.617167e+06
diff = 9.499878e+06
avgSum = 1.911704e+07
(Average Elapsed time, Standard deviation) = 9.558522e+06 usec 5.864450e+04 usec
Sum = 4000000000
代码是在 Debian Sparc 32 bits Etch
上用 gcc 4.1.2
编译的。
不幸的是,如果我以 nLoop = 5.0e+9
为例,我得到的测量时间值很小且不正确;这是本例中的 printf 输出:
sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 5.800000e+01
avgSum = 5.800000e+01
diff = 4.000000e+00
avgSum = 6.200000e+01
(Average Elapsed time, Standard deviation) = 3.100000e+01 usec 2.700000e+01 usec
Sum = 5000000000
我不知道问题出在哪里,我使用 uint64_t
进行了其他测试但没有成功。
可能问题是我用 32 位处理 large integers (> 2^32)
OS 或者它可能是不支持 8 字节整数的汇编内联代码。
如果有人能给我一些线索来修复这个错误,
此致
更新 1 :
按照 @Andrew Henle
的建议,我采用了相同的代码,但没有使用内联 Sparc 程序集片段,而是放置了一个简单的循环。
这是带有简单循环的程序,它得到了 nLoop = 5.0e+9
(参见行“unsigned long long int nLoop = 5000000000ULL;
”,因此在 limit 2^32-1
上方:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>
#include <stdint.h>
int main (int argc, char *argv[])
{
// For indices of nRunning
int i;
// For indices of nRunning
unsigned long long int j;
// Set the number of executions
int nRunning = atoi(argv[1]);
// Set the sums
unsigned long long int avgSum = 0;
unsigned long long int stdSum = 0;
// Average of execution time
double averageRuntime = 0.0;
// Standard deviation of execution time
double deviationRuntime = 0.0;
// Init sum
unsigned long long int sum;
// Number of iterations
unsigned long long int nLoop = 5000000000ULL;
// DEBUG
printf("sizeof(unsigned long long int) = %zu\n",sizeof(unsigned long long int));
printf("sizeof(unsigned long int) = %zu\n",sizeof(unsigned long int));
// Time intervals
struct timeval tv1, tv2;
unsigned long long int diff;
// Loop for multiple executions
for (i=0; i<nRunning; i++)
{
// Reset sum
sum = 0;
// Start time
gettimeofday (&tv1, NULL);
// Loop with Sparc assembly into C source
/* asm volatile ("clr %%g1\n\t"
"clr %%g2\n\t"
"mov %1, %%g1\n" // %1 = input parameter
"loop:\n\t"
"add %%g2, 1, %%g2\n\t"
"subcc %%g1, 1, %%g1\n\t"
"bne loop\n\t"
"nop\n\t"
"mov %%g2, %0\n" // %0 = output parameter
: "=r" (sum) // output
: "r" (nLoop) // input
: "g1", "g2"); // clobbers
*/
// Classic loop
for (j=0; j<nLoop; j++)
sum ++;
// End time
gettimeofday (&tv2, NULL);
// Compute runtime for loop
diff = (unsigned long long int) ((tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec));
// Summing diff time
avgSum += diff;
stdSum += (diff*diff);
// DEBUG
printf("diff = %llu\n", diff);
printf("avgSum = %llu\n", avgSum);
printf("stdSum = %llu\n", stdSum);
// Print sum from assembly loop
printf("Sum = %llu\n", sum);
}
// Compute final averageRuntime
averageRuntime = avgSum/nRunning;
// Compute standard deviation
deviationRuntime = sqrt(stdSum/nRunning-averageRuntime*averageRuntime);
// Print results
printf("(Average Elapsed time, Standard deviation) = %e usec %e usec\n", averageRuntime, deviationRuntime);
return 0;
}
此代码片段运行良好,即变量 sum
打印为
(参见“printf("Sum = %llu\n", sum)
”;):
Sum = 5000000000
所以问题出在带有 Sparc Assembly 块的版本上。
我怀疑,在这个汇编代码中,行 "mov %1, %%g1\n" // %1 = input parameter
错误地将 nLoop
存储到 %g1 register
中(我认为 %g1
是一个 32 位寄存器,所以无法存储高于 2^32-1
) 的值。
但是,输出参数(变量 sum
)位于 :
"mov %%g2, %0\n" // %0 = output parameter
高于限制,因为它等于 5000000000。
我在有和没有汇编循环的版本之间附加了 vimdiff :
左边是 With Assembly 编程,右边是 Without Assembly(只是一个简单的循环代替)
我提醒你我的问题是,对于 nLoop > 2^32-1 和汇编循环,我在执行结束时得到一个有效的 sum
参数但无效(太短)average
和 standard deviation
次(进入循环);这是 nLoop = 5000000000ULL
的输出示例:
sizeof(unsigned long long int) = 8
sizeof(unsigned long int) = 4
diff = 17
avgSum = 17
stdSum = 289
Sum = 5000000000
diff = 4
avgSum = 21
stdSum = 305
Sum = 5000000000
(Average Elapsed time, Standard deviation) = 1.000000e+01 usec 7.211103e+00 usec
取nLoop = 4.0e+9
,即nLoop = 4000000000ULL
,没有问题,时间值有效。
更新 2:
我正在通过生成汇编代码进行更深入的搜索。带nLoop = 4000000000 (4.0e+9)
的版本如下:
.file "loop-WITH-asm-inline-4-Billions.c"
.section ".rodata"
.align 8
.LLC1:
.asciz "sizeof(unsigned long long int) = %zu\n"
.align 8
.LLC2:
.asciz "sizeof(unsigned long int) = %zu\n"
.align 8
.LLC3:
.asciz "diff = %llu\n"
.align 8
.LLC4:
.asciz "avgSum = %llu\n"
.align 8
.LLC5:
.asciz "stdSum = %llu\n"
.align 8
.LLC6:
.asciz "Sum = %llu\n"
.global __udivdi3
.global __cmpdi2
.global __floatdidf
.align 8
.LLC7:
.asciz "(Average Elapsed time, Standard deviation) = %e usec %e usec\n"
.align 8
.LLC0:
.long 0
.long 0
.section ".text"
.align 4
.global main
.type main, #function
.proc 04
main:
save %sp, -248, %sp
st %i0, [%fp+68]
st %i1, [%fp+72]
ld [%fp+72], %g1
add %g1, 4, %g1
ld [%g1], %g1
mov %g1, %o0
call atoi, 0
nop
mov %o0, %g1
st %g1, [%fp-68]
st %g0, [%fp-64]
st %g0, [%fp-60]
st %g0, [%fp-56]
st %g0, [%fp-52]
sethi %hi(.LLC0), %g1
or %g1, %lo(.LLC0), %g1
ldd [%g1], %f8
std %f8, [%fp-48]
sethi %hi(.LLC0), %g1
or %g1, %lo(.LLC0), %g1
ldd [%g1], %f8
std %f8, [%fp-40]
mov 0, %g2
sethi %hi(4000000000), %g3
std %g2, [%fp-24]
sethi %hi(.LLC1), %g1
or %g1, %lo(.LLC1), %o0
mov 8, %o1
call printf, 0
nop
sethi %hi(.LLC2), %g1
or %g1, %lo(.LLC2), %o0
mov 4, %o1
call printf, 0
nop
st %g0, [%fp-84]
b .LL2
nop
.LL3:
st %g0, [%fp-32]
st %g0, [%fp-28]
add %fp, -92, %g1
mov %g1, %o0
mov 0, %o1
call gettimeofday, 0
nop
ldd [%fp-24], %o4
clr %g1
clr %g2
mov %o4, %g1
loop:
add %g2, 1, %g2
subcc %g1, 1, %g1
bne loop
nop
mov %g2, %o4
std %o4, [%fp-32]
add %fp, -100, %g1
mov %g1, %o0
mov 0, %o1
call gettimeofday, 0
nop
ld [%fp-100], %g2
ld [%fp-92], %g1
sub %g2, %g1, %g2
sethi %hi(999424), %g1
or %g1, 576, %g1
smul %g2, %g1, %g3
ld [%fp-96], %g2
ld [%fp-88], %g1
sub %g2, %g1, %g1
add %g3, %g1, %g1
st %g1, [%fp-12]
sra %g1, 31, %g1
st %g1, [%fp-16]
ldd [%fp-64], %o4
ldd [%fp-16], %g2
addcc %o5, %g3, %g3
addx %o4, %g2, %g2
std %g2, [%fp-64]
ld [%fp-16], %g2
ld [%fp-12], %g1
smul %g2, %g1, %g4
ld [%fp-16], %g2
ld [%fp-12], %g1
smul %g2, %g1, %g1
add %g4, %g1, %g4
ld [%fp-12], %g2
ld [%fp-12], %g1
umul %g2, %g1, %g3
rd %y, %g2
add %g4, %g2, %g4
mov %g4, %g2
ldd [%fp-56], %o4
addcc %o5, %g3, %g3
addx %o4, %g2, %g2
std %g2, [%fp-56]
sethi %hi(.LLC3), %g1
or %g1, %lo(.LLC3), %o0
ld [%fp-16], %o1
ld [%fp-12], %o2
call printf, 0
nop
sethi %hi(.LLC4), %g1
or %g1, %lo(.LLC4), %o0
ld [%fp-64], %o1
ld [%fp-60], %o2
call printf, 0
nop
sethi %hi(.LLC5), %g1
or %g1, %lo(.LLC5), %o0
ld [%fp-56], %o1
ld [%fp-52], %o2
call printf, 0
nop
sethi %hi(.LLC6), %g1
or %g1, %lo(.LLC6), %o0
ld [%fp-32], %o1
ld [%fp-28], %o2
call printf, 0
nop
ld [%fp-84], %g1
add %g1, 1, %g1
st %g1, [%fp-84]
.LL2:
ld [%fp-84], %g2
ld [%fp-68], %g1
cmp %g2, %g1
bl .LL3
nop
ld [%fp-68], %g1
sra %g1, 31, %g1
ld [%fp-68], %g3
mov %g1, %g2
ldd [%fp-64], %o0
mov %g2, %o2
mov %g3, %o3
call __udivdi3, 0
nop
mov %o0, %g2
mov %o1, %g3
std %g2, [%fp-136]
ldd [%fp-136], %o0
mov 0, %o2
mov 0, %o3
call __cmpdi2, 0
nop
mov %o0, %g1
cmp %g1, 1
bl .LL6
nop
ldd [%fp-136], %o0
call __floatdidf, 0
nop
std %f0, [%fp-144]
b .LL5
nop
.LL6:
ldd [%fp-136], %o4
and %o4, 0, %g2
and %o5, 1, %g3
ld [%fp-136], %o5
sll %o5, 31, %g1
ld [%fp-132], %g4
srl %g4, 1, %o5
or %o5, %g1, %o5
ld [%fp-136], %g1
srl %g1, 1, %o4
or %g2, %o4, %g2
or %g3, %o5, %g3
mov %g2, %o0
mov %g3, %o1
call __floatdidf, 0
nop
std %f0, [%fp-144]
ldd [%fp-144], %f8
ldd [%fp-144], %f10
faddd %f8, %f10, %f8
std %f8, [%fp-144]
.LL5:
ldd [%fp-144], %f8
std %f8, [%fp-48]
ld [%fp-68], %g1
sra %g1, 31, %g1
ld [%fp-68], %g3
mov %g1, %g2
ldd [%fp-56], %o0
mov %g2, %o2
mov %g3, %o3
call __udivdi3, 0
nop
mov %o0, %g2
mov %o1, %g3
std %g2, [%fp-128]
ldd [%fp-128], %o0
mov 0, %o2
mov 0, %o3
call __cmpdi2, 0
nop
mov %o0, %g1
cmp %g1, 1
bl .LL8
nop
ldd [%fp-128], %o0
call __floatdidf, 0
nop
std %f0, [%fp-120]
b .LL7
nop
.LL8:
ldd [%fp-128], %o4
and %o4, 0, %g2
and %o5, 1, %g3
ld [%fp-128], %o5
sll %o5, 31, %g1
ld [%fp-124], %g4
srl %g4, 1, %o5
or %o5, %g1, %o5
ld [%fp-128], %g1
srl %g1, 1, %o4
or %g2, %o4, %g2
or %g3, %o5, %g3
mov %g2, %o0
mov %g3, %o1
call __floatdidf, 0
nop
std %f0, [%fp-120]
ldd [%fp-120], %f8
ldd [%fp-120], %f10
faddd %f8, %f10, %f8
std %f8, [%fp-120]
.LL7:
ldd [%fp-48], %f8
ldd [%fp-48], %f10
fmuld %f8, %f10, %f8
ldd [%fp-120], %f10
fsubd %f10, %f8, %f8
std %f8, [%fp-112]
ldd [%fp-112], %f8
fsqrtd %f8, %f8
std %f8, [%fp-152]
ldd [%fp-152], %f10
ldd [%fp-152], %f8
fcmpd %f10, %f8
nop
fbe .LL9
nop
ldd [%fp-112], %o0
call sqrt, 0
nop
std %f0, [%fp-152]
.LL9:
ldd [%fp-152], %f8
std %f8, [%fp-40]
sethi %hi(.LLC7), %g1
or %g1, %lo(.LLC7), %o0
ld [%fp-48], %o1
ld [%fp-44], %o2
ld [%fp-40], %o3
ld [%fp-36], %o4
call printf, 0
nop
mov 0, %g1
mov %g1, %i0
restore
jmp %o7+8
nop
.size main, .-main
.ident "GCC: (GNU) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)"
.section ".note.GNU-stack"
当我用nLoop = 5000000000 (5.0e+9)
生成汇编代码版本时,差异如下图所示(vimdiff
):
“40亿”版本的区块:
mov 0, %g2
sethi %hi(4000000000), %g3
在“50 亿”版本中被替换为:
mov 1, %g2
sethi %hi(705032192), %g3
or %g3, 512, %g3
我可以看到 5.0+e9
不能在 32 位上编码,因为指令
sethi %hi(705032192), %g3
奇怪的是,当我编译版本“5 Billions”汇编代码时,输出参数 sum
计算得很好,即等于 5 Billions
,我无法解释它。
欢迎任何帮助或评论,谢谢。
您似乎在对 64 位值的二分之一进行 32 位运算
从生成的代码中,此处 nLoop
是 double-load 到 %o4
和 %o5
(因为它是 64 -bit long long
值):
ldd [%fp-24], %o4
clr %g1
clr %g2
然后您只需使用 %o4
:
mov %o4, %g1 ; <---- what about %o5????
loop:
add %g2, 1, %g2
subcc %g1, 1, %g1
bne loop
nop
mov %g2, %o4
要完成此工作,请重新编写汇编代码以将 %o4
+ %o5
一起视为 64 位值。
很大程度上取决于您使用的 sparc 版本和 ABI。如果您使用的是 sparc v8 或更早版本,则只有 32 位寄存器的 32 位 mode。在这种情况下,当您尝试将 5000000000 加载到 32 位寄存器时,它会失败并加载 5000000000 mod 232(即 705032704)。这似乎正在发生。
另一方面,如果您在 32 位 mode(通常称为 v8plus)中有一个 64 位 sparc 处理器 运行,那么您可以使用 64 位寄存器,所以这个会工作。