如何解释 CUDA 的 inline PTX Internal Compiler Error
How to explain inline PTX Internal Compiler Error of CUDA
我想测量 GPU 全局内存的缓存行为,下面是我设计的微基准测试。我想做的是从全局内存地址 r_addr0 加载数据,并将其存入共享内存 s_tvalue[0]。出于某些原因,我需要用内联 PTX 代码替换这条全局内存加载指令。
i = *r_addr0;
//asm("ldu.global.f64.cs %1, [%2];":"=l"(i):"l"(r_addr0));
s_tvalue[0] = i;
然而,当我用 nvcc 编译时,编译器报告了如下错误:
error: Internal Compiler Error (codegen): "asm operand index requested is larger than the number of asm operands provided!"
有谁知道我的代码为什么会触发这个错误?
完整代码见下:
/*
 * Micro-benchmark kernel: times two 64-bit global-memory loads issued through
 * inline PTX and reports the elapsed clock() ticks plus the loaded values.
 *
 * my_array     - address read by the first load; the second load reads the
 *                address obtained by flipping bit `position` of it.
 * array_length - not used by this kernel.
 * position     - index of the address bit flipped for the second load.
 * d_time       - output: [0] = end_time - start_time,
 *                        [1] = value loaded from the first address,
 *                        [2] = value loaded from the second address.
 */
__global__ void global_latency (long long * my_array,
long long array_length, int position,
long long *d_time)
{
    unsigned int start_time, end_time;
    __shared__ long long s_tvalue[2]; // 2: number of threads per block
    long long i, j;

    for (int k = 0; k < 2; k++) {
        s_tvalue[k] = 0LL;
    }

    // 1LL keeps the shift 64 bits wide so any address bit can be selected.
    long long addr0 = (long long)my_array;
    long long addr1 = addr0 ^ (1LL << position);
    long long *r_addr0 = (long long *)addr0;
    long long *r_addr1 = (long long *)addr1;

    start_time = clock();

    // Inline-PTX placeholders are numbered from %0 in operand-list order
    // (outputs first, then inputs). With one output and one input only %0
    // and %1 exist; a higher index makes nvcc fail with "asm operand index
    // requested is larger than the number of asm operands provided!".
    // volatile keeps the compiler from deleting or moving the timed loads.
    // ldu has no cache-operator slot in the PTX syntax, so none is given.
    asm volatile("ldu.global.u64 %0, [%1];" : "=l"(i) : "l"(r_addr0));
    s_tvalue[0] = i;

    // For ld, the cache operator (.cs) goes before the type suffix.
    asm volatile("ld.global.cs.u64 %0, [%1];" : "=l"(j) : "l"(r_addr1));
    s_tvalue[1] = j;

    end_time = clock();

    d_time[0] = end_time - start_time;
    d_time[1] = s_tvalue[0];
    // %p expects a pointer argument, so pass the pointer rather than the
    // integer copy of the address.
    printf("[%p]=%lld\n", (void *)r_addr0, d_time[1]);
    d_time[2] = s_tvalue[1];
    printf("[%p]=%lld\n", (void *)r_addr1, d_time[2]);
}
根据我的经验,内联汇编中的操作数占位符是从零开始编号的。由于您只提供了 2 个操作数,可用的占位符只有 %0 和 %1。而您使用了 %2,所以编译器才会报告 "is larger than the number of asm operands provided."。把两条 asm 语句中的占位符都改成 %0 和 %1 即可。
我想测量 GPU 全局内存的缓存行为,下面是我设计的微基准测试。我想做的是从全局内存地址 r_addr0 加载数据,并将其存入共享内存 s_tvalue[0]。出于某些原因,我需要用内联 PTX 代码替换这条全局内存加载指令。
i = *r_addr0;
//asm("ldu.global.f64.cs %1, [%2];":"=l"(i):"l"(r_addr0));
s_tvalue[0] = i;
然而,当我用 nvcc 编译时,编译器报告了如下错误:
error: Internal Compiler Error (codegen): "asm operand index requested is larger than the number of asm operands provided!"
有谁知道我的代码为什么会触发这个错误?
完整代码见下:
/*
 * Micro-benchmark kernel: times two 64-bit global-memory loads issued through
 * inline PTX and reports the elapsed clock() ticks plus the loaded values.
 *
 * my_array     - address read by the first load; the second load reads the
 *                address obtained by flipping bit `position` of it.
 * array_length - not used by this kernel.
 * position     - index of the address bit flipped for the second load.
 * d_time       - output: [0] = end_time - start_time,
 *                        [1] = value loaded from the first address,
 *                        [2] = value loaded from the second address.
 */
__global__ void global_latency (long long * my_array,
long long array_length, int position,
long long *d_time)
{
    unsigned int start_time, end_time;
    __shared__ long long s_tvalue[2]; // 2: number of threads per block
    long long i, j;

    for (int k = 0; k < 2; k++) {
        s_tvalue[k] = 0LL;
    }

    // 1LL keeps the shift 64 bits wide so any address bit can be selected.
    long long addr0 = (long long)my_array;
    long long addr1 = addr0 ^ (1LL << position);
    long long *r_addr0 = (long long *)addr0;
    long long *r_addr1 = (long long *)addr1;

    start_time = clock();

    // Inline-PTX placeholders are numbered from %0 in operand-list order
    // (outputs first, then inputs). With one output and one input only %0
    // and %1 exist; a higher index makes nvcc fail with "asm operand index
    // requested is larger than the number of asm operands provided!".
    // volatile keeps the compiler from deleting or moving the timed loads.
    // ldu has no cache-operator slot in the PTX syntax, so none is given.
    asm volatile("ldu.global.u64 %0, [%1];" : "=l"(i) : "l"(r_addr0));
    s_tvalue[0] = i;

    // For ld, the cache operator (.cs) goes before the type suffix.
    asm volatile("ld.global.cs.u64 %0, [%1];" : "=l"(j) : "l"(r_addr1));
    s_tvalue[1] = j;

    end_time = clock();

    d_time[0] = end_time - start_time;
    d_time[1] = s_tvalue[0];
    // %p expects a pointer argument, so pass the pointer rather than the
    // integer copy of the address.
    printf("[%p]=%lld\n", (void *)r_addr0, d_time[1]);
    d_time[2] = s_tvalue[1];
    printf("[%p]=%lld\n", (void *)r_addr1, d_time[2]);
}
根据我的经验,内联汇编中的操作数占位符是从零开始编号的。由于您只提供了 2 个操作数,可用的占位符只有 %0 和 %1。而您使用了 %2,所以编译器才会报告 "is larger than the number of asm operands provided."。把两条 asm 语句中的占位符都改成 %0 和 %1 即可。