CUDA Funnel Shift 代码生成
CUDA Funnel Shift code generation
CUDA 是否在使用 64 位整数时在 SM35 内部生成漏斗移位指令?我的内核使用 SM35 目标编译到 PTX,它显示常规 shl.b64 指令。我知道此 PTX 输出尚未完全优化,一旦加载模块 (cuModuleLoad),将生成原生 arch 代码。
shl.b64 %rd5, %rd4, 16;
驱动API应该不会成为代码分析的障碍。尝试使用运行时 API 编写测试用例。在评论中回复此问题:
But really I'm wondering about whether the final device code generated out of "shl.b64" would be a shf of the lower and upper 32-bit words.
我认为在某些情况下,ptxas(包括驱动程序 JIT 引擎)可以将 PTX 逻辑移位指令转换为 PTX 漏斗移位指令 (shf) 的 SASS 等效指令 (SHF)。
这是一个完整的示例:
$ cat t625.cu
#include <stdio.h>
__global__ void my_kernel(unsigned long long data)
{
unsigned long long my_data = data >> 15;
printf("data = %ld\n",my_data);
}
int main(){
my_kernel<<<1,1>>>(2ULL<<40);
cudaDeviceSynchronize();
}
[bob@cluster1 misc]$ nvcc -arch=sm_35 -ptx t625.cu
[bob@cluster1 misc]$ cat t625.ptx
*************EXCERPT**************
// .globl _Z9my_kernely
.visible .entry _Z9my_kernely(
.param .u64 _Z9my_kernely_param_0
)
{
.local .align 8 .b8 __local_depot6[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .s32 %r<2>;
.reg .s64 %rd<8>;
mov.u64 %rd7, __local_depot6;
cvta.local.u64 %SP, %rd7;
ld.param.u64 %rd1, [_Z9my_kernely_param_0];
add.u64 %rd2, %SP, 0;
cvta.to.local.u64 %rd3, %rd2;
shr.u64 %rd4, %rd1, 15; ***** NOTE *****
st.local.u64 [%rd3], %rd4;
mov.u64 %rd5, $str;
cvta.global.u64 %rd6, %rd5;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd6;
.param .b64 param1;
st.param.b64 [param1+0], %rd2;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r1, [retval0+0];
//{
}// Callseq End 0
ret;
}
*************EXCERPT**************
$ nvcc -arch=sm_35 t625.cu -o t625
$ cuobjdump -sass t625
*************EXCERPT**************
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_35
Function : _Z9my_kernely
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0880109c10801000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ ISUB R1, R1, 0x8; /* 0xc0880000041c0405 */
/*0018*/ MOV R0, c[0x0][0x144]; /* 0x64c03c00289c0002 */
/*0020*/ MOV R3, c[0x0][0x140]; /* 0x64c03c00281c000e */
/*0028*/ MOV32I R4, 0x0; /* 0x74000000001fc012 */
/*0030*/ LOP.OR R6, R1, c[0x0][0x24]; /* 0x62001000049c041a */
/*0038*/ MOV32I R5, 0x0; /* 0x74000000001fc016 */
/* 0x0880b80010a0109c */
/*0048*/ SHF.R.U64 R2, R3, 0xf, R0; ***** NOTE *****
/*0050*/ SHF.R.U64.HI R3, RZ, 0xf, R0; ***** NOTE *****
/*0058*/ LOP32I.AND R0, R6, 0xffffff; /* 0x20007fffff9c1800 */
/*0060*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*0068*/ STL.64 [R0], R2; /* 0x7aa80000001c000a */
/*0070*/ JCAL 0x0; /* 0x1100000000000100 */
/*0078*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/* 0x08000000000000b8 */
/*0088*/ EXIT; /* 0x18000000001c003c */
/*0090*/ BRA 0x90; /* 0x12007ffffc1c003c */
/*0098*/ NOP; /* 0x85800000001c3c02 */
/*00a0*/ NOP; /* 0x85800000001c3c02 */
/*00a8*/ NOP; /* 0x85800000001c3c02 */
/*00b0*/ NOP; /* 0x85800000001c3c02 */
/*00b8*/ NOP; /* 0x85800000001c3c02 */
..............................
Fatbin ptx code:
================
arch = sm_35
code version = [4,2]
producer = cuda
host = linux
compile_size = 64bit
compressed
[bob@cluster1 misc]$
*************EXCERPT**************
在 ***** NOTE *****
标记的行中,PTX shr instruction (non-funnel-shift) is being converted to SASS SHF(漏斗转移)说明。
CUDA 是否在使用 64 位整数时在 SM35 内部生成漏斗移位指令?我的内核使用 SM35 目标编译到 PTX,它显示常规 shl.b64 指令。我知道此 PTX 输出尚未完全优化,一旦加载模块 (cuModuleLoad),将生成原生 arch 代码。
shl.b64 %rd5, %rd4, 16;
驱动API应该不会成为代码分析的障碍。尝试使用运行时 API 编写测试用例。在评论中回复此问题:
But really I'm wondering about whether the final device code generated out of "shl.b64" would be a shf of the lower and upper 32-bit words.
我认为在某些情况下,ptxas(包括驱动程序 JIT 引擎)可以将 PTX 逻辑移位指令转换为 PTX 漏斗移位指令 (shf) 的 SASS 等效指令 (SHF)。
这是一个完整的示例:
$ cat t625.cu
#include <stdio.h>
__global__ void my_kernel(unsigned long long data)
{
unsigned long long my_data = data >> 15;
printf("data = %ld\n",my_data);
}
int main(){
my_kernel<<<1,1>>>(2ULL<<40);
cudaDeviceSynchronize();
}
[bob@cluster1 misc]$ nvcc -arch=sm_35 -ptx t625.cu
[bob@cluster1 misc]$ cat t625.ptx
*************EXCERPT**************
// .globl _Z9my_kernely
.visible .entry _Z9my_kernely(
.param .u64 _Z9my_kernely_param_0
)
{
.local .align 8 .b8 __local_depot6[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .s32 %r<2>;
.reg .s64 %rd<8>;
mov.u64 %rd7, __local_depot6;
cvta.local.u64 %SP, %rd7;
ld.param.u64 %rd1, [_Z9my_kernely_param_0];
add.u64 %rd2, %SP, 0;
cvta.to.local.u64 %rd3, %rd2;
shr.u64 %rd4, %rd1, 15; ***** NOTE *****
st.local.u64 [%rd3], %rd4;
mov.u64 %rd5, $str;
cvta.global.u64 %rd6, %rd5;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd6;
.param .b64 param1;
st.param.b64 [param1+0], %rd2;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r1, [retval0+0];
//{
}// Callseq End 0
ret;
}
*************EXCERPT**************
$ nvcc -arch=sm_35 t625.cu -o t625
$ cuobjdump -sass t625
*************EXCERPT**************
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_35
Function : _Z9my_kernely
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0880109c10801000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ ISUB R1, R1, 0x8; /* 0xc0880000041c0405 */
/*0018*/ MOV R0, c[0x0][0x144]; /* 0x64c03c00289c0002 */
/*0020*/ MOV R3, c[0x0][0x140]; /* 0x64c03c00281c000e */
/*0028*/ MOV32I R4, 0x0; /* 0x74000000001fc012 */
/*0030*/ LOP.OR R6, R1, c[0x0][0x24]; /* 0x62001000049c041a */
/*0038*/ MOV32I R5, 0x0; /* 0x74000000001fc016 */
/* 0x0880b80010a0109c */
/*0048*/ SHF.R.U64 R2, R3, 0xf, R0; ***** NOTE *****
/*0050*/ SHF.R.U64.HI R3, RZ, 0xf, R0; ***** NOTE *****
/*0058*/ LOP32I.AND R0, R6, 0xffffff; /* 0x20007fffff9c1800 */
/*0060*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*0068*/ STL.64 [R0], R2; /* 0x7aa80000001c000a */
/*0070*/ JCAL 0x0; /* 0x1100000000000100 */
/*0078*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/* 0x08000000000000b8 */
/*0088*/ EXIT; /* 0x18000000001c003c */
/*0090*/ BRA 0x90; /* 0x12007ffffc1c003c */
/*0098*/ NOP; /* 0x85800000001c3c02 */
/*00a0*/ NOP; /* 0x85800000001c3c02 */
/*00a8*/ NOP; /* 0x85800000001c3c02 */
/*00b0*/ NOP; /* 0x85800000001c3c02 */
/*00b8*/ NOP; /* 0x85800000001c3c02 */
..............................
Fatbin ptx code:
================
arch = sm_35
code version = [4,2]
producer = cuda
host = linux
compile_size = 64bit
compressed
[bob@cluster1 misc]$
*************EXCERPT**************
在 ***** NOTE *****
标记的行中,PTX shr instruction (non-funnel-shift) is being converted to SASS SHF(漏斗转移)说明。