如何找出哪个线程正在 GPU 的哪个核心上执行?
How can I find out which thread is getting executed on which core of the GPU?
我正在用 CUDA 开发一些简单的程序,我想知道哪个线程在 GPU 的哪个核心上执行。我正在使用 Visual Studio 2012,显卡是 NVIDIA GeForce 610M。
是否可以这样做?我已经在 Google 上搜索了很多,但一无所获。
编辑 :
我知道这个问题很奇怪,但我的大学项目指南要求我这样做。
结合 PTX 手册(PTX manual)和一个简单的内联 PTX 封装(inline-PTX wrapper)中的信息,以下函数应该可以满足您的需求:
/*
 * Returns the ID of the multiprocessor (SM) on which the calling thread is
 * currently executing, read from the PTX special register %smid.
 * The value is volatile: it may change during program execution.
 */
static __device__ __inline__ uint32_t __mysmid()
{
    uint32_t sm_index;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(sm_index));
    return sm_index;
}
以上函数将告诉您(线程)代码正在哪个多处理器上执行。
/*
 * Returns the ID of the warp the calling thread belongs to, read from the
 * PTX special register %warpid.
 * The value is volatile: it may change during program execution.
 */
static __device__ __inline__ uint32_t __mywarpid()
{
    uint32_t warp_index;
    asm volatile("mov.u32 %0, %%warpid;" : "=r"(warp_index));
    return warp_index;
}
上面的函数会告诉你(线程)代码属于哪个warp。
/*
 * Returns the warp lane of the calling thread, read from the PTX special
 * register %laneid.
 */
static __device__ __inline__ uint32_t __mylaneid()
{
    uint32_t lane_index;
    asm volatile("mov.u32 %0, %%laneid;" : "=r"(lane_index));
    return lane_index;
}
上面的函数会告诉你(线程)代码属于哪个warp lane。
请注意,在动态并行的情况下(可能还有调试等其他场景),此信息是易变的(volatile),可能在程序执行期间发生变化。
请参阅 programming guide 以了解 multiprocessor、warp 等术语的定义。
这是一个完整的示例:
$ cat t646.cu
#include <stdio.h>
#include <stdint.h>
/*
 * Returns the ID of the multiprocessor (SM) on which the calling thread is
 * currently executing, read from the PTX special register %smid.
 * The value is volatile: it may change during program execution.
 */
static __device__ __inline__ uint32_t __mysmid()
{
    uint32_t sm_index;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(sm_index));
    return sm_index;
}
/*
 * Returns the ID of the warp the calling thread belongs to, read from the
 * PTX special register %warpid.
 * The value is volatile: it may change during program execution.
 */
static __device__ __inline__ uint32_t __mywarpid()
{
    uint32_t warp_index;
    asm volatile("mov.u32 %0, %%warpid;" : "=r"(warp_index));
    return warp_index;
}
/*
 * Returns the warp lane of the calling thread, read from the PTX special
 * register %laneid.
 */
static __device__ __inline__ uint32_t __mylaneid()
{
    uint32_t lane_index;
    asm volatile("mov.u32 %0, %%laneid;" : "=r"(lane_index));
    return lane_index;
}
/*
 * Diagnostic kernel: every thread prints its flattened global index together
 * with the SM ID, warp ID and warp lane reported by __mysmid(), __mywarpid()
 * and __mylaneid().
 *
 * The index is computed from the x dimension only, so a 1D grid of 1D blocks
 * is expected. Device printf is slow and intended for debugging only.
 */
__global__ void mykernel(){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  // The three helpers return uint32_t, so they are printed with %u (not %d)
  // to keep the format specifiers consistent with the argument types.
  printf("I am thread %d, my SM ID is %u, my warp ID is %u, and my warp lane is %u\n", idx, __mysmid(), __mywarpid(), __mylaneid());
}
/*
 * Host entry point: launches mykernel with 4 blocks of 4 threads, waits for
 * it to finish, and reports any CUDA error.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(){
  mykernel<<<4,4>>>();
  // A kernel launch returns no status itself; launch failures (bad
  // configuration, no device code for this GPU architecture, ...) are only
  // visible through cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "mykernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  // Wait for the kernel to complete; this also surfaces errors that occurred
  // during execution and lets the device-side printf output be flushed.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "mykernel execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}
$ nvcc -arch=sm_20 -o t646 t646.cu
$ ./t646
I am thread 0, my SM ID is 0, my warp ID is 0, and my warp lane is 0
I am thread 1, my SM ID is 0, my warp ID is 0, and my warp lane is 1
I am thread 2, my SM ID is 0, my warp ID is 0, and my warp lane is 2
I am thread 3, my SM ID is 0, my warp ID is 0, and my warp lane is 3
I am thread 8, my SM ID is 3, my warp ID is 0, and my warp lane is 0
I am thread 9, my SM ID is 3, my warp ID is 0, and my warp lane is 1
I am thread 10, my SM ID is 3, my warp ID is 0, and my warp lane is 2
I am thread 11, my SM ID is 3, my warp ID is 0, and my warp lane is 3
I am thread 12, my SM ID is 4, my warp ID is 0, and my warp lane is 0
I am thread 13, my SM ID is 4, my warp ID is 0, and my warp lane is 1
I am thread 14, my SM ID is 4, my warp ID is 0, and my warp lane is 2
I am thread 15, my SM ID is 4, my warp ID is 0, and my warp lane is 3
I am thread 4, my SM ID is 1, my warp ID is 0, and my warp lane is 0
I am thread 5, my SM ID is 1, my warp ID is 0, and my warp lane is 1
I am thread 6, my SM ID is 1, my warp ID is 0, and my warp lane is 2
I am thread 7, my SM ID is 1, my warp ID is 0, and my warp lane is 3
$
请注意,以上输出会因您运行时所用的 GPU 类型而异。不要期望你的输出和上面的完全一样。
我正在用 CUDA 开发一些简单的程序,我想知道哪个线程在 GPU 的哪个核心上执行。我正在使用 Visual Studio 2012,显卡是 NVIDIA GeForce 610M。
是否可以这样做?我已经在 Google 上搜索了很多,但一无所获。
编辑 :
我知道这个问题很奇怪,但我的大学项目指南要求我这样做。
结合 PTX 手册(PTX manual)和一个简单的内联 PTX 封装(inline-PTX wrapper)中的信息,以下函数应该可以满足您的需求:
/*
 * Returns the ID of the multiprocessor (SM) on which the calling thread is
 * currently executing, read from the PTX special register %smid.
 * The value is volatile: it may change during program execution.
 */
static __device__ __inline__ uint32_t __mysmid()
{
    uint32_t sm_index;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(sm_index));
    return sm_index;
}
以上函数将告诉您(线程)代码正在哪个多处理器上执行。
/*
 * Returns the ID of the warp the calling thread belongs to, read from the
 * PTX special register %warpid.
 * The value is volatile: it may change during program execution.
 */
static __device__ __inline__ uint32_t __mywarpid()
{
    uint32_t warp_index;
    asm volatile("mov.u32 %0, %%warpid;" : "=r"(warp_index));
    return warp_index;
}
上面的函数会告诉你(线程)代码属于哪个warp。
/*
 * Returns the warp lane of the calling thread, read from the PTX special
 * register %laneid.
 */
static __device__ __inline__ uint32_t __mylaneid()
{
    uint32_t lane_index;
    asm volatile("mov.u32 %0, %%laneid;" : "=r"(lane_index));
    return lane_index;
}
上面的函数会告诉你(线程)代码属于哪个warp lane。
请注意,在动态并行的情况下(可能还有调试等其他场景),此信息是易变的(volatile),可能在程序执行期间发生变化。
请参阅 programming guide 以了解 multiprocessor、warp 等术语的定义。
这是一个完整的示例:
$ cat t646.cu
#include <stdio.h>
#include <stdint.h>
/*
 * Returns the ID of the multiprocessor (SM) on which the calling thread is
 * currently executing, read from the PTX special register %smid.
 * The value is volatile: it may change during program execution.
 */
static __device__ __inline__ uint32_t __mysmid()
{
    uint32_t sm_index;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(sm_index));
    return sm_index;
}
/*
 * Returns the ID of the warp the calling thread belongs to, read from the
 * PTX special register %warpid.
 * The value is volatile: it may change during program execution.
 */
static __device__ __inline__ uint32_t __mywarpid()
{
    uint32_t warp_index;
    asm volatile("mov.u32 %0, %%warpid;" : "=r"(warp_index));
    return warp_index;
}
/*
 * Returns the warp lane of the calling thread, read from the PTX special
 * register %laneid.
 */
static __device__ __inline__ uint32_t __mylaneid()
{
    uint32_t lane_index;
    asm volatile("mov.u32 %0, %%laneid;" : "=r"(lane_index));
    return lane_index;
}
/*
 * Diagnostic kernel: every thread prints its flattened global index together
 * with the SM ID, warp ID and warp lane reported by __mysmid(), __mywarpid()
 * and __mylaneid().
 *
 * The index is computed from the x dimension only, so a 1D grid of 1D blocks
 * is expected. Device printf is slow and intended for debugging only.
 */
__global__ void mykernel(){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  // The three helpers return uint32_t, so they are printed with %u (not %d)
  // to keep the format specifiers consistent with the argument types.
  printf("I am thread %d, my SM ID is %u, my warp ID is %u, and my warp lane is %u\n", idx, __mysmid(), __mywarpid(), __mylaneid());
}
/*
 * Host entry point: launches mykernel with 4 blocks of 4 threads, waits for
 * it to finish, and reports any CUDA error.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(){
  mykernel<<<4,4>>>();
  // A kernel launch returns no status itself; launch failures (bad
  // configuration, no device code for this GPU architecture, ...) are only
  // visible through cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "mykernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  // Wait for the kernel to complete; this also surfaces errors that occurred
  // during execution and lets the device-side printf output be flushed.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "mykernel execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}
$ nvcc -arch=sm_20 -o t646 t646.cu
$ ./t646
I am thread 0, my SM ID is 0, my warp ID is 0, and my warp lane is 0
I am thread 1, my SM ID is 0, my warp ID is 0, and my warp lane is 1
I am thread 2, my SM ID is 0, my warp ID is 0, and my warp lane is 2
I am thread 3, my SM ID is 0, my warp ID is 0, and my warp lane is 3
I am thread 8, my SM ID is 3, my warp ID is 0, and my warp lane is 0
I am thread 9, my SM ID is 3, my warp ID is 0, and my warp lane is 1
I am thread 10, my SM ID is 3, my warp ID is 0, and my warp lane is 2
I am thread 11, my SM ID is 3, my warp ID is 0, and my warp lane is 3
I am thread 12, my SM ID is 4, my warp ID is 0, and my warp lane is 0
I am thread 13, my SM ID is 4, my warp ID is 0, and my warp lane is 1
I am thread 14, my SM ID is 4, my warp ID is 0, and my warp lane is 2
I am thread 15, my SM ID is 4, my warp ID is 0, and my warp lane is 3
I am thread 4, my SM ID is 1, my warp ID is 0, and my warp lane is 0
I am thread 5, my SM ID is 1, my warp ID is 0, and my warp lane is 1
I am thread 6, my SM ID is 1, my warp ID is 0, and my warp lane is 2
I am thread 7, my SM ID is 1, my warp ID is 0, and my warp lane is 3
$
请注意,以上输出会因您运行时所用的 GPU 类型而异。不要期望你的输出和上面的完全一样。