Intercepting CUDA calls
I am trying to intercept cudaMemcpy calls from the pytorch library for analysis. I noticed that NVIDIA has a cuHook example in the CUDA toolkit samples; however, that sample requires modifying the source code of the application itself, which I cannot do in this case. So is there a way to write a hook that intercepts CUDA calls without modifying the application's source code?
CUDA runtime API calls can be hooked (on linux) using the "LD_PRELOAD trick", as long as the application being run is dynamically linked against the CUDA runtime library (libcudart.so). Note that nvcc links the runtime statically by default, which is why the test case below is built with -cudart shared.
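To check that precondition for a given binary, you can inspect its dynamic dependencies; ./your_app below is just a placeholder for the executable or shared library you care about:

$ ldd ./your_app | grep libcudart

If libcudart.so shows up in the output, the preload approach should apply.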
Here is a simple example on linux:
$ cat mylib.cpp
#include <stdio.h>
#include <unistd.h>
#include <dlfcn.h>
#include <cuda_runtime.h>
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
{
cudaError_t (*lcudaMemcpy) ( void*, const void*, size_t, cudaMemcpyKind) = (cudaError_t (*) ( void* , const void* , size_t , cudaMemcpyKind ))dlsym(RTLD_NEXT, "cudaMemcpy");
printf("cudaMemcpy hooked\n");
return lcudaMemcpy( dst, src, count, kind );
}
cudaError_t cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t str )
{
cudaError_t (*lcudaMemcpyAsync) ( void*, const void*, size_t, cudaMemcpyKind, cudaStream_t) = (cudaError_t (*) ( void* , const void* , size_t , cudaMemcpyKind, cudaStream_t ))dlsym(RTLD_NEXT, "cudaMemcpyAsync");
printf("cudaMemcpyAsync hooked\n");
return lcudaMemcpyAsync( dst, src, count, kind, str );
}
$ g++ -I/usr/local/cuda/include -fPIC -shared -o libmylib.so mylib.cpp -ldl -L/usr/local/cuda/lib64 -lcudart
$ cat t1.cu
#include <stdio.h>
int main(){
int a, *d_a;
cudaMalloc(&d_a, sizeof(d_a[0]));
cudaMemcpy(d_a, &a, sizeof(a), cudaMemcpyHostToDevice);
cudaStream_t str;
cudaStreamCreate(&str);
cudaMemcpyAsync(d_a, &a, sizeof(a), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_a, &a, sizeof(a), cudaMemcpyHostToDevice, str);
cudaDeviceSynchronize();
}
$ nvcc -o t1 t1.cu -cudart shared
$ LD_LIBRARY_PATH=/usr/local/cuda/lib64 LD_PRELOAD=./libmylib.so cuda-memcheck ./t1
========= CUDA-MEMCHECK
cudaMemcpy hooked
cudaMemcpyAsync hooked
cudaMemcpyAsync hooked
========= ERROR SUMMARY: 0 errors
$
(CentOS 7, CUDA 10.2)
A simple test with pytorch seems to indicate that it works:
$ docker run --gpus all -it nvcr.io/nvidia/pytorch:20.08-py3
...
Status: Downloaded newer image for nvcr.io/nvidia/pytorch:20.08-py3
=============
== PyTorch ==
=============
NVIDIA Release 20.08 (build 15516749)
PyTorch Version 1.7.0a0+8deb4fe
Container image Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2014-2020 Facebook Inc.
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Copyright (c) 2015 Google Inc.
Copyright (c) 2015 Yangqing Jia
Copyright (c) 2013-2016 The Caffe contributors
All rights reserved.
Various files include modifications (c) NVIDIA CORPORATION. All rights reserved.
NVIDIA modifications are covered by the license terms that apply to the underlying project or file.
NOTE: MOFED driver for multi-node communication was not detected.
Multi-node communication performance may be reduced.
NOTE: The SHMEM allocation limit is set to the default of 64MB. This may be
insufficient for PyTorch. NVIDIA recommends the use of the following flags:
nvidia-docker run --ipc=host ...
...
root@946934df529b:/workspace# cat mylib.cpp
#include <stdio.h>
#include <unistd.h>
#include <dlfcn.h>
#include <cuda_runtime.h>
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
{
cudaError_t (*lcudaMemcpy) ( void*, const void*, size_t, cudaMemcpyKind) = (cudaError_t (*) ( void* , const void* , size_t , cudaMemcpyKind ))dlsym(RTLD_NEXT, "cudaMemcpy");
printf("cudaMemcpy hooked\n");
return lcudaMemcpy( dst, src, count, kind );
}
cudaError_t cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t str )
{
cudaError_t (*lcudaMemcpyAsync) ( void*, const void*, size_t, cudaMemcpyKind, cudaStream_t) = (cudaError_t (*) ( void* , const void* , size_t , cudaMemcpyKind, cudaStream_t ))dlsym(RTLD_NEXT, "cudaMemcpyAsync");
printf("cudaMemcpyAsync hooked\n");
return lcudaMemcpyAsync( dst, src, count, kind, str );
}
root@946934df529b:/workspace# g++ -I/usr/local/cuda/include -fPIC -shared -o libmylib.so mylib.cpp -ldl -L/usr/local/cuda/lib64 -lcudart
root@946934df529b:/workspace# cat tt.py
import torch
device = torch.cuda.current_device()
x = torch.randn(1024, 1024).to(device)
y = torch.randn(1024, 1024).to(device)
z = torch.matmul(x, y)
root@946934df529b:/workspace# LD_LIBRARY_PATH=/usr/local/cuda/lib64 LD_PRELOAD=./libmylib.so python tt.py
cudaMemcpyAsync hooked
cudaMemcpyAsync hooked
root@946934df529b:/workspace#
(using the NVIDIA NGC PyTorch container)
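Since the goal here is profiling, the same hook can be extended to collect statistics rather than just print a line per call. The following is only a sketch of one possible variant (it is not part of the original example): it caches the dlsym lookup in a function-local static and accumulates the call count and total bytes copied, printing a summary at process exit. Names such as mylib_profile.cpp, g_copyCalls and g_copyBytes are my own placeholders.

// mylib_profile.cpp -- sketch of a profiling variant of the hook above
#include <stdio.h>
#include <dlfcn.h>
#include <atomic>
#include <cuda_runtime.h>

// totals accumulated across all hooked calls
static std::atomic<unsigned long long> g_copyCalls{0};
static std::atomic<unsigned long long> g_copyBytes{0};

// print a summary when the preloaded library is torn down at process exit
__attribute__((destructor)) static void report()
{
    fprintf(stderr, "hooked cudaMemcpy/cudaMemcpyAsync calls: %llu, bytes: %llu\n",
            (unsigned long long)g_copyCalls, (unsigned long long)g_copyBytes);
}

cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
{
    // look up the real symbol once and cache it
    static auto real = (cudaError_t (*)(void*, const void*, size_t, cudaMemcpyKind))
                           dlsym(RTLD_NEXT, "cudaMemcpy");
    g_copyCalls++;
    g_copyBytes += count;
    return real(dst, src, count, kind);
}

cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream)
{
    static auto real = (cudaError_t (*)(void*, const void*, size_t, cudaMemcpyKind, cudaStream_t))
                           dlsym(RTLD_NEXT, "cudaMemcpyAsync");
    g_copyCalls++;
    g_copyBytes += count;
    return real(dst, src, count, kind, stream);
}

It builds and is preloaded exactly like libmylib.so above (g++ -I/usr/local/cuda/include -fPIC -shared ... -ldl -L/usr/local/cuda/lib64 -lcudart), and the summary should appear on stderr when the python process exits.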