Odd behavior of cudaMemcpyAsync: 1. cudaMemcpyKind makes no difference. 2. Copy fails, but silently
I am getting familiar with a new cluster equipped with Pascal P100 GPUs + NVLink. I wrote a ping-pong program to test gpu<->gpu and gpu<->cpu bandwidths and peer-to-peer access. (I am aware the CUDA samples contain such a program, but I wanted to do it myself for better understanding.) The NVLink bandwidths appear reasonable (~35 GB/s bidirectional; the theoretical maximum is 40). However, while debugging the ping-pong I discovered some odd behavior.
First, cudaMemcpyAsync succeeds no matter which cudaMemcpyKind I specify. For example, if cudaMemcpyAsync is copying memory from host to device, it succeeds even if I pass cudaMemcpyDeviceToHost as the kind.
Second, when the host memory is not page-locked, cudaMemcpyAsync does the following:
- Copying memory from host to device appears to succeed (no segfaults or CUDA runtime errors, and the data appears to be transferred correctly).
- Copying memory from device to host fails silently: no segfault occurs, and cudaDeviceSynchronize after the memcpy returns cudaSuccess, but inspecting the data shows that the data on the gpu was not correctly transferred to the host.
Is this behavior expected? I have included a minimal working example that demonstrates it on my system (the example is not the ping-pong app; all it does is test cudaMemcpyAsync with various parameters).
The P100s have UVA enabled, so it seems plausible to me that cudaMemcpyAsync simply infers the locations of the src and dst pointers and ignores the cudaMemcpyKind argument. However, I am not sure why cudaMemcpyAsync fails to throw an error for non-page-locked host memory. I was under the impression that was a strict no-no.
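(Aside: whether UVA is actually enabled can be confirmed with cudaGetDeviceProperties; a minimal check, assuming device 0 — this snippet is separate from the MWE below.)
#include <stdio.h>
#include <cuda_runtime.h>
int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties( &prop, 0 ); // device 0 assumed
    // unifiedAddressing == 1 means the device shares a unified address space with the host
    printf( "unifiedAddressing = %d\n", prop.unifiedAddressing );
    return 0;
}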
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void checkDataDevice( int* current, int* next, int expected_current_val, int n )
{
int tid = threadIdx.x + blockIdx.x*blockDim.x;
for( int i = tid; i < n; i += blockDim.x*gridDim.x )
{
if( current[i] != expected_current_val )
printf( "Error on device: expected = %d, current[%d] = %d\n"
, expected_current_val
, i
, current[i] );
// Increment the data so the next copy is properly tested
next[i] = current[i] + 1;
}
}
void checkDataHost( int* current, int* next, int expected_current_val, int n )
{
for( int i = 0; i < n; i++ )
{
if( current[i] != expected_current_val )
printf( "Error on host: expected = %d, current[%d] = %d\n"
, expected_current_val
, i
, current[i] );
// Increment the data so the next copy is properly tested
next[i] = current[i] + 1;
}
}
int main( int argc, char** argv )
{
bool pagelocked = true;
// invoking the executable with any additional argument(s) will turn off page locked memory, i.e.,
// Run with pagelocked memory: ./a.out
// Run with ordinary malloc'd memory: ./a.out jkfdlsja
if( argc > 1 )
pagelocked = false;
int copybytes = 1e8; // Ok to use int instead of size_t for 1e8.
cudaStream_t* stream = (cudaStream_t*)malloc( sizeof(cudaStream_t) );
cudaStreamCreate( stream );
int* srcHost;
int* dstHost;
int* srcDevice;
int* dstDevice;
cudaMalloc( (void**)&srcDevice, copybytes );
cudaMalloc( (void**)&dstDevice, copybytes );
if( pagelocked )
{
printf( "Using page locked memory\n" );
cudaMallocHost( (void**)&srcHost, copybytes );
cudaMallocHost( (void**)&dstHost, copybytes );
}
else
{
printf( "Using non page locked memory\n" );
srcHost = (int*)malloc( copybytes );
dstHost = (int*)malloc( copybytes );
}
for( int i = 0; i < copybytes/sizeof(int); i++ )
srcHost[i] = 1;
cudaMemcpyKind kinds[4];
kinds[0] = cudaMemcpyHostToDevice;
kinds[1] = cudaMemcpyDeviceToHost;
kinds[2] = cudaMemcpyHostToHost;
kinds[3] = cudaMemcpyDeviceToDevice;
// Test cudaMemcpyAsync in both directions,
// iterating through all "cudaMemcpyKinds" to verify
// that they don't matter.
int expected_current_val = 1;
for( int kind = 0; kind<4; kind++ )
{
// Host to device copy
cudaMemcpyAsync( dstDevice
, srcHost
, copybytes
, kinds[kind]
, *stream );
gpuErrchk( cudaDeviceSynchronize() );
checkDataDevice<<<56*8,256>>>( dstDevice
, srcDevice
, expected_current_val
, copybytes/sizeof(int) );
expected_current_val++;
// Device to host copy
cudaMemcpyAsync( dstHost
, srcDevice
, copybytes
, kinds[kind]
, *stream );
gpuErrchk( cudaDeviceSynchronize() );
checkDataHost( dstHost
, srcHost
, expected_current_val
, copybytes/sizeof(int) );
expected_current_val++;
}
free( stream );
cudaFree( srcDevice );
cudaFree( dstDevice );
if( pagelocked )
{
cudaFreeHost( srcHost );
cudaFreeHost( dstHost );
}
else
{
free( srcHost );
free( dstHost );
}
return 0;
}
Any time you are having trouble with a CUDA code, I strongly recommend rigorous (== every call's return code is checked) proper CUDA error checking.
Your error checking has defects, and those defects are contributing to your confusion.
First, in the page-locked case, the given (mapped) pointers are accessible/valid on both the host and the device. Therefore every possible direction enumeration (H2D, D2H, D2D, H2H) is legal and valid, so no error is returned and the copy operations succeed.
In the non-page-locked case the above does not hold, so in general the indicated transfer direction had better match the transfer direction implied by inspection of the pointers. If it does not, cudaMemcpyAsync will return an error code (cudaErrorInvalidValue == 11). In your case, you are ignoring this error result. You can prove this to yourself, if you have enough patience (it would be better to flag just the first error rather than print every mismatch among 10M+ elements), by running your code under cuda-memcheck (another good thing to do any time you are having trouble with a CUDA code), or by doing proper, rigorous error checking.
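To see which direction the runtime would infer, you can ask it directly where a pointer lives. A minimal sketch using cudaPointerGetAttributes (illustrative only, not part of the original code; the field is named type on CUDA 10+, memoryType on older toolkits, and on older toolkits the call itself returns an error for a plain malloc'd pointer):
void whereIs( const void* p, const char* name )
{
    cudaPointerAttributes attr;
    cudaError_t err = cudaPointerGetAttributes( &attr, p );
    if( err != cudaSuccess )
    {
        // Older toolkits report plain malloc'd host pointers as an error here
        printf( "%s: unknown to the runtime (%s)\n", name, cudaGetErrorString(err) );
        cudaGetLastError(); // clear the non-sticky error
        return;
    }
    // CUDA 10+: cudaMemoryTypeUnregistered / Host / Device / Managed
    printf( "%s: type = %d\n", name, (int)attr.type );
}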
When a cudaMemcpyAsync operation indicates a failure, the operation did not complete successfully, so no data was copied, and that is why your data checks show mismatches. Hopefully this is now unsurprising: the expected copy never actually took place (nor did it fail "silently").
Perhaps you are under the impression that the way to catch errors on any kind of asynchronous operation is to perform a cudaDeviceSynchronize and then check for errors. That is not correct for cudaMemcpyAsync. Any error that is detectable at the point of the cudaMemcpyAsync call is returned immediately by the call itself, and will not show up as the result of a subsequent CUDA call, because this type of error is non-sticky.
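Concretely, the check belongs on the cudaMemcpyAsync call itself. A minimal sketch using the gpuErrchk macro already defined in your code:
// Broken pattern: the non-sticky error from the copy is discarded,
// and the subsequent synchronize still returns cudaSuccess
cudaMemcpyAsync( dstHost, srcDevice, copybytes, kinds[kind], *stream );
gpuErrchk( cudaDeviceSynchronize() );

// Correct pattern: capture the error where it is reported, at the call itself
gpuErrchk( cudaMemcpyAsync( dstHost, srcDevice, copybytes, kinds[kind], *stream ) );
gpuErrchk( cudaDeviceSynchronize() );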
Moral of the story:
- Do proper CUDA error checking. Rigorously.
- Run your code under cuda-memcheck.
Here is a complete example, with a trivial modification to your code to make the output "sane" in the failing case, demonstrating that an error is indicated in the failing case:
$ cat t153.cu
#include <stdio.h>
#include <stdlib.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void checkDataDevice( int* current, int* next, int expected_current_val, int n )
{
int tid = threadIdx.x + blockIdx.x*blockDim.x;
for( int i = tid; i < n; i += blockDim.x*gridDim.x )
{
if( current[i] != expected_current_val )
printf( "Error on device: expected = %d, current[%d] = %d\n"
, expected_current_val
, i
, current[i] );
// Increment the data so the next copy is properly tested
next[i] = current[i] + 1;
}
}
void checkDataHost( int* current, int* next, int expected_current_val, int n )
{
for( int i = 0; i < n; i++ )
{
if( current[i] != expected_current_val ){
printf( "Error on host: expected = %d, current[%d] = %d\n"
, expected_current_val
, i
, current[i] );
exit(0);}
// Increment the data so the next copy is properly tested
next[i] = current[i] + 1;
}
}
int main( int argc, char** argv )
{
bool pagelocked = true;
// invoking the executable with any additional argument(s) will turn off page locked memory, i.e.,
// Run with pagelocked memory: ./a.out
// Run with ordinary malloc'd memory: ./a.out jkfdlsja
if( argc > 1 )
pagelocked = false;
int copybytes = 1e8; // Ok to use int instead of size_t for 1e8.
cudaStream_t* stream = (cudaStream_t*)malloc( sizeof(cudaStream_t) );
cudaStreamCreate( stream );
int* srcHost;
int* dstHost;
int* srcDevice;
int* dstDevice;
cudaMalloc( (void**)&srcDevice, copybytes );
cudaMalloc( (void**)&dstDevice, copybytes );
if( pagelocked )
{
printf( "Using page locked memory\n" );
cudaMallocHost( (void**)&srcHost, copybytes );
cudaMallocHost( (void**)&dstHost, copybytes );
}
else
{
printf( "Using non page locked memory\n" );
srcHost = (int*)malloc( copybytes );
dstHost = (int*)malloc( copybytes );
}
for( int i = 0; i < copybytes/sizeof(int); i++ )
srcHost[i] = 1;
cudaMemcpyKind kinds[4];
kinds[0] = cudaMemcpyHostToDevice;
kinds[1] = cudaMemcpyDeviceToHost;
kinds[2] = cudaMemcpyHostToHost;
kinds[3] = cudaMemcpyDeviceToDevice;
// Test cudaMemcpyAsync in both directions,
// iterating through all "cudaMemcpyKinds" to verify
// that they don't matter.
int expected_current_val = 1;
for( int kind = 0; kind<4; kind++ )
{
// Host to device copy
cudaMemcpyAsync( dstDevice
, srcHost
, copybytes
, kinds[kind]
, *stream );
gpuErrchk( cudaDeviceSynchronize() );
checkDataDevice<<<56*8,256>>>( dstDevice
, srcDevice
, expected_current_val
, copybytes/sizeof(int) );
expected_current_val++;
// Device to host copy
cudaMemcpyAsync( dstHost
, srcDevice
, copybytes
, kinds[kind]
, *stream );
gpuErrchk( cudaDeviceSynchronize() );
checkDataHost( dstHost
, srcHost
, expected_current_val
, copybytes/sizeof(int) );
expected_current_val++;
}
free( stream );
cudaFree( srcDevice );
cudaFree( dstDevice );
if( pagelocked )
{
cudaFreeHost( srcHost );
cudaFreeHost( dstHost );
}
else
{
free( srcHost );
free( dstHost );
}
return 0;
}
$ nvcc -arch=sm_61 -o t153 t153.cu
$ cuda-memcheck ./t153 a
========= CUDA-MEMCHECK
Using non page locked memory
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpyAsync.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2ef423]
========= Host Frame:./t153 [0x489a3]
========= Host Frame:./t153 [0x2e11]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:./t153 [0x2a49]
=========
Error on host: expected = 2, current[0] = 0
========= ERROR SUMMARY: 1 error
$