使用 CUDA 图进行点对点数据传输
PeerToPeer data transfer with CUDA graphs
利用 CUDA Graphs,我想通过 NVLink 将一些数据从一个 GPU 传输到另一个 GPU。定义图和节点后,我按如下方式填充 memcpy
参数以从 GPU 0 传输到 1:
cudaMemcpy3DPeerParms memcpyParams = {0};
memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcDevice = 0;
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr =
make_cudaPitchedPtr((void *)d_inputs[0], data_size, data_count, 1);
memcpyParams.dstDevice = 1;
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr =
make_cudaPitchedPtr(d_results[1], data_size, data_count, 1);
memcpyParams.extent = make_cudaExtent(data_size, 1, 1);
// Add the first copy node with no dependency
cudaGraphAddMemcpyNode(&copy_0to1, graph, NULL, 0, &memcpyParams);
编译器不喜欢 cudaGraphAddMemcpyNode
的最后一个参数。它说:
error: argument of type "cudaMemcpy3DPeerParms *" is incompatible with parameter of type "const cudaMemcpy3DParms *"
对这两个结构体做 static_cast 行不通,而且它们的成员也不相同。以下摘自驱动程序头文件:
/**
* CUDA 3D memory copying parameters
*/
struct __device_builtin__ cudaMemcpy3DParms
{
cudaArray_t srcArray; /**< Source memory address */
struct cudaPos srcPos; /**< Source position offset */
struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */
cudaArray_t dstArray; /**< Destination memory address */
struct cudaPos dstPos; /**< Destination position offset */
struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */
struct cudaExtent extent; /**< Requested memory copy size */
enum cudaMemcpyKind kind; /**< Type of transfer */
};
/**
* CUDA 3D cross-device memory copying parameters
*/
struct __device_builtin__ cudaMemcpy3DPeerParms
{
cudaArray_t srcArray; /**< Source memory address */
struct cudaPos srcPos; /**< Source position offset */
struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */
int srcDevice; /**< Source device */
cudaArray_t dstArray; /**< Destination memory address */
struct cudaPos dstPos; /**< Destination position offset */
struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */
int dstDevice; /**< Destination device */
struct cudaExtent extent; /**< Requested memory copy size */
};
在我看来,没有像 cudaGraphAddMemcpyNode
这样接受 cudaMemcpy3DPeerParms
参数的函数。那么,如何在 CUDA 图中使用 NVLink 将一段数据从一个 GPU 直接发送到其对等 GPU?
正如 Abator 评论的那样,使用 cudaMemcpyDefault
和 cudaMemcpy3DParms
有效:
cudaMemcpy3DParms memcpyParams = {0};
memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr =
make_cudaPitchedPtr((void *)d_inputs[0], data_size, data_count, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr =
make_cudaPitchedPtr(d_results[1], data_size, data_count, 1);
memcpyParams.extent = make_cudaExtent(data_size, 1, 1);
memcpyParams.kind = cudaMemcpyDefault;
如 driver_types.h
所述:使用 cudaMemcpyDefault 时,传输方向根据指针值推断,这需要统一虚拟寻址(UVA)。
利用 CUDA Graphs,我想通过 NVLink 将一些数据从一个 GPU 传输到另一个 GPU。定义图和节点后,我按如下方式填充 memcpy
参数以从 GPU 0 传输到 1:
cudaMemcpy3DPeerParms memcpyParams = {0};
memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcDevice = 0;
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr =
make_cudaPitchedPtr((void *)d_inputs[0], data_size, data_count, 1);
memcpyParams.dstDevice = 1;
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr =
make_cudaPitchedPtr(d_results[1], data_size, data_count, 1);
memcpyParams.extent = make_cudaExtent(data_size, 1, 1);
// Add the first copy node with no dependency
cudaGraphAddMemcpyNode(&copy_0to1, graph, NULL, 0, &memcpyParams);
编译器不喜欢 cudaGraphAddMemcpyNode
的最后一个参数。它说:
error: argument of type "cudaMemcpy3DPeerParms *" is incompatible with parameter of type "const cudaMemcpy3DParms *"
对这两个结构体做 static_cast 行不通,而且它们的成员也不相同。以下摘自驱动程序头文件:
/**
* CUDA 3D memory copying parameters
*/
struct __device_builtin__ cudaMemcpy3DParms
{
cudaArray_t srcArray; /**< Source memory address */
struct cudaPos srcPos; /**< Source position offset */
struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */
cudaArray_t dstArray; /**< Destination memory address */
struct cudaPos dstPos; /**< Destination position offset */
struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */
struct cudaExtent extent; /**< Requested memory copy size */
enum cudaMemcpyKind kind; /**< Type of transfer */
};
/**
* CUDA 3D cross-device memory copying parameters
*/
struct __device_builtin__ cudaMemcpy3DPeerParms
{
cudaArray_t srcArray; /**< Source memory address */
struct cudaPos srcPos; /**< Source position offset */
struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */
int srcDevice; /**< Source device */
cudaArray_t dstArray; /**< Destination memory address */
struct cudaPos dstPos; /**< Destination position offset */
struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */
int dstDevice; /**< Destination device */
struct cudaExtent extent; /**< Requested memory copy size */
};
在我看来,没有像 cudaGraphAddMemcpyNode
这样接受 cudaMemcpy3DPeerParms
参数的函数。那么,如何在 CUDA 图中使用 NVLink 将一段数据从一个 GPU 直接发送到其对等 GPU?
正如 Abator 评论的那样,使用 cudaMemcpyDefault
和 cudaMemcpy3DParms
有效:
cudaMemcpy3DParms memcpyParams = {0};
memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr =
make_cudaPitchedPtr((void *)d_inputs[0], data_size, data_count, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr =
make_cudaPitchedPtr(d_results[1], data_size, data_count, 1);
memcpyParams.extent = make_cudaExtent(data_size, 1, 1);
memcpyParams.kind = cudaMemcpyDefault;
如 driver_types.h
所述:使用 cudaMemcpyDefault 时,传输方向根据指针值推断,这需要统一虚拟寻址(UVA)。