Copying array of pointers into device memory and back (CUDA)
I am trying to use the cublas function cublasSgemmBatched in my toy example. In this example I first allocate 2D arrays: h_AA and h_BB of size [6][5], and h_CC of size [6][1]. After that I copy them to the device, run cublasSgemmBatched, and try to copy the array d_CC back to the host array h_CC. However, I get an error (cudaErrorLaunchFailure) on the device-to-host copy, and I am not sure that I copied the arrays to the device correctly:
int main(){
    cublasHandle_t handle;
    cudaError_t cudaerr;
    cudaEvent_t start, stop;
    cublasStatus_t stat;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    float **h_AA, **h_BB, **h_CC;
    h_AA = new float*[6];
    h_BB = new float*[6];
    h_CC = new float*[6];
    for (int i = 0; i < 6; i++){
        h_AA[i] = new float[5];
        h_BB[i] = new float[5];
        h_CC[i] = new float[1];
        for (int j = 0; j < 5; j++){
            h_AA[i][j] = j;
            h_BB[i][j] = j;
        }
        h_CC[i][0] = 1;
    }
    float **d_AA, **d_BB, **d_CC;
    cudaMalloc(&d_AA, 6 * sizeof(float*));
    cudaMalloc(&d_BB, 6 * sizeof(float*));
    cudaMalloc(&d_CC, 6 * sizeof(float*));
    cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    stat = cublasCreate(&handle);
    stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
                              (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
    cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);
    cublasDestroy(handle);
}
So this code runs, but the last cudaerr returns cudaErrorLaunchFailure. I was trying to follow this sample code on Github.
Thanks.
P.S. What I don't understand is what sizeof(float*) is, and how cudaMalloc knows how much memory each array needs (here I only specify the size of one dimension).
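(Editor's side illustration on the P.S., not part of the original post: sizeof(float*) is just the byte size of one pointer, and cudaMalloc has no notion of array shape; it allocates exactly the number of bytes it is asked for, so 6 * sizeof(float*) is only room for six pointers, not for the rows they will eventually point to. A minimal sketch:)

#include <cstdio>
#include <cuda_runtime.h>

int main(){
    // sizeof(float*) is the size of one pointer, typically 8 bytes on a 64-bit build.
    printf("sizeof(float*) = %zu bytes\n", sizeof(float*));

    // cudaMalloc only sees a raw byte count: this allocates space for 6 pointers,
    // not for 6 rows of 5 floats. The rows must be allocated separately.
    float **d_AA = nullptr;
    cudaError_t err = cudaMalloc((void**)&d_AA, 6 * sizeof(float*));
    printf("cudaMalloc(6 * sizeof(float*)) -> %s (%zu bytes)\n",
           cudaGetErrorString(err), 6 * sizeof(float*));

    cudaFree(d_AA);
    return 0;
}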
UPDATE: I did it!!:
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;

float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}

// host arrays of pointers to device arrays
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}

// device arrays of pointers to device arrays
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);

stat = cublasCreate(&handle);
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
                          (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);

// gather the per-batch results through the host array of device pointers
for (int i = 0; i < 6; i++)
    cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
So, I found the answer (thanks to @Robert Crovella): in order to create a device array of pointers to device arrays (for the batched functions), one should first create a host array of pointers to device arrays and then copy it into the device array of pointers to device arrays. The same goes for transferring back to the host: one should use an intermediate host array of pointers to device arrays.
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;

float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}

// host arrays of pointers to device arrays
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}

// device arrays of pointers to device arrays
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);

stat = cublasCreate(&handle);
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
                          (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);

// gather the per-batch results through the host array of device pointers
for (int i = 0; i < 6; i++)
    cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
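(Editor's note: for completeness, here is an added sketch of the same pattern with error checking and cleanup; it is not code from the original post. The CHECK_CUDA macro, the batch/k constants, and the printed sanity check are my own additions; the allocation and copy pattern is the one described above.)

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Hypothetical helper: abort on any CUDA runtime error.
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t e = (call);                                               \
        if (e != cudaSuccess) {                                               \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                       \
                    cudaGetErrorString(e), __FILE__, __LINE__);               \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

int main(){
    const int batch = 6, k = 5;
    const float alpha = 1.0f, beta = 0.0f;

    float h_A[5] = {0, 1, 2, 3, 4};
    float h_B[5] = {0, 1, 2, 3, 4};
    float h_C[6];

    // Host arrays of pointers to device arrays.
    float **h_AA = (float**)malloc(batch * sizeof(float*));
    float **h_BB = (float**)malloc(batch * sizeof(float*));
    float **h_CC = (float**)malloc(batch * sizeof(float*));
    for (int i = 0; i < batch; i++){
        CHECK_CUDA(cudaMalloc((void**)&h_AA[i], k * sizeof(float)));
        CHECK_CUDA(cudaMalloc((void**)&h_BB[i], k * sizeof(float)));
        CHECK_CUDA(cudaMalloc((void**)&h_CC[i], sizeof(float)));
        CHECK_CUDA(cudaMemcpy(h_AA[i], h_A, k * sizeof(float), cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(h_BB[i], h_B, k * sizeof(float), cudaMemcpyHostToDevice));
    }

    // Device arrays of pointers to device arrays.
    float **d_AA, **d_BB, **d_CC;
    CHECK_CUDA(cudaMalloc((void**)&d_AA, batch * sizeof(float*)));
    CHECK_CUDA(cudaMalloc((void**)&d_BB, batch * sizeof(float*)));
    CHECK_CUDA(cudaMalloc((void**)&d_CC, batch * sizeof(float*)));
    CHECK_CUDA(cudaMemcpy(d_AA, h_AA, batch * sizeof(float*), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_BB, h_BB, batch * sizeof(float*), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_CC, h_CC, batch * sizeof(float*), cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE;
    if (cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, k, &alpha,
                           (const float**)d_AA, 1, (const float**)d_BB, k,
                           &beta, d_CC, 1, batch) != CUBLAS_STATUS_SUCCESS)
        fprintf(stderr, "cublasSgemmBatched failed\n");

    // Gather results through the intermediate host array of device pointers.
    for (int i = 0; i < batch; i++)
        CHECK_CUDA(cudaMemcpy(&h_C[i], h_CC[i], sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i < batch; i++)
        printf("h_C[%d] = %f\n", i, h_C[i]);   // expected 0+1+4+9+16 = 30

    // Cleanup.
    for (int i = 0; i < batch; i++){
        cudaFree(h_AA[i]); cudaFree(h_BB[i]); cudaFree(h_CC[i]);
    }
    cudaFree(d_AA); cudaFree(d_BB); cudaFree(d_CC);
    free(h_AA); free(h_BB); free(h_CC);
    cublasDestroy(handle);
    return 0;
}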