cudaMemcpy 中从设备到主机的无效参数错误
invalid argument error in cudaMemcpy from device to host
我是 CUDA/GPU 的新手,我在将数据从我的设备复制回主机时遇到问题。我正在使用 CUDA 工具包 6.5 为 Jetson TK1 开发。它构建成功,但在运行时出错。我的代码如下:
//main.cu
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size);
int main () {
int data_length = 1024000;
const int length=512;
const size_t size= length;
double signalA[length], signalB[length], signalC[length];
for (int i=0; i<data_length; i++)
{
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
if(i==0)
{
for(int k=0; k<length; k++)
{
signalA[k]=v_ia[k];
signalB[k]=v_ib[k];
signalC[k]=v_ic[k];
}
i=length-1;
}
else
{
//allocate memory in GPU and kernel call for phase A
allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalA[length-1]=v_ia[i];
//allocate memory in GPU and kernel call for phase B
allocate(d_inputCurrentIb, signalB, d_outputCurrentIb, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalB, d_outputCurrentIb, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalB[length-1]=v_ib[i];
//allocate memory in GPU and kernel call for phase C;
allocate(d_inputCurrentIc, signalC, d_outputCurrentIc, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalC, d_outputCurrentIc, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalC[length-1]=v_ic[i];
//memory cleaning
checkCudaErrors(cudaFree(d_inputCurrentIa));
checkCudaErrors(cudaFree(d_inputCurrentIb));
checkCudaErrors(cudaFree(d_inputCurrentIc));
checkCudaErrors(cudaFree(d_outputCurrentIa));
checkCudaErrors(cudaFree(d_outputCurrentIb));
checkCudaErrors(cudaFree(d_outputCurrentIc));
}
而且我的内核和函数都很简单,它们只是每次将数组元素向左移动:
// Kernel: shifts the input array one element to the left (d_out[i] = d_in[i+1]).
// Launched as a single block; assumes blockDim.x >= size and size <= 512 (the
// capacity of the shared staging buffer). d_out[size-1] is never written here —
// the host code memsets the output buffer to 0 before the launch.
__global__ void allocate_kernel(double* const d_in, double* const d_out, const size_t size) {
__shared__ double shared[512]; // block-local staging buffer; uninitialized until written below
int tid = threadIdx.x;
if(tid < size)
shared[tid] = d_in[tid];
__syncthreads(); // barrier: all writes to shared[] must complete before reading a neighbor's slot
if(tid < size-1)
d_out[tid]=shared[tid+1]; // left shift: each thread reads its right neighbor's element
__syncthreads(); // no shared reads follow; this trailing barrier is redundant but harmless
}
// Buggy version from the question (kept as-is — the answer below explains the fix).
// Allocates device buffers, copies `signal` to the device, and launches the
// left-shift kernel.
// BUG: d_inputCurrent and d_outputCurrent are received BY VALUE, so the device
// addresses written by cudaMalloc below never reach the caller — the caller's
// pointers remain uninitialized, and its later cudaMemcpy/cudaFree on them fail
// with "invalid argument".
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size) {
const dim3 blockSize(512);
const dim3 gridSize(1);
checkCudaErrors(cudaFree(0)); // no-op free; commonly used to force CUDA context creation
// NOTE(review): taking the address of a `double* const` parameter and letting
// cudaMalloc overwrite it casts away the const — another sign the const
// qualifiers here are wrong (see the answer's remarks).
checkCudaErrors(cudaMalloc((void **)&d_inputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMalloc((void **)&d_outputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMemset(d_outputCurrent, 0, sizeof(double) * size)); // pre-zero so the unwritten last element reads as 0
checkCudaErrors(cudaMemcpy(d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));
allocate_kernel<<<gridSize, blockSize>>>(d_inputCurrent, d_outputCurrent, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); // surface async kernel errors here
}
这是我博士论文的一小部分,我正在用这段代码练习 CUDA,我知道现在它没有那么大的意义,但我无法继续前进,因为我被这个问题困住了。任何帮助将不胜感激,提前致谢。
在 C 中,您不能将指针按值传递给函数、让该函数在内部修改该指针,然后期望对指针的修改体现在调用环境中:
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
...
//allocate memory in GPU and kernel call for phase A
// at this point, d_inputCurrentIa and d_outputCurrentIa are pointing to nothing
allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
// allocate modified those pointers internally, but the modified values don't show up here
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
// therefore you will get an error here, because d_outputCurrentIa still points to nothing
有多种方法可以完成这项工作。一种方法是传递要修改和使用的指针的 地址:
void allocate(double** d_inputCurrent, double* signal, double **d_outputCurrent, const size_t size);
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
...
//allocate memory in GPU and kernel call for phase A
allocate(&d_inputCurrentIa, signalA, &d_outputCurrentIa, size);
...
// Fixed version from the answer: the device pointers are now passed by ADDRESS
// (double**), so the cudaMalloc results propagate back to the caller, which can
// then safely cudaMemcpy from / cudaFree the returned device buffers.
void allocate(double** d_inputCurrent, double* signal, double** d_outputCurrent, const size_t size) {
const dim3 blockSize(512);
const dim3 gridSize(1);
checkCudaErrors(cudaFree(0)); // no-op free; commonly used to force CUDA context creation
// cudaMalloc writes the device address through the caller-supplied pointer-to-pointer
checkCudaErrors(cudaMalloc((void **)d_inputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMalloc((void **)d_outputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMemset(*d_outputCurrent, 0, sizeof(double) * size)); // pre-zero so the unwritten last element reads as 0
checkCudaErrors(cudaMemcpy(*d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));
allocate_kernel<<<gridSize, blockSize>>>(*d_inputCurrent, *d_outputCurrent, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); // surface async kernel errors here
}
备注:
不确定为什么要把这些指针标记为 const。它们在任何意义上都不是 const(该函数既会修改指针本身的值,也会修改它所指向的数据)。
以上代码是在浏览器里直接写的,未经编译测试,您可能还需要修复一些其他问题。由于您没有提供可运行的完整代码,我也没有提供完整的代码,但这应该是一个路线图。
在函数中分配可能会导致内存泄漏。你可能想考虑一下。如果您要重复使用或创建大量指针,请务必制定释放这些指针的计划。
我是 CUDA/GPU 的新手,我在将数据从我的设备复制回主机时遇到问题。我正在使用 CUDA 工具包 6.5 为 Jetson TK1 开发。它构建成功,但在运行时出错。我的代码如下:
//main.cu
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size);
int main () {
int data_length = 1024000;
const int length=512;
const size_t size= length;
double signalA[length], signalB[length], signalC[length];
for (int i=0; i<data_length; i++)
{
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
if(i==0)
{
for(int k=0; k<length; k++)
{
signalA[k]=v_ia[k];
signalB[k]=v_ib[k];
signalC[k]=v_ic[k];
}
i=length-1;
}
else
{
//allocate memory in GPU and kernel call for phase A
allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalA[length-1]=v_ia[i];
//allocate memory in GPU and kernel call for phase B
allocate(d_inputCurrentIb, signalB, d_outputCurrentIb, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalB, d_outputCurrentIb, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalB[length-1]=v_ib[i];
//allocate memory in GPU and kernel call for phase C;
allocate(d_inputCurrentIc, signalC, d_outputCurrentIc, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalC, d_outputCurrentIc, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalC[length-1]=v_ic[i];
//memory cleaning
checkCudaErrors(cudaFree(d_inputCurrentIa));
checkCudaErrors(cudaFree(d_inputCurrentIb));
checkCudaErrors(cudaFree(d_inputCurrentIc));
checkCudaErrors(cudaFree(d_outputCurrentIa));
checkCudaErrors(cudaFree(d_outputCurrentIb));
checkCudaErrors(cudaFree(d_outputCurrentIc));
}
而且我的内核和函数都很简单,它们只是每次将数组元素向左移动:
// Kernel: shifts the input array one element to the left (d_out[i] = d_in[i+1]).
// Launched as a single block; assumes blockDim.x >= size and size <= 512 (the
// capacity of the shared staging buffer). d_out[size-1] is never written here —
// the host code memsets the output buffer to 0 before the launch.
__global__ void allocate_kernel(double* const d_in, double* const d_out, const size_t size) {
__shared__ double shared[512]; // block-local staging buffer; uninitialized until written below
int tid = threadIdx.x;
if(tid < size)
shared[tid] = d_in[tid];
__syncthreads(); // barrier: all writes to shared[] must complete before reading a neighbor's slot
if(tid < size-1)
d_out[tid]=shared[tid+1]; // left shift: each thread reads its right neighbor's element
__syncthreads(); // no shared reads follow; this trailing barrier is redundant but harmless
}
// Buggy version from the question (kept as-is — the answer below explains the fix).
// Allocates device buffers, copies `signal` to the device, and launches the
// left-shift kernel.
// BUG: d_inputCurrent and d_outputCurrent are received BY VALUE, so the device
// addresses written by cudaMalloc below never reach the caller — the caller's
// pointers remain uninitialized, and its later cudaMemcpy/cudaFree on them fail
// with "invalid argument".
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size) {
const dim3 blockSize(512);
const dim3 gridSize(1);
checkCudaErrors(cudaFree(0)); // no-op free; commonly used to force CUDA context creation
// NOTE(review): taking the address of a `double* const` parameter and letting
// cudaMalloc overwrite it casts away the const — another sign the const
// qualifiers here are wrong (see the answer's remarks).
checkCudaErrors(cudaMalloc((void **)&d_inputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMalloc((void **)&d_outputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMemset(d_outputCurrent, 0, sizeof(double) * size)); // pre-zero so the unwritten last element reads as 0
checkCudaErrors(cudaMemcpy(d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));
allocate_kernel<<<gridSize, blockSize>>>(d_inputCurrent, d_outputCurrent, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); // surface async kernel errors here
}
这是我博士论文的一小部分,我正在用这段代码练习 CUDA,我知道现在它没有那么大的意义,但我无法继续前进,因为我被这个问题困住了。任何帮助将不胜感激,提前致谢。
在 C 中,您不能将指针按值传递给函数、让该函数在内部修改该指针,然后期望对指针的修改体现在调用环境中:
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
...
//allocate memory in GPU and kernel call for phase A
// at this point, d_inputCurrentIa and d_outputCurrentIa are pointing to nothing
allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
// allocate modified those pointers internally, but the modified values don't show up here
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
// therefore you will get an error here, because d_outputCurrentIa still points to nothing
有多种方法可以完成这项工作。一种方法是传递要修改和使用的指针的 地址:
void allocate(double** d_inputCurrent, double* signal, double **d_outputCurrent, const size_t size);
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
...
//allocate memory in GPU and kernel call for phase A
allocate(&d_inputCurrentIa, signalA, &d_outputCurrentIa, size);
...
// Fixed version from the answer: the device pointers are now passed by ADDRESS
// (double**), so the cudaMalloc results propagate back to the caller, which can
// then safely cudaMemcpy from / cudaFree the returned device buffers.
void allocate(double** d_inputCurrent, double* signal, double** d_outputCurrent, const size_t size) {
const dim3 blockSize(512);
const dim3 gridSize(1);
checkCudaErrors(cudaFree(0)); // no-op free; commonly used to force CUDA context creation
// cudaMalloc writes the device address through the caller-supplied pointer-to-pointer
checkCudaErrors(cudaMalloc((void **)d_inputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMalloc((void **)d_outputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMemset(*d_outputCurrent, 0, sizeof(double) * size)); // pre-zero so the unwritten last element reads as 0
checkCudaErrors(cudaMemcpy(*d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));
allocate_kernel<<<gridSize, blockSize>>>(*d_inputCurrent, *d_outputCurrent, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); // surface async kernel errors here
}
备注:
不确定为什么要把这些指针标记为 const。它们在任何意义上都不是 const(该函数既会修改指针本身的值,也会修改它所指向的数据)。
以上代码是在浏览器里直接写的,未经编译测试,您可能还需要修复一些其他问题。由于您没有提供可运行的完整代码,我也没有提供完整的代码,但这应该是一个路线图。
在函数中分配可能会导致内存泄漏。你可能想考虑一下。如果您要重复使用或创建大量指针,请务必制定释放这些指针的计划。