CUDA in C: How to fix Error 11 with cudaMemcpyAsync
I am currently trying to get a simple multi-GPU program running with CUDA. What it basically does is copy a large array, filled with some dummy data, in chunks to the GPUs, which do some math and then copy the resulting array back.

I don't get any errors in the VS2017 output, but some error messages I have set up show me that a cudaErrorInvalidValue occurs while trying to copy either H2D or D2H. Also, when using the cudaFree() function, I get a cudaErrorInvalidDevicePointer error.

The output of the program, the result, is completely wrong. For testing purposes, the kernel only sets every value of the output array to 50. The result is a relatively large negative number, always the same regardless of what the kernel does.

I have already tried using a pointer that is not part of the struct, but is defined right before the cudaMalloc where it is first used. That did not change anything.

This is the function that runs the kernel:
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
    cudaSetDevice(device);
    cudaStreamCreate(&gpuplan.stream);

    cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d

    cudaError_t x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
    }

    dummyKernel<<<BLOCK_N, THREAD_N, 0, gpuplan.stream>>>(gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel

    x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("no successful kernel launch\nKernel Launch Error %i\n", x);
    }
    else {
        printf("kernel ran.\n");
    }

    cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h

    x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
    }

    cudaStreamDestroy(gpuplan.stream);
}
And this is how the structs are defined in "kernel.h":
#ifndef KERNEL_H
#define KERNEL_H

#include "cuda_runtime.h"

//GPU plan
typedef struct
{
    unsigned int Computations; //computations on this GPU
    unsigned int Repetitions; // amount of kernel repetitions
    unsigned int ComputationsPerRepetition; // amount of computations in every kernel execution
    unsigned int AdditionalComputationRepetitionsCount; // amount of repetitions that need to do one additional computation
    unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this GPU has to start working
    float* d_data_ptr;
    float* d_out_ptr;
    cudaStream_t stream;
} GPUplan;

typedef struct
{
    unsigned int Computations;
    unsigned int ComputationsPerThread; // number of computations every thread of this repetition on this GPU has to do
    unsigned int AdditionalComputationThreadCount; // number of threads in this repetition on this GPU that have to do one additional computation
    unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this repetition has to start working
} KernelPlan;

GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter);
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N);
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan);
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan);
void memFree(int device, GPUplan gpuPlan);
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount);

#endif
Here is the part of the code that calls runKernel:
int GPU_N;
cudaGetDeviceCount(&GPU_N);

const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);

float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);

float* sourcePointer;
float* destPointer;

for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
    //malloc
    for (int j = 0; j < GPU_N; j++)
    {
        if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
        {
            memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
        }
    }
    //kernel launch/memcpy
    for (int j = 0; j < GPU_N; j++)
    {
        if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
        {
            sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
            destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
            runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
        }
    }
    for (int j = 0; j < GPU_N; j++)
    {
        if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
        {
            memFree(j, plan[j]);
        }
    }
}
I don't think the kernel itself matters here, since the memcpy errors appear even before it is executed.

The expected output is that every element of the output array is 50. Instead, every element is -431602080.0. The arrays are float arrays.

Edit: here is the full code used to reproduce the problem (in addition to kernel.h above):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include "kernel.h"
#define MAX_GPU_COUNT 32
#define MAX_REP_COUNT 64
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
int computations = d_ComputationsPerThread; //computations to be performed in this repetition on this GPU
const int threadID = blockDim.x * blockIdx.x + threadIdx.x; //thread id within GPU Repetition
if (threadID > d_AdditionalComputationThreadCount) {
computations++; //check if thread has to do an additional computation
}
for (int i = 0; i < computations; i++) {
d_out[i * blockDim.x * gridDim.x + threadID] = 50;
}
}
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter)
{
GPUplan plan;
size_t free, total;
//computations on GPU #device
plan.Computations = DATA_N / GPU_N;
//take into account odd data size for this GPU
if (DATA_N % GPU_N > device) {
plan.Computations++;
}
plan.DataStartingPoint = dataCounter;
//get memory information
cudaSetDevice(device);
cudaMemGetInfo(&free, &total);
//calculate Repetitions on this GPU #device
plan.Repetitions = ((plan.Computations * MemoryPerComputation / free) + 1);
printf("Repetitions: %i\n", plan.Repetitions);
if (plan.Repetitions > MAX_REP_COUNT) {
printf("Repetition count larger than MAX_REP_COUNT %i\n\n", MAX_REP_COUNT);
}
//calculate Computations per Repetition
plan.ComputationsPerRepetition = plan.Computations / plan.Repetitions;
//calculate how many Repetitions have to do an additional Computation
plan.AdditionalComputationRepetitionsCount = plan.Computations % plan.Repetitions;
return plan;
}
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N)
{
KernelPlan plan;
//calculate total Calculations in this Repetition
plan.Computations = GPUComputationsPerRepetition;
if (GPUAdditionalComputationRepetitionsCount > Repetition) {
plan.Computations++;
}
plan.ComputationsPerThread = plan.Computations / (THREAD_N * BLOCK_N); // Computations every thread has to do (+- 1)
plan.AdditionalComputationThreadCount = plan.Computations % (THREAD_N * BLOCK_N); // how many threads have to do +1 calculation
plan.DataStartingPoint = GPUDataStartingPoint + dataCounter;
return plan;
}
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Error Selecting device %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 1 on GPU %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_out_ptr), MemoryPerComputation * kernelPlan.Computations); // device output array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 2 on GPU %i: Error %i\n", device, x);
}
}
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
void memFree(int device, GPUplan gpuPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaFree(gpuPlan.d_data_ptr);
cudaFree(gpuPlan.d_out_ptr);
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memfree on GPU %i: Error %i\n", device, x);
}
else {
printf("memory freed.\n");
}
//17 = cudaErrorInvalidDevicePointer
}
int main()
{
//get device count
int GPU_N;
cudaGetDeviceCount(&GPU_N);
//adjust for device count larger than MAX_GPU_COUNT
if (GPU_N > MAX_GPU_COUNT)
{
GPU_N = MAX_GPU_COUNT;
}
printf("GPU count: %i\n", GPU_N);
//definitions for running the program
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
///////////////////////////////////////////////////////////
//Subdividing input data across GPUs
//////////////////////////////////////////////
//GPUplan
GPUplan plan[MAX_GPU_COUNT];
int dataCounter = 0;
for (int i = 0; i < GPU_N; i++)
{
plan[i] = planGPUComputation(DATA_N, GPU_N, i, MemoryPerComputation, dataCounter);
dataCounter += plan[i].Computations;
}
//KernelPlan
KernelPlan kernelPlan[MAX_GPU_COUNT*MAX_REP_COUNT];
for (int i = 0; i < GPU_N; i++)
{
int GPURepetitions = plan[i].Repetitions;
dataCounter = plan[i].DataStartingPoint;
for (int j = 0; j < GPURepetitions; j++)
{
kernelPlan[i*MAX_REP_COUNT + j] = planKernelComputation(plan[i].DataStartingPoint, plan[i].ComputationsPerRepetition, plan[i].AdditionalComputationRepetitionsCount, j, dataCounter, THREAD_N, BLOCK_N);
dataCounter += kernelPlan[i*MAX_REP_COUNT + j].Computations;
}
}
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
//generate some input data
for (int i = 0; i < DATA_N; i++) {
h_data[i] = 2 * i;
}
//get highest repetition count
int maxRepetitionCount = 0;
for (int i = 0; i < GPU_N; i++) {
if (plan[i].Repetitions > maxRepetitionCount) {
maxRepetitionCount = plan[i].Repetitions;
}
}
printf("maxRepetitionCount: %i\n\n", maxRepetitionCount);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
//printing expected results and results
for (int i = 0; i < 50; i++)
{
printf("%f\t", h_data[i]);
printf("%f\n", h_out[i]);
}
free(h_data);
free(h_out);
getchar();
return 0;
}
The first problem really has nothing to do with CUDA. When you pass a struct by-value to a function in C or C++, a copy of that struct is made for use by the function. Modifications to that struct in the function have no effect on the original struct in the calling environment. This is affecting your memAllocation function:
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
                                                                 ^^^^^^^
                                                                 passed by value
{
    cudaSetDevice(device); //select device to allocate memory on
    cudaError_t x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Error Selecting device %i: Error %i\n", device, x);
    }
    cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
                         ^^^^^^^^^^^^^^^^^^
                         modifying the copy, not the original
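To see those copy semantics in isolation, here is a minimal sketch with no CUDA involved (the names Plan, setByValue, and setByPointer are hypothetical, chosen only for this illustration):

#include <stdio.h>

typedef struct { float *ptr; } Plan;              /* stand-in for GPUplan */

static float storage;                             /* dummy target, stands in for a device allocation */

void setByValue(Plan p)    { p.ptr = &storage; }  /* modifies the function's local copy only */
void setByPointer(Plan *p) { p->ptr = &storage; } /* modifies the caller's struct */

int main(void)
{
    Plan plan = { NULL };
    setByValue(plan);
    printf("after by-value call:   %p\n", (void *)plan.ptr); /* still NULL */
    setByPointer(&plan);
    printf("after by-pointer call: %p\n", (void *)plan.ptr); /* now points at storage */
    return 0;
}

The cudaMalloc calls above are doing the equivalent of setByValue: they store a valid device address into a struct member that is thrown away when memAllocation returns.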
This is fairly simple to fix by passing the gpuPlan struct by reference rather than by value. Modify both the prototype in the kernel.h header file, and the definition:
void memAllocation(int device, int MemoryPerComputation, GPUplan &gpuPlan, KernelPlan kernelPlan)
                                                                 ^
With that change, the struct is passed by reference, and modifications to it (such as the setting of the allocation pointers) will show up in the calling environment. This is the proximal reason for the invalid argument reports on the cudaMemcpy operations: the pointers you were passing were unallocated, because your allocations were done on the pointer copies, not on the originals.
After that change, your code may be running correctly. At least when I ran it, no errors were displayed and the outputs all appeared to be set to 50.
However, there are still problems with this code. If you run it with cuda-memcheck (or turn on the memory checker functionality in Nsight VSE), you should see errors associated with this line of code, which is indexing out of bounds:
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
...
    d_out[i * blockDim.x * gridDim.x + threadID] = 50; //indexing out of bounds
I'm not going to try to sort that out for you. It seems evident to me that your for-loop, combined with the way you compute the index, is writing past the end of the array. You can follow the methodology discussed here if you wish.
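As a sketch of that methodology: if the kernel's only job is to set every element of its chunk, the usual fix is to pass the element count explicitly and use a grid-stride loop, which makes the per-thread computations bookkeeping unnecessary and bounds every write by the array length. The kernel below is a hypothetical replacement, not the original code:

// Hypothetical replacement for dummyKernel, assuming the intent is simply to
// write 50 into each of the n elements of this chunk (n would be
// kernelPlan.Computations). The grid-stride loop guarantees that every index
// written is < n, for any grid/block configuration.
__global__ void setKernel(float *d_out, int n)
{
    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
         i < n;
         i += blockDim.x * gridDim.x)
    {
        d_out[i] = 50.0f;
    }
}

// launched as, for example:
// setKernel<<<BLOCK_N, THREAD_N, 0, gpuplan.stream>>>(gpuplan.d_out_ptr, kernelPlan.Computations);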