SoA 内 AoS 的 CUDA 内存分配
CUDA Memory Allocation for AoS inside a SoA
我一直在一个程序中工作,该程序需要在另一个结构数组或数组结构中使用结构数组,我决定在给定初始条件(有动态)的情况下使用这种方法,以下是我试图在 CUDA
中分配的结构
struct population
{
int id;
tour *tours;
};
struct tour
{
int id;
node *nodes;
double value;
int node_qty;
};
struct node
{
int id;
double x;
double y;
int item_qty;
item *items;
};
struct item
{
float weight;
float value;
};
如您所见,这组结构是一个内在的结构,正如我所说的,大多数属性都是动态的(P.E:节点数量、项目数量和游览数量).我已经多次尝试分配内存,但结果几乎总是相同的“访问冲突写入位置”。作为旁注,我尝试遵循其他问题的一些建议,例如:cudaMemcpy segmentation fault or this .
以下代码分配了大部分内存,但是当我尝试访问结构的属性时,结果是“非法内存访问”
// 1. cudaMalloc a pointer to device memory that hold population
population* d_initial_population;
HANDLE_ERROR(cudaMalloc((void**)&d_initial_population, sizeof(population)));
// 2. Create a separate tour pointer on the host.
tour* d_tour_ptr;
HANDLE_ERROR(cudaMalloc((void**)&d_tour_ptr, sizeof(tour) * POPULATION_SIZE));
// 3. Create a separate node pointer on the host.
node* d_node_ptr[POPULATION_SIZE];
// Allocate memory on device according to population size
for (int i = 0; i < POPULATION_SIZE; ++i)
{
// 4. cudaMalloc node storage on the device for node pointer
HANDLE_ERROR(cudaMalloc((void**)&(d_node_ptr[i]), sizeof(node) * node_quantity));
// 5. cudaMemcpy the pointer value of node pointer from host to the device node pointer
HANDLE_ERROR(cudaMemcpy(&(d_tour_ptr[i].nodes), &(d_node_ptr[i]), sizeof(node*), cudaMemcpyHostToDevice));
// Optional: Copy an instantiated object on the host to the device pointer
HANDLE_ERROR(cudaMemcpy(d_node_ptr[i], initial_tour.nodes, sizeof(node) * node_quantity, cudaMemcpyHostToDevice));
}
// 6. cudaMemcpy the pointer value of tour pointer from host to the device population pointer
HANDLE_ERROR(cudaMemcpy(&(d_initial_population->tours), &d_tour_ptr, sizeof(tour*), cudaMemcpyHostToDevice));
在这个初始方法之后,我的下一个尝试是先尝试分配内部结构,然后再向上,我的尝试是使用节点和项目结构如下
// Define a pointer for struct "node"
node* dev_node;
// 1. cudaMalloc a pointer to device memory that will hold the struct "node", in this case is called "dev_node"
HANDLE_ERROR(cudaMalloc((void**)&dev_node, node_quantity * sizeof(node)));
// 2. (optionally) copy an instantiated object of struct "node" on the host to the device pointer "dev_node" from step 1 using cudaMemcpy
HANDLE_ERROR(cudaMemcpy(dev_node, n, node_quantity * sizeof(node), cudaMemcpyHostToDevice));
// 3. Create a separate "item" pointer on the host, in this case it's called "dev_item"
item* dev_item;
// 4. cudaMalloc "item" storage on the device for "dev_item"
HANDLE_ERROR(cudaMalloc((void**)&dev_item, node_quantity));
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
}
// 5. cudaMemcpy the pointer value of "dev_item" from the host to the device pointer &(dev_node->i)
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMemcpy(&(dev_node[i].items), &(dev_item[i]), sizeof(item*), cudaMemcpyHostToDevice));
}
// 6. Copy the embedded data
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMemcpy(&dev_item[i], n[i].items, sizeof(item) * dev_node[i].item_qty, cudaMemcpyHostToDevice));
}
但是最后一次尝试在下面的行中给我一个访问冲突写入位置 0x0000000B00700C00
HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
我想错误与某种丢失或错误的内存分配有关,但我一直无法弄清楚它在哪里。
更新 1:
经过 talonmies 指示的一些研究后,我做了一个简化版本的代码只是为了解决这个问题,但仍然不起作用。
这是我的新代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct item
{
int id;
float weight;
float value;
int node;
int taken;
};
struct node
{
int id;
double x;
double y;
int item_qty;
item* items;
};
struct tour
{
int id;
int node_qty;
node* nodes;
};
struct population
{
int id;
tour* tours;
};
static void HandleError(cudaError_t err, const char* file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
getchar();
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
int main()
{
// Get user defined values
int population_size, tour_size, node_size, item_size;
printf("Enter values for amount of population, amount of tours, amount of nodes and amount of items:\n");
// For this exercise the values are 1 10 5 4
scanf("%i %i %i %i", &population_size, &tour_size, &node_size, &item_size);
printf("\n");
printf("The values are: %i %i %i %i\n", population_size, tour_size, node_size, item_size);
#pragma region ALLOCATE CPU MEMORY
// Declare pointers
population* host_population;
tour* host_tour;
node* host_node;
item* host_item;
// Allocate host memory for population
host_population = (population*)malloc(sizeof(population) * population_size);
for (int p = 0; p < population_size; p++)
{
host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
}
// Allocate host memory for tour
host_tour = (tour*)malloc(sizeof(tour) * tour_size);
for (int t = 0; t < tour_size; t++)
{
host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
}
// Allocate host memory for node
host_node = (node*)malloc(sizeof(node) * node_size);
for (int n = 0; n < node_size; n++)
{
host_node[n].items = (item*)malloc(sizeof(item) * item_size);
}
// Allocate memory for item
host_item = (item*)malloc(sizeof(item) * item_size);
#pragma endregion
#pragma region FILL CPU DATA
//Fill the full structure with information, for test purposes these values are going to be taken
// 1. Item Data
int item_id[4] = { 1,2,3,4 };
float item_value[4] = { 300,50,30,40 };
float item_weight[4] = { 400,200,40,2 };
int item_node[4] = { 3,4,5,2 };
// 2. Node Data
int node_id[5] = { 1,2,3,4,5 };
double node_x[5] = { 0,6,14,11,7 };
double node_y[5] = { 0,-5,5,13,5 };
int node_item[5] = { 0,1,1,1,1 };
// 3. Tour Data
int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };
// 4. Population Data
int population_id = 1;
for (int i = 0; i < item_size; i++)
{
host_item[i].id = item_id[i];
host_item[i].value = item_value[i];
host_item[i].taken = rand() % 2;
host_item[i].node = item_node[i];
host_item[i].weight = item_weight[i];
}
for (int n = 0; n < node_size; n++)
{
host_node[n].id = node_id[n];
host_node[n].x = node_x[n];
host_node[n].y = node_y[n];
host_node[n].item_qty = node_item[n];
for (int i = 0; i < item_size; i++)
{
if (host_node[n].id == host_item[i].node)
{
memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
}
}
}
for (int t = 0; t < tour_size; t++)
{
host_tour[t].id = tour_id[t];
host_tour[t].node_qty = node_size;
memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
}
for (int p = 0; p < population_size; p++)
{
host_population[p].id = population_id;
memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
}
//printStructure(host_population, population_size, tour_size);
#pragma endregion
population* device_population;
tour *device_tour;
node* device_node;
item* device_item;
// Allocate host memory for population
HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * population_size));
// Allocate host memory for tour
HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
return 0;
}
在这个练习之后我有一个更具体的问题要问:这之间有什么区别:
// Allocate host memory for tour
HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
还有这个
// Allocate host memory for tour
device_tour[1];
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
第二个块没有任何问题,但第一个 returns 出现异常“访问冲突写入位置”。有没有办法在不使用静态数组的情况下使第一个块工作?
经过进一步研究,我终于找到了解决方案,正如所建议的那样,解决方案是为结构的每个级别创建主机内存版本。以下是完整的工作示例:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/// <summary>
/// Struct to define item
/// </summary>
/// <returns></returns>
struct item
{
int id;
float weight;
float value;
int node;
int taken;
};
/// <summary>
/// Struct to define a node
/// </summary>
/// <returns></returns>
struct node
{
int id;
double x;
double y;
int item_qty;
item* items;
};
/// <summary>
/// Struct to define a tour
/// </summary>
/// <returns></returns>
struct tour
{
int id;
int node_qty;
node* nodes;
};
/// <summary>
/// Struct to define population
/// </summary>
/// <returns></returns>
struct population
{
int id;
int tour_qty;
tour* tours;
};
static void HandleError(cudaError_t err, const char* file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
getchar();
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
void printStructure(population* pop, int pop_size, int tour_size);
__global__ void populationTest(population* population, int population_size)
{
for (int p = 0; p < population_size; ++p)
{
printf(" > population[%d].id: %d\n", p, population[p].id);
printf(" > population[%d].tour_qty: %d\n", p, population[p].tour_qty);
if (population[p].tour_qty > 0)
{
for (int t = 0; t < population[p].tour_qty; ++t)
{
printf(" > population[%d].tours[%d].node_qty: %d\n", p, t, population[p].tours[t].node_qty);
if (population[p].tours[t].node_qty > 0)
{
for (int n = 0; n < population[p].tours[t].node_qty; ++n)
{
printf(" > population[%d].tours[%d].nodes[%d].id: %d\n", p, t, n, population[p].tours[t].nodes[n].id);
printf(" > population[%d].tours[%d].nodes[%d].x: %lf\n", p, t, n, population[p].tours[t].nodes[n].x);
printf(" > population[%d].tours[%d].nodes[%d].y: %lf\n", p, t, n, population[p].tours[t].nodes[n].y);
printf(" > population[%d].tours[%d].nodes[%d].item_qty: %d\n", p, t, n, population[p].tours[t].nodes[n].item_qty);
if (population[p].tours[t].nodes[n].item_qty > 0)
{
for (int i = 0; i < population[p].tours[t].nodes[n].item_qty; ++i)
{
printf(" > population[%d].tours[%d].nodes[%d].items[%d].id: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].id);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].node: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].node);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].taken: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].taken);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].value: %f\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].value);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].weight: %f\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].weight);
}
}
}
}
}
}
}
printf("\n\n");
}
int main()
{
// Get user defined values
int population_size = 1;
int tour_size = 10;
int node_size = 5;
int item_size = 4;
#pragma region ALLOCATE CPU MEMORY
// Declare pointers
population* host_population;
tour* host_tour;
node* host_node;
item* host_item;
// Allocate host memory for population
host_population = (population*)malloc(sizeof(population) * population_size);
for (int p = 0; p < population_size; p++)
{
host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
}
// Allocate host memory for tour
host_tour = (tour*)malloc(sizeof(tour) * tour_size);
for (int t = 0; t < tour_size; t++)
{
host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
}
// Allocate host memory for node
host_node = (node*)malloc(sizeof(node) * node_size);
for (int n = 0; n < node_size; n++)
{
host_node[n].items = (item*)malloc(sizeof(item) * item_size);
}
// Allocate memory for item
//host_item = (item*)malloc(sizeof(item) * item_size);
//Test for AoSoA
host_item = (item*)malloc(sizeof(item) * item_size * node_size);
#pragma endregion
#pragma region FILL CPU DATA
//Fill the full structure with information, for test purposes these values are going to be taken
// 1. Item Data
int item_id[4] = { 1,2,3,4 };
float item_value[4] = { 300,50,30,40 };
float item_weight[4] = { 400,200,40,2 };
int item_node[4] = { 3,4,5,2 };
// 2. Node Data
int node_id[5] = { 1,2,3,4,5 };
double node_x[5] = { 0,6,14,11,7 };
double node_y[5] = { 0,-5,5,13,5 };
int node_item[5] = { 0,1,1,1,1 };
// 3. Tour Data
int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };
// 4. Population Data
int population_id = 1;
for (int i = 0; i < item_size; i++)
{
host_item[i].id = item_id[i];
host_item[i].value = item_value[i];
host_item[i].taken = rand() % 2;
host_item[i].node = item_node[i];
host_item[i].weight = item_weight[i];
}
for (int n = 0; n < node_size; n++)
{
host_node[n].id = node_id[n];
host_node[n].x = node_x[n];
host_node[n].y = node_y[n];
host_node[n].item_qty = node_item[n];
for (int i = 0; i < item_size; i++)
{
if (host_node[n].id == host_item[i].node)
{
memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
}
}
}
for (int t = 0; t < tour_size; t++)
{
host_tour[t].id = tour_id[t];
host_tour[t].node_qty = node_size;
memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
}
for (int p = 0; p < population_size; p++)
{
host_population[p].id = population_id;
host_population[p].tour_qty = tour_size;
memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
}
printStructure(host_population, population_size, tour_size);
#pragma endregion
#pragma region ALLOCATE GPU MEMORY
// Define pointers for device structs
population* device_population;
tour* device_tour;
node* device_node;
item* device_item;
// Allocate device memory for population
HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * size_t(population_size)));
// Allocate device memory for tour
HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour) * size_t(tour_size)));
// Allocate device memory for node
HANDLE_ERROR(cudaMalloc((void**)&device_node, sizeof(node) * size_t(node_size)));
// Allocate device memory for item
HANDLE_ERROR(cudaMalloc((void**)&device_item, sizeof(item) * size_t(item_size)));
// Copy host item struct with device pointers to device
HANDLE_ERROR(cudaMemcpy(device_item, host_item, sizeof(item) * size_t(item_size), cudaMemcpyHostToDevice));
// Offset pointers
for (int n = 0; n < node_size; ++n)
{
for (int i = 0; i < item_size; ++i)
{
if (host_node[n].id == host_item[i].node)
{
host_node[n].items = device_item + i;
}
}
}
// Copy host node struct with device pointers to device
HANDLE_ERROR(cudaMemcpy(device_node, host_node, sizeof(node) * size_t(node_size), cudaMemcpyHostToDevice));
for (int t = 0; t < tour_size; ++t)
{
host_tour[t].nodes = device_node;
}
// Copy host tour struct with device pointers to device
HANDLE_ERROR(cudaMemcpy(device_tour, host_tour, sizeof(tour) * size_t(tour_size), cudaMemcpyHostToDevice));
for (int p = 0; p < population_size; ++p)
{
host_population[p].tours = device_tour;
}
host_population->tour_qty = tour_size;
HANDLE_ERROR(cudaMemcpy(device_population, host_population, sizeof(population) * size_t(population_size), cudaMemcpyHostToDevice));
populationTest << <1, 1 >> > (device_population, population_size);
HANDLE_ERROR(cudaDeviceSynchronize());
#pragma endregion
return 0;
}
/// <summary>
/// Function to print the tree struct of population
/// </summary>
/// <param name="p"></param>
/// <param name="pop_size"></param>
/// <param name="tour_size"></param>
void printStructure(population* pop, int pop_size, int tour_size)
{
for (int p = 0; p < pop_size; ++p)
{
printf(" > population[%d].id: %d\n", p, pop[p].id);
for (int t = 0; t < tour_size; ++t)
{
printf(" > population[%d].tours[%d].id: %d\n", p, t, pop[p].tours[t].id);
printf(" > population[%d].tours[%d].node_qty: %d\n", p, t, pop[p].tours[t].node_qty);
if (pop[p].tours[t].node_qty > 0)
{
for (int n = 0; n < pop[p].tours[t].node_qty; ++n)
{
printf(" > population[%d].tours[%d].nodes[%d].id: %d\n", p, t, n, pop[p].tours[t].nodes[n].id);
printf(" > population[%d].tours[%d].nodes[%d].x: %lf\n", p, t, n, pop[p].tours[t].nodes[n].x);
printf(" > population[%d].tours[%d].nodes[%d].y: %lf\n", p, t, n, pop[p].tours[t].nodes[n].y);
printf(" > population[%d].tours[%d].nodes[%d].item_qty: %d\n", p, t, n, pop[p].tours[t].nodes[n].item_qty);
if (pop[p].tours[t].nodes[n].item_qty > 0)
{
for (int i = 0; i < pop[p].tours[t].nodes[n].item_qty; ++i)
{
printf(" > population[%d].tours[%d].nodes[%d].items[%d].id: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].id);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].node: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].node);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].taken: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].taken);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].value: %f\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].value);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].weight: %f\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].weight);
}
}
}
}
}
}
printf("\n\n");
}
我一直在一个程序中工作,该程序需要在另一个结构数组或数组结构中使用结构数组,我决定在给定初始条件(有动态)的情况下使用这种方法,以下是我试图在 CUDA
中分配的结构struct population
{
int id;
tour *tours;
};
struct tour
{
int id;
node *nodes;
double value;
int node_qty;
};
struct node
{
int id;
double x;
double y;
int item_qty;
item *items;
};
struct item
{
float weight;
float value;
};
如您所见,这组结构是一个内在的结构,正如我所说的,大多数属性都是动态的(P.E:节点数量、项目数量和游览数量).我已经多次尝试分配内存,但结果几乎总是相同的“访问冲突写入位置”。作为旁注,我尝试遵循其他问题的一些建议,例如:cudaMemcpy segmentation fault or this
以下代码分配了大部分内存,但是当我尝试访问结构的属性时,结果是“非法内存访问”
// 1. cudaMalloc a pointer to device memory that hold population
population* d_initial_population;
HANDLE_ERROR(cudaMalloc((void**)&d_initial_population, sizeof(population)));
// 2. Create a separate tour pointer on the host.
tour* d_tour_ptr;
HANDLE_ERROR(cudaMalloc((void**)&d_tour_ptr, sizeof(tour) * POPULATION_SIZE));
// 3. Create a separate node pointer on the host.
node* d_node_ptr[POPULATION_SIZE];
// Allocate memory on device according to population size
for (int i = 0; i < POPULATION_SIZE; ++i)
{
// 4. cudaMalloc node storage on the device for node pointer
HANDLE_ERROR(cudaMalloc((void**)&(d_node_ptr[i]), sizeof(node) * node_quantity));
// 5. cudaMemcpy the pointer value of node pointer from host to the device node pointer
HANDLE_ERROR(cudaMemcpy(&(d_tour_ptr[i].nodes), &(d_node_ptr[i]), sizeof(node*), cudaMemcpyHostToDevice));
// Optional: Copy an instantiated object on the host to the device pointer
HANDLE_ERROR(cudaMemcpy(d_node_ptr[i], initial_tour.nodes, sizeof(node) * node_quantity, cudaMemcpyHostToDevice));
}
// 6. cudaMemcpy the pointer value of tour pointer from host to the device population pointer
HANDLE_ERROR(cudaMemcpy(&(d_initial_population->tours), &d_tour_ptr, sizeof(tour*), cudaMemcpyHostToDevice));
在这个初始方法之后,我的下一个尝试是先尝试分配内部结构,然后再向上,我的尝试是使用节点和项目结构如下
// Define a pointer for struct "node"
node* dev_node;
// 1. cudaMalloc a pointer to device memory that will hold the struct "node", in this case is called "dev_node"
HANDLE_ERROR(cudaMalloc((void**)&dev_node, node_quantity * sizeof(node)));
// 2. (optionally) copy an instantiated object of struct "node" on the host to the device pointer "dev_node" from step 1 using cudaMemcpy
HANDLE_ERROR(cudaMemcpy(dev_node, n, node_quantity * sizeof(node), cudaMemcpyHostToDevice));
// 3. Create a separate "item" pointer on the host, in this case it's called "dev_item"
item* dev_item;
// 4. cudaMalloc "item" storage on the device for "dev_item"
HANDLE_ERROR(cudaMalloc((void**)&dev_item, node_quantity));
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
}
// 5. cudaMemcpy the pointer value of "dev_item" from the host to the device pointer &(dev_node->i)
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMemcpy(&(dev_node[i].items), &(dev_item[i]), sizeof(item*), cudaMemcpyHostToDevice));
}
// 6. Copy the embedded data
for (int i = 0; i < node_quantity; i++)
{
HANDLE_ERROR(cudaMemcpy(&dev_item[i], n[i].items, sizeof(item) * dev_node[i].item_qty, cudaMemcpyHostToDevice));
}
但是最后一次尝试在下面的行中给我一个访问冲突写入位置 0x0000000B00700C00
HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
我想错误与某种丢失或错误的内存分配有关,但我一直无法弄清楚它在哪里。
更新 1: 经过 talonmies 指示的一些研究后,我做了一个简化版本的代码只是为了解决这个问题,但仍然不起作用。
这是我的新代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct item
{
int id;
float weight;
float value;
int node;
int taken;
};
struct node
{
int id;
double x;
double y;
int item_qty;
item* items;
};
struct tour
{
int id;
int node_qty;
node* nodes;
};
struct population
{
int id;
tour* tours;
};
static void HandleError(cudaError_t err, const char* file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
getchar();
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
int main()
{
// Get user defined values
int population_size, tour_size, node_size, item_size;
printf("Enter values for amount of population, amount of tours, amount of nodes and amount of items:\n");
// For this exercise the values are 1 10 5 4
scanf("%i %i %i %i", &population_size, &tour_size, &node_size, &item_size);
printf("\n");
printf("The values are: %i %i %i %i\n", population_size, tour_size, node_size, item_size);
#pragma region ALLOCATE CPU MEMORY
// Declare pointers
population* host_population;
tour* host_tour;
node* host_node;
item* host_item;
// Allocate host memory for population
host_population = (population*)malloc(sizeof(population) * population_size);
for (int p = 0; p < population_size; p++)
{
host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
}
// Allocate host memory for tour
host_tour = (tour*)malloc(sizeof(tour) * tour_size);
for (int t = 0; t < tour_size; t++)
{
host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
}
// Allocate host memory for node
host_node = (node*)malloc(sizeof(node) * node_size);
for (int n = 0; n < node_size; n++)
{
host_node[n].items = (item*)malloc(sizeof(item) * item_size);
}
// Allocate memory for item
host_item = (item*)malloc(sizeof(item) * item_size);
#pragma endregion
#pragma region FILL CPU DATA
//Fill the full structure with information, for test purposes these values are going to be taken
// 1. Item Data
int item_id[4] = { 1,2,3,4 };
float item_value[4] = { 300,50,30,40 };
float item_weight[4] = { 400,200,40,2 };
int item_node[4] = { 3,4,5,2 };
// 2. Node Data
int node_id[5] = { 1,2,3,4,5 };
double node_x[5] = { 0,6,14,11,7 };
double node_y[5] = { 0,-5,5,13,5 };
int node_item[5] = { 0,1,1,1,1 };
// 3. Tour Data
int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };
// 4. Population Data
int population_id = 1;
for (int i = 0; i < item_size; i++)
{
host_item[i].id = item_id[i];
host_item[i].value = item_value[i];
host_item[i].taken = rand() % 2;
host_item[i].node = item_node[i];
host_item[i].weight = item_weight[i];
}
for (int n = 0; n < node_size; n++)
{
host_node[n].id = node_id[n];
host_node[n].x = node_x[n];
host_node[n].y = node_y[n];
host_node[n].item_qty = node_item[n];
for (int i = 0; i < item_size; i++)
{
if (host_node[n].id == host_item[i].node)
{
memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
}
}
}
for (int t = 0; t < tour_size; t++)
{
host_tour[t].id = tour_id[t];
host_tour[t].node_qty = node_size;
memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
}
for (int p = 0; p < population_size; p++)
{
host_population[p].id = population_id;
memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
}
//printStructure(host_population, population_size, tour_size);
#pragma endregion
population* device_population;
tour *device_tour;
node* device_node;
item* device_item;
// Allocate host memory for population
HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * population_size));
// Allocate host memory for tour
HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
return 0;
}
在这个练习之后我有一个更具体的问题要问:这之间有什么区别:
// Allocate host memory for tour
HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
还有这个
// Allocate host memory for tour
device_tour[1];
for (int p = 0; p < population_size; p++)
{
HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
}
第二个块没有任何问题,但第一个 returns 出现异常“访问冲突写入位置”。有没有办法在不使用静态数组的情况下使第一个块工作?
经过进一步研究,我终于找到了解决方案,正如所建议的那样,解决方案是为结构的每个级别创建主机内存版本。以下是完整的工作示例:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/// <summary>
/// Struct to define item
/// </summary>
/// <returns></returns>
struct item
{
int id;
float weight;
float value;
int node;
int taken;
};
/// <summary>
/// Struct to define a node
/// </summary>
/// <returns></returns>
struct node
{
int id;
double x;
double y;
int item_qty;
item* items;
};
/// <summary>
/// Struct to define a tour
/// </summary>
/// <returns></returns>
struct tour
{
int id;
int node_qty;
node* nodes;
};
/// <summary>
/// Struct to define population
/// </summary>
/// <returns></returns>
struct population
{
int id;
int tour_qty;
tour* tours;
};
static void HandleError(cudaError_t err, const char* file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
getchar();
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
void printStructure(population* pop, int pop_size, int tour_size);
__global__ void populationTest(population* population, int population_size)
{
for (int p = 0; p < population_size; ++p)
{
printf(" > population[%d].id: %d\n", p, population[p].id);
printf(" > population[%d].tour_qty: %d\n", p, population[p].tour_qty);
if (population[p].tour_qty > 0)
{
for (int t = 0; t < population[p].tour_qty; ++t)
{
printf(" > population[%d].tours[%d].node_qty: %d\n", p, t, population[p].tours[t].node_qty);
if (population[p].tours[t].node_qty > 0)
{
for (int n = 0; n < population[p].tours[t].node_qty; ++n)
{
printf(" > population[%d].tours[%d].nodes[%d].id: %d\n", p, t, n, population[p].tours[t].nodes[n].id);
printf(" > population[%d].tours[%d].nodes[%d].x: %lf\n", p, t, n, population[p].tours[t].nodes[n].x);
printf(" > population[%d].tours[%d].nodes[%d].y: %lf\n", p, t, n, population[p].tours[t].nodes[n].y);
printf(" > population[%d].tours[%d].nodes[%d].item_qty: %d\n", p, t, n, population[p].tours[t].nodes[n].item_qty);
if (population[p].tours[t].nodes[n].item_qty > 0)
{
for (int i = 0; i < population[p].tours[t].nodes[n].item_qty; ++i)
{
printf(" > population[%d].tours[%d].nodes[%d].items[%d].id: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].id);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].node: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].node);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].taken: %d\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].taken);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].value: %f\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].value);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].weight: %f\n", p, t, n, i, population[p].tours[t].nodes[n].items[i].weight);
}
}
}
}
}
}
}
printf("\n\n");
}
int main()
{
// Get user defined values
int population_size = 1;
int tour_size = 10;
int node_size = 5;
int item_size = 4;
#pragma region ALLOCATE CPU MEMORY
// Declare pointers
population* host_population;
tour* host_tour;
node* host_node;
item* host_item;
// Allocate host memory for population
host_population = (population*)malloc(sizeof(population) * population_size);
for (int p = 0; p < population_size; p++)
{
host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
}
// Allocate host memory for tour
host_tour = (tour*)malloc(sizeof(tour) * tour_size);
for (int t = 0; t < tour_size; t++)
{
host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
}
// Allocate host memory for node
host_node = (node*)malloc(sizeof(node) * node_size);
for (int n = 0; n < node_size; n++)
{
host_node[n].items = (item*)malloc(sizeof(item) * item_size);
}
// Allocate memory for item
//host_item = (item*)malloc(sizeof(item) * item_size);
//Test for AoSoA
host_item = (item*)malloc(sizeof(item) * item_size * node_size);
#pragma endregion
#pragma region FILL CPU DATA
//Fill the full structure with information, for test purposes these values are going to be taken
// 1. Item Data
int item_id[4] = { 1,2,3,4 };
float item_value[4] = { 300,50,30,40 };
float item_weight[4] = { 400,200,40,2 };
int item_node[4] = { 3,4,5,2 };
// 2. Node Data
int node_id[5] = { 1,2,3,4,5 };
double node_x[5] = { 0,6,14,11,7 };
double node_y[5] = { 0,-5,5,13,5 };
int node_item[5] = { 0,1,1,1,1 };
// 3. Tour Data
int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };
// 4. Population Data
int population_id = 1;
for (int i = 0; i < item_size; i++)
{
host_item[i].id = item_id[i];
host_item[i].value = item_value[i];
host_item[i].taken = rand() % 2;
host_item[i].node = item_node[i];
host_item[i].weight = item_weight[i];
}
for (int n = 0; n < node_size; n++)
{
host_node[n].id = node_id[n];
host_node[n].x = node_x[n];
host_node[n].y = node_y[n];
host_node[n].item_qty = node_item[n];
for (int i = 0; i < item_size; i++)
{
if (host_node[n].id == host_item[i].node)
{
memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
}
}
}
for (int t = 0; t < tour_size; t++)
{
host_tour[t].id = tour_id[t];
host_tour[t].node_qty = node_size;
memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
}
for (int p = 0; p < population_size; p++)
{
host_population[p].id = population_id;
host_population[p].tour_qty = tour_size;
memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
}
printStructure(host_population, population_size, tour_size);
#pragma endregion
#pragma region ALLOCATE GPU MEMORY
// Define pointers for device structs
population* device_population;
tour* device_tour;
node* device_node;
item* device_item;
// Allocate device memory for population
HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * size_t(population_size)));
// Allocate device memory for tour
HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour) * size_t(tour_size)));
// Allocate device memory for node
HANDLE_ERROR(cudaMalloc((void**)&device_node, sizeof(node) * size_t(node_size)));
// Allocate device memory for item
HANDLE_ERROR(cudaMalloc((void**)&device_item, sizeof(item) * size_t(item_size)));
// Copy host item struct with device pointers to device
HANDLE_ERROR(cudaMemcpy(device_item, host_item, sizeof(item) * size_t(item_size), cudaMemcpyHostToDevice));
// Offset pointers
for (int n = 0; n < node_size; ++n)
{
for (int i = 0; i < item_size; ++i)
{
if (host_node[n].id == host_item[i].node)
{
host_node[n].items = device_item + i;
}
}
}
// Copy host node struct with device pointers to device
HANDLE_ERROR(cudaMemcpy(device_node, host_node, sizeof(node) * size_t(node_size), cudaMemcpyHostToDevice));
for (int t = 0; t < tour_size; ++t)
{
host_tour[t].nodes = device_node;
}
// Copy host tour struct with device pointers to device
HANDLE_ERROR(cudaMemcpy(device_tour, host_tour, sizeof(tour) * size_t(tour_size), cudaMemcpyHostToDevice));
for (int p = 0; p < population_size; ++p)
{
host_population[p].tours = device_tour;
}
host_population->tour_qty = tour_size;
HANDLE_ERROR(cudaMemcpy(device_population, host_population, sizeof(population) * size_t(population_size), cudaMemcpyHostToDevice));
populationTest << <1, 1 >> > (device_population, population_size);
HANDLE_ERROR(cudaDeviceSynchronize());
#pragma endregion
return 0;
}
/// <summary>
/// Function to print the tree struct of population
/// </summary>
/// <param name="p"></param>
/// <param name="pop_size"></param>
/// <param name="tour_size"></param>
void printStructure(population* pop, int pop_size, int tour_size)
{
for (int p = 0; p < pop_size; ++p)
{
printf(" > population[%d].id: %d\n", p, pop[p].id);
for (int t = 0; t < tour_size; ++t)
{
printf(" > population[%d].tours[%d].id: %d\n", p, t, pop[p].tours[t].id);
printf(" > population[%d].tours[%d].node_qty: %d\n", p, t, pop[p].tours[t].node_qty);
if (pop[p].tours[t].node_qty > 0)
{
for (int n = 0; n < pop[p].tours[t].node_qty; ++n)
{
printf(" > population[%d].tours[%d].nodes[%d].id: %d\n", p, t, n, pop[p].tours[t].nodes[n].id);
printf(" > population[%d].tours[%d].nodes[%d].x: %lf\n", p, t, n, pop[p].tours[t].nodes[n].x);
printf(" > population[%d].tours[%d].nodes[%d].y: %lf\n", p, t, n, pop[p].tours[t].nodes[n].y);
printf(" > population[%d].tours[%d].nodes[%d].item_qty: %d\n", p, t, n, pop[p].tours[t].nodes[n].item_qty);
if (pop[p].tours[t].nodes[n].item_qty > 0)
{
for (int i = 0; i < pop[p].tours[t].nodes[n].item_qty; ++i)
{
printf(" > population[%d].tours[%d].nodes[%d].items[%d].id: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].id);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].node: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].node);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].taken: %d\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].taken);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].value: %f\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].value);
printf(" > population[%d].tours[%d].nodes[%d].items[%d].weight: %f\n", p, t, n, i, pop[p].tours[t].nodes[n].items[i].weight);
}
}
}
}
}
}
printf("\n\n");
}