SoA 内 AoS 的 CUDA 内存分配

Question

我正在开发一个程序，它需要在一个结构体数组（AoS）或数组结构体（SoA）的内部再嵌套结构体数组。由于初始条件是动态的，我决定采用这种方式。以下是我试图在 CUDA 中分配的结构：

// Forward declarations: each struct below holds a pointer to a type that is
// only defined further down, so the names must be introduced first.
struct tour;
struct node;
struct item;

// Top level of the hierarchy: a population owns an array of tours.
struct population
{
    int id;
    tour *tours;     // pointer to an array of tours (length is not stored here)
};

// A tour references an array of nodes.
struct tour
{
    int id;
    node *nodes;     // pointer to an array of node_qty nodes
    double value;
    int node_qty;    // number of entries in `nodes`
};

// A node has 2D coordinates and references an array of items.
struct node 
{
    int id;
    double x;
    double y;
    int item_qty;    // number of entries in `items`
    item *items;     // pointer to an array of item_qty items
};

// Leaf of the hierarchy: plain data, no pointers.
struct item 
{
    float weight;
    float value;
};

如您所见，这组结构是层层嵌套的，而且正如我所说，大多数属性都是动态的（例如：节点数量、物品数量和路径数量）。我已经多次尝试分配内存，但结果几乎总是同一个错误：“访问冲突写入位置”（Access violation writing location）。顺便一提，我也尝试遵循了其他问题中的一些建议，例如“cudaMemcpy segmentation fault”或这个问题。

以下代码分配了大部分内存，但是当我尝试访问结构的属性时，结果是“非法内存访问”

// 1. cudaMalloc a pointer to device memory that hold population
//    (room for exactly one `population` object; only its `tours` field is written below)
population* d_initial_population;

HANDLE_ERROR(cudaMalloc((void**)&d_initial_population, sizeof(population)));
    
// 2. Create a separate tour pointer on the host.
//    The pointer variable lives on the host; after cudaMalloc it holds the device
//    address of an array of POPULATION_SIZE tours.
tour* d_tour_ptr;
HANDLE_ERROR(cudaMalloc((void**)&d_tour_ptr, sizeof(tour) * POPULATION_SIZE));
   
// 3. Create a separate node pointer on the host.
//    Host array; element i will hold the device address of the node array of tour i.
node* d_node_ptr[POPULATION_SIZE];
   
// Allocate memory on device according to population size
for (int i = 0; i < POPULATION_SIZE; ++i)
{
    // 4. cudaMalloc node storage on the device for node pointer
    HANDLE_ERROR(cudaMalloc((void**)&(d_node_ptr[i]), sizeof(node) * node_quantity));
    // 5. cudaMemcpy the pointer value of node pointer from host to the device node pointer
    //    &(d_tour_ptr[i].nodes) only computes a device address on the host; nothing is
    //    dereferenced. The other fields of the device tour are not written in this snippet.
    HANDLE_ERROR(cudaMemcpy(&(d_tour_ptr[i].nodes), &(d_node_ptr[i]), sizeof(node*), cudaMemcpyHostToDevice));
    // Optional: Copy an instantiated object on the host to the device pointer
    // NOTE(review): this is a byte-wise copy of the node array, so each node's `items`
    // pointer is copied unchanged. If initial_tour.nodes[k].items are host addresses
    // (presumably they are - confirm), device code that follows them would perform an
    // illegal memory access; they would need to be replaced with device addresses.
    HANDLE_ERROR(cudaMemcpy(d_node_ptr[i], initial_tour.nodes, sizeof(node) * node_quantity, cudaMemcpyHostToDevice));
}
// 6. cudaMemcpy the pointer value of tour pointer from host to the device population pointer
//    As in step 5, &(d_initial_population->tours) is address arithmetic only.
HANDLE_ERROR(cudaMemcpy(&(d_initial_population->tours), &d_tour_ptr, sizeof(tour*), cudaMemcpyHostToDevice));

在这个最初的方法失败之后，我的下一次尝试是先分配最内层的结构，再逐层向上分配。下面是我针对节点（node）和物品（item）结构所做的尝试：

// Define a pointer for struct "node"
// Define a pointer for struct "node"
node* dev_node;

// 1. cudaMalloc device storage for the array of "node" structs; "dev_node" holds its device address
HANDLE_ERROR(cudaMalloc((void**)&dev_node, node_quantity * sizeof(node)));

// 2. Copy the host nodes "n" to "dev_node". The "items" pointers inside the copied nodes are
//    still host addresses at this point; they are patched with device addresses in step 5.
HANDLE_ERROR(cudaMemcpy(dev_node, n, node_quantity * sizeof(node), cudaMemcpyHostToDevice));

// 3. Create a HOST-side array with one "item*" slot per node. cudaMalloc writes its result
//    through the pointer it is given from host code, so every slot must live in host memory.
//    (The original allocated this table with cudaMalloc and then passed &(dev_item[i]), a
//    device address, to cudaMalloc, which is what raised the access violation.)
item** dev_item_ptrs = new item*[node_quantity];

// 4. cudaMalloc "item" storage on the device for every node
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMalloc((void**)&(dev_item_ptrs[i]), sizeof(item) * initial_tour.nodes[i].item_qty));
}

// 5. cudaMemcpy each device item pointer from the host table into dev_node[i].items.
//    &(dev_node[i].items) is only address arithmetic; the device memory is not dereferenced here.
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMemcpy(&(dev_node[i].items), &(dev_item_ptrs[i]), sizeof(item*), cudaMemcpyHostToDevice));
}

// 6. Copy the embedded data. The element count is read from the host array "n" (the source of
//    the copy in step 2) because dev_node[i].item_qty lives in device memory and must not be
//    read from host code.
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMemcpy(dev_item_ptrs[i], n[i].items, sizeof(item) * n[i].item_qty, cudaMemcpyHostToDevice));
}

// dev_item_ptrs must stay alive until the item buffers are released:
// cudaFree(dev_item_ptrs[i]) for every node, then delete[] dev_item_ptrs.

但是最后一次尝试在下面的行中给我一个访问冲突写入位置 0x0000000B00700C00

HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));

我想错误与某种丢失或错误的内存分配有关，但我一直无法弄清楚它在哪里。

更新 1：按照 talonmies 的提示做了一些研究之后，我写了一个简化版本的代码，专门用来定位这个问题，但它仍然无法正常工作。

这是我的新代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Leaf record of the hierarchy; contains no pointers, so it can be copied byte-wise.
struct item
{
    int id;
    float weight;
    float value;
    int node;     // id of the node this item is matched against (compared with node::id when filling the data)
    int taken;    // filled with rand() % 2 in main, i.e. 0 or 1
};

// A node with 2D coordinates that references an array of items.
struct node
{
    int id;
    double x;
    double y;
    int item_qty;   // number of items stored for this node
    item* items;    // pointer to this node's item array
};

// A tour references an array of nodes.
struct tour
{
    int id;
    int node_qty;   // number of entries in `nodes`
    node* nodes;    // pointer to this tour's node array
};

// Top level of the hierarchy: a population references an array of tours.
struct population
{
    int id;
    tour* tours;    // pointer to this population's tour array (its length is not stored in the struct)
};

// Reports a failed CUDA runtime call and terminates the program.
//   err  - status returned by the CUDA call
//   file - source file of the call site (supplied by HANDLE_ERROR)
//   line - source line of the call site (supplied by HANDLE_ERROR)
// On success the function returns without doing anything. On failure it prints the
// error string, waits for a key press and exits with EXIT_FAILURE.
static void HandleError(cudaError_t err, const char* file, int line)
{
    if (err == cudaSuccess)
        return;

    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    getchar();
    exit(EXIT_FAILURE);
}
// Wraps a CUDA runtime call so that the call site's file and line are reported on failure.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
    
/**
 * Simplified reproduction program.
 *
 * Reads four sizes (populations, tours, nodes, items), builds the
 * population -> tour -> node -> item hierarchy in host memory from hard-coded
 * test tables, then allocates the population array and one tour array per
 * population on the device.
 *
 * Returns 0 on success and EXIT_FAILURE when the input is missing or out of
 * range. CUDA errors terminate the program through HANDLE_ERROR.
 */
int main()
{
    // Capacities of the hard-coded test tables used to fill the host data below
    const int max_items = 4;
    const int max_nodes = 5;
    const int max_tours = 10;

    // Get user defined values
    int population_size, tour_size, node_size, item_size;

    printf("Enter values for amount of population, amount of tours, amount of nodes and amount of items:\n");
    // For this exercise the values are 1 10 5 4
    if (scanf("%i %i %i %i", &population_size, &tour_size, &node_size, &item_size) != 4)
    {
        printf("Expected four integer values\n");
        return EXIT_FAILURE;
    }
    printf("\n");

    printf("The values are: %i %i %i %i\n", population_size, tour_size, node_size, item_size);

    // The fill loops index the fixed-size tables directly, so sizes beyond their
    // capacity (or non-positive sizes) must be rejected before anything is allocated.
    if (population_size < 1 ||
        tour_size < 1 || tour_size > max_tours ||
        node_size < 1 || node_size > max_nodes ||
        item_size < 1 || item_size > max_items)
    {
        printf("Values out of range: population >= 1, tours 1..%i, nodes 1..%i, items 1..%i\n",
               max_tours, max_nodes, max_items);
        return EXIT_FAILURE;
    }

#pragma region ALLOCATE CPU MEMORY

    // Declare pointers
    population* host_population;
    tour* host_tour;
    node* host_node;
    item* host_item;

    // Allocate host memory for population
    host_population = (population*)malloc(sizeof(population) * population_size);
    for (int p = 0; p < population_size; p++)
    {
        host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
    }

    // Allocate host memory for tour
    host_tour = (tour*)malloc(sizeof(tour) * tour_size);
    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
    }

    // Allocate host memory for node
    host_node = (node*)malloc(sizeof(node) * node_size);
    for (int n = 0; n < node_size; n++)
    {
        host_node[n].items = (item*)malloc(sizeof(item) * item_size);
    }

    // Allocate memory for item
    host_item = (item*)malloc(sizeof(item) * item_size);

#pragma endregion

#pragma region FILL CPU DATA

    //Fill the full structure with information, for test purposes these values are going to be taken

    // 1. Item Data
    int item_id[max_items] = { 1,2,3,4 };
    float item_value[max_items] = { 300,50,30,40 };
    float item_weight[max_items] = { 400,200,40,2 };
    int item_node[max_items] = { 3,4,5,2 };

    // 2. Node Data
    int node_id[max_nodes] = { 1,2,3,4,5 };
    double node_x[max_nodes] = { 0,6,14,11,7 };
    double node_y[max_nodes] = { 0,-5,5,13,5 };
    int node_item[max_nodes] = { 0,1,1,1,1 };

    // 3. Tour Data
    int tour_id[max_tours] = { 1,2,3,4,5,6,7,8,9,10 };

    // 4. Population Data
    int population_id = 1;

    for (int i = 0; i < item_size; i++)
    {
        host_item[i].id = item_id[i];
        host_item[i].value = item_value[i];
        host_item[i].taken = rand() % 2;
        host_item[i].node = item_node[i];
        host_item[i].weight = item_weight[i];
    }

    for (int n = 0; n < node_size; n++)
    {
        host_node[n].id = node_id[n];
        host_node[n].x = node_x[n];
        host_node[n].y = node_y[n];
        host_node[n].item_qty = node_item[n];
        for (int i = 0; i < item_size; i++)
        {
            // Copy the item(s) whose `node` field names this node into the node's own item array
            if (host_node[n].id == host_item[i].node)
            {
                memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
            }
        }
    }

    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].id = tour_id[t];
        host_tour[t].node_qty = node_size;
        // Shallow copy: the copied nodes share host_node's `items` arrays
        memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
    }

    for (int p = 0; p < population_size; p++)
    {
        host_population[p].id = population_id;
        // Shallow copy: the copied tours share host_tour's `nodes` arrays
        memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
    }

    //printStructure(host_population, population_size, tour_size);

#pragma endregion

    population* device_population;

    // HOST-side table with one slot per population; every slot receives the device
    // address of that population's tour array. cudaMalloc stores its result through
    // the pointer it is given from host code, so the slots must be host memory:
    // handing it &(device_tour[p]) of a device buffer is what raised the
    // "access violation writing location" error.
    tour** device_tours = (tour**)malloc(sizeof(tour*) * population_size);

    // Allocate device memory for population
    HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * population_size));

    // Allocate device memory for the tours of every population
    for (int p = 0; p < population_size; p++)
    {
        HANDLE_ERROR(cudaMalloc((void**)&(device_tours[p]), sizeof(tour) * tour_size));
        // Publish the device address in device_population[p].tours. The destination
        // address is only computed on the host, never dereferenced.
        HANDLE_ERROR(cudaMemcpy(&(device_population[p].tours), &(device_tours[p]), sizeof(tour*), cudaMemcpyHostToDevice));
    }

    // Release device memory
    for (int p = 0; p < population_size; p++)
    {
        HANDLE_ERROR(cudaFree(device_tours[p]));
    }
    HANDLE_ERROR(cudaFree(device_population));
    free(device_tours);

    // Release host memory. The arrays reached through host_population and host_tour are
    // shallow copies, so every nested buffer is freed exactly once through its owner.
    for (int n = 0; n < node_size; n++)
    {
        free(host_node[n].items);
    }
    free(host_node);
    for (int t = 0; t < tour_size; t++)
    {
        free(host_tour[t].nodes);
    }
    free(host_tour);
    for (int p = 0; p < population_size; p++)
    {
        free(host_population[p].tours);
    }
    free(host_population);
    free(host_item);

    return 0;
}

做完这个练习之后，我有一个更具体的问题：下面这两段代码之间有什么区别？第一段：

    // Allocate device memory for tour (cudaMalloc allocates on the device, not on the host).
    // After this call device_tour holds the address of a buffer in device memory.
    HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
    for (int p = 0; p < population_size; p++)
    {
        // &(device_tour[p]) is an address inside that device buffer. cudaMalloc stores the
        // new pointer through its first argument from host code, so handing it a device
        // address makes the host write to memory it cannot access.
        HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
    }

还有这个

// Allocate device memory for tour, keeping the pointers in a host array.
// NOTE(review): `device_tour[1];` as written is only an expression statement; presumably the
// intended declaration is a host-side array of pointers such as `tour* device_tour[1];` - confirm.
    device_tour[1];
    for (int p = 0; p < population_size; p++)
    {
        // If device_tour is a host array of pointers, &(device_tour[p]) is a host address,
        // which is what cudaMalloc needs in order to store the resulting device pointer.
        HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
    }

第二个代码块没有任何问题，但第一个代码块会抛出异常“访问冲突写入位置”。有没有办法在不使用静态数组的情况下让第一个代码块正常工作？

Answer 1

经过进一步研究，我终于找到了解决方案，正如所建议的那样，解决方案是为结构的每个级别创建主机内存版本。以下是完整的工作示例：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
    
/// <summary>
/// Leaf record of the hierarchy. It contains no pointers, so an array of
/// items can be copied to the device with a single cudaMemcpy.
/// </summary>
struct item
{
    int id;
    float weight;
    float value;
    int node;     // id of the node this item is matched against (compared with node::id in main)
    int taken;    // filled with rand() % 2 in main, i.e. 0 or 1
};

/// <summary>
/// A node with 2D coordinates that references an array of items.
/// </summary>
struct node
{
    int id;
    double x;
    double y;
    int item_qty;   // number of items reachable through `items`; used as the loop bound when printing
    item* items;    // pointer to this node's items
};

/// <summary>
/// A tour that references an array of nodes.
/// </summary>
struct tour
{
    int id;
    int node_qty;   // number of entries in `nodes`; used as the loop bound when printing
    node* nodes;    // pointer to this tour's nodes
};

/// <summary>
/// Top level of the hierarchy: a population that references an array of tours.
/// </summary>
struct population
{
    int id;
    int tour_qty;   // number of entries in `tours`; used as the loop bound in the populationTest kernel
    tour* tours;    // pointer to this population's tours
};

/// <summary>
/// Reports a failed CUDA runtime call and terminates the program. Does nothing
/// when the status is cudaSuccess; otherwise prints the error string together
/// with the call site, waits for a key press and exits with EXIT_FAILURE.
/// </summary>
/// <param name="err">Status returned by the CUDA call</param>
/// <param name="file">Source file of the call site</param>
/// <param name="line">Source line of the call site</param>
static void HandleError(cudaError_t err, const char* file, int line)
{
    if (err == cudaSuccess)
        return;

    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    getchar();
    exit(EXIT_FAILURE);
}
// Wraps a CUDA runtime call so that the call site's file and line are reported on failure.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

void printStructure(population* pop, int pop_size, int tour_size);

/// <summary>
/// Debug kernel: prints the whole population -> tour -> node -> item tree that
/// is reachable from the given device pointer. Every thread that executes the
/// kernel walks the complete tree, so it is meant to be launched with one thread.
/// </summary>
/// <param name="population">Device pointer to an array of populations</param>
/// <param name="population_size">Number of entries in the array</param>
__global__ void populationTest(population* population, int population_size)
{
    for (int pi = 0; pi < population_size; ++pi)
    {
        printf(" > population[%d].id: %d\n", pi, population[pi].id);
        printf(" > population[%d].tour_qty: %d\n", pi, population[pi].tour_qty);

        // A non-positive quantity simply yields zero loop iterations at every level
        const int tour_count = population[pi].tour_qty;
        for (int ti = 0; ti < tour_count; ++ti)
        {
            const int node_count = population[pi].tours[ti].node_qty;
            printf(" > population[%d].tours[%d].node_qty: %d\n", pi, ti, node_count);

            for (int ni = 0; ni < node_count; ++ni)
            {
                const int item_count = population[pi].tours[ti].nodes[ni].item_qty;
                printf(" > population[%d].tours[%d].nodes[%d].id: %d\n", pi, ti, ni, population[pi].tours[ti].nodes[ni].id);
                printf(" > population[%d].tours[%d].nodes[%d].x: %lf\n", pi, ti, ni, population[pi].tours[ti].nodes[ni].x);
                printf(" > population[%d].tours[%d].nodes[%d].y: %lf\n", pi, ti, ni, population[pi].tours[ti].nodes[ni].y);
                printf(" > population[%d].tours[%d].nodes[%d].item_qty: %d\n", pi, ti, ni, item_count);

                for (int ii = 0; ii < item_count; ++ii)
                {
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].id: %d\n", pi, ti, ni, ii, population[pi].tours[ti].nodes[ni].items[ii].id);
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].node: %d\n", pi, ti, ni, ii, population[pi].tours[ti].nodes[ni].items[ii].node);
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].taken: %d\n", pi, ti, ni, ii, population[pi].tours[ti].nodes[ni].items[ii].taken);
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].value: %f\n", pi, ti, ni, ii, population[pi].tours[ti].nodes[ni].items[ii].value);
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].weight: %f\n", pi, ti, ni, ii, population[pi].tours[ti].nodes[ni].items[ii].weight);
                }
            }
        }
    }
    printf("\n\n");
}

/**
 * Complete example: builds the population -> tour -> node -> item hierarchy in
 * host memory, prints it, mirrors it on the device and prints it again from a
 * kernel.
 *
 * For every level, a host-side staging copy is made whose embedded pointer is
 * replaced by the matching device address before the copy is sent to the
 * device. The original host hierarchy keeps its host pointers, so it stays
 * usable and can be freed normally.
 *
 * Returns 0 on success. CUDA errors terminate the program through HANDLE_ERROR.
 */
int main()
{
    // Fixed problem sizes for this example (they match the lengths of the test tables below)
    int population_size = 1;
    int tour_size = 10;
    int node_size = 5;
    int item_size = 4;

#pragma region ALLOCATE CPU MEMORY

    // Declare pointers
    population* host_population;
    tour* host_tour;
    node* host_node;
    item* host_item;

    // Allocate host memory for population
    host_population = (population*)malloc(sizeof(population) * population_size);
    for (int p = 0; p < population_size; p++)
    {
        host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
    }

    // Allocate host memory for tour
    host_tour = (tour*)malloc(sizeof(tour) * tour_size);
    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
    }

    // Allocate host memory for node
    host_node = (node*)malloc(sizeof(node) * node_size);
    for (int n = 0; n < node_size; n++)
    {
        host_node[n].items = (item*)malloc(sizeof(item) * item_size);
    }

    // Allocate memory for item
    //host_item = (item*)malloc(sizeof(item) * item_size);

    //Test for AoSoA
    host_item = (item*)malloc(sizeof(item) * item_size * node_size);

#pragma endregion

#pragma region FILL CPU DATA

    //Fill the full structure with information, for test purposes these values are going to be taken

    // 1. Item Data
    int item_id[4] = { 1,2,3,4 };
    float item_value[4] = { 300,50,30,40 };
    float item_weight[4] = { 400,200,40,2 };
    int item_node[4] = { 3,4,5,2 };

    // 2. Node Data
    int node_id[5] = { 1,2,3,4,5 };
    double node_x[5] = { 0,6,14,11,7 };
    double node_y[5] = { 0,-5,5,13,5 };
    int node_item[5] = { 0,1,1,1,1 };

    // 3. Tour Data
    int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };

    // 4. Population Data
    int population_id = 1;

    for (int i = 0; i < item_size; i++)
    {
        host_item[i].id = item_id[i];
        host_item[i].value = item_value[i];
        host_item[i].taken = rand() % 2;
        host_item[i].node = item_node[i];
        host_item[i].weight = item_weight[i];
    }

    for (int n = 0; n < node_size; n++)
    {
        host_node[n].id = node_id[n];
        host_node[n].x = node_x[n];
        host_node[n].y = node_y[n];
        host_node[n].item_qty = node_item[n];
        for (int i = 0; i < item_size; i++)
        {
            // Copy the item(s) whose `node` field names this node into the node's own item array
            if (host_node[n].id == host_item[i].node)
            {
                memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
            }
        }
    }

    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].id = tour_id[t];
        host_tour[t].node_qty = node_size;
        // Shallow copy: the copied nodes share host_node's `items` arrays
        memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
    }

    for (int p = 0; p < population_size; p++)
    {
        host_population[p].id = population_id;
        host_population[p].tour_qty = tour_size;
        // Shallow copy: the copied tours share host_tour's `nodes` arrays
        memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
    }

    printStructure(host_population, population_size, tour_size);

#pragma endregion

#pragma region ALLOCATE GPU MEMORY

    // Define pointers for device structs
    population* device_population;
    tour* device_tour;
    node* device_node;
    item* device_item;

    // Allocate device memory for population
    HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * size_t(population_size)));

    // Allocate device memory for tour
    HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour) * size_t(tour_size)));

    // Allocate device memory for node
    HANDLE_ERROR(cudaMalloc((void**)&device_node, sizeof(node) * size_t(node_size)));

    // Allocate device memory for item
    HANDLE_ERROR(cudaMalloc((void**)&device_item, sizeof(item) * size_t(item_size)));

    // Items contain no pointers, so they can be copied as they are
    HANDLE_ERROR(cudaMemcpy(device_item, host_item, sizeof(item) * size_t(item_size), cudaMemcpyHostToDevice));

    // Stage the nodes: same data as host_node, but `items` refers to device memory.
    // Nodes without a matching item get NULL instead of a stale host address.
    node* staged_node = (node*)malloc(sizeof(node) * node_size);
    memcpy(staged_node, host_node, sizeof(node) * node_size);
    for (int n = 0; n < node_size; ++n)
    {
        staged_node[n].items = NULL;
        for (int i = 0; i < item_size; ++i)
        {
            if (host_node[n].id == host_item[i].node)
            {
                // Offset into the single device item array
                staged_node[n].items = device_item + i;
            }
        }
    }

    // Copy staged node structs (holding device pointers) to the device
    HANDLE_ERROR(cudaMemcpy(device_node, staged_node, sizeof(node) * size_t(node_size), cudaMemcpyHostToDevice));

    // Stage the tours: every tour refers to the one device node array
    tour* staged_tour = (tour*)malloc(sizeof(tour) * tour_size);
    memcpy(staged_tour, host_tour, sizeof(tour) * tour_size);
    for (int t = 0; t < tour_size; ++t)
    {
        staged_tour[t].nodes = device_node;
    }

    // Copy staged tour structs (holding device pointers) to the device
    HANDLE_ERROR(cudaMemcpy(device_tour, staged_tour, sizeof(tour) * size_t(tour_size), cudaMemcpyHostToDevice));

    // Stage the populations: every population refers to the one device tour array
    population* staged_population = (population*)malloc(sizeof(population) * population_size);
    memcpy(staged_population, host_population, sizeof(population) * population_size);
    for (int p = 0; p < population_size; ++p)
    {
        staged_population[p].tours = device_tour;
    }

    // Copy staged population structs (holding device pointers) to the device
    HANDLE_ERROR(cudaMemcpy(device_population, staged_population, sizeof(population) * size_t(population_size), cudaMemcpyHostToDevice));

    populationTest<<<1, 1>>>(device_population, population_size);
    // A kernel launch returns no status itself: query launch errors explicitly,
    // then synchronize to surface errors raised while the kernel was running.
    HANDLE_ERROR(cudaGetLastError());
    HANDLE_ERROR(cudaDeviceSynchronize());

#pragma endregion

#pragma region RELEASE MEMORY

    // Device buffers
    HANDLE_ERROR(cudaFree(device_population));
    HANDLE_ERROR(cudaFree(device_tour));
    HANDLE_ERROR(cudaFree(device_node));
    HANDLE_ERROR(cudaFree(device_item));

    // Staging copies (their embedded pointers are device addresses and were released above)
    free(staged_population);
    free(staged_tour);
    free(staged_node);

    // Host hierarchy. The arrays reached through host_population and host_tour are
    // shallow copies, so every nested buffer is freed exactly once through its owner.
    for (int n = 0; n < node_size; n++)
    {
        free(host_node[n].items);
    }
    free(host_node);
    for (int t = 0; t < tour_size; t++)
    {
        free(host_tour[t].nodes);
    }
    free(host_tour);
    for (int p = 0; p < population_size; p++)
    {
        free(host_population[p].tours);
    }
    free(host_population);
    free(host_item);

#pragma endregion

    return 0;
}

/// <summary>
/// Host-side dump of the population tree: prints every population, its tours,
/// their nodes and the nodes' items to standard output.
/// </summary>
/// <param name="pop">Host pointer to an array of populations</param>
/// <param name="pop_size">Number of populations to print</param>
/// <param name="tour_size">Number of tours to print for each population</param>
void printStructure(population* pop, int pop_size, int tour_size)
{
    for (int pi = 0; pi < pop_size; ++pi)
    {
        printf(" > population[%d].id: %d\n", pi, pop[pi].id);

        for (int ti = 0; ti < tour_size; ++ti)
        {
            // A non-positive quantity simply yields zero loop iterations below
            const int node_count = pop[pi].tours[ti].node_qty;
            printf(" > population[%d].tours[%d].id: %d\n", pi, ti, pop[pi].tours[ti].id);
            printf(" > population[%d].tours[%d].node_qty: %d\n", pi, ti, node_count);

            for (int ni = 0; ni < node_count; ++ni)
            {
                const int item_count = pop[pi].tours[ti].nodes[ni].item_qty;
                printf(" > population[%d].tours[%d].nodes[%d].id: %d\n", pi, ti, ni, pop[pi].tours[ti].nodes[ni].id);
                printf(" > population[%d].tours[%d].nodes[%d].x: %lf\n", pi, ti, ni, pop[pi].tours[ti].nodes[ni].x);
                printf(" > population[%d].tours[%d].nodes[%d].y: %lf\n", pi, ti, ni, pop[pi].tours[ti].nodes[ni].y);
                printf(" > population[%d].tours[%d].nodes[%d].item_qty: %d\n", pi, ti, ni, item_count);

                for (int ii = 0; ii < item_count; ++ii)
                {
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].id: %d\n", pi, ti, ni, ii, pop[pi].tours[ti].nodes[ni].items[ii].id);
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].node: %d\n", pi, ti, ni, ii, pop[pi].tours[ti].nodes[ni].items[ii].node);
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].taken: %d\n", pi, ti, ni, ii, pop[pi].tours[ti].nodes[ni].items[ii].taken);
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].value: %f\n", pi, ti, ni, ii, pop[pi].tours[ti].nodes[ni].items[ii].value);
                    printf(" > population[%d].tours[%d].nodes[%d].items[%d].weight: %f\n", pi, ti, ni, ii, pop[pi].tours[ti].nodes[ni].items[ii].weight);
                }
            }
        }
    }
    printf("\n\n");
}

SoA 内 AoS 的 CUDA 内存分配

CUDA Memory Allocation for AoS inside a SoA

c

cuda

SoA 内 AoS 的 CUDA 内存分配