GPU 上动态结构数组的内存分配

Question

我在将结构体数组传递给 GPU 内核时遇到了问题。我参考了这个主题——“cudaMemcpy segmentation fault”，并写出了如下代码：

#include <stdio.h>
#include <stdlib.h>

// Element type of the array passed to the kernel. Each Test holds only a
// pointer; the character buffer it points to is allocated separately in main().
struct Test {
    char *array;  // per-element character buffer
};

// Prints the first five characters of dev_test[0].array, one per line, with
// device-side printf. The loop bound 5 is hard-coded and no thread or block
// index is used; main() launches this kernel as <<<1, 1>>>.
__global__ void kernel(Test *dev_test) {
    for(int i=0; i < 5; i++) {
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
    }
}

// Host driver: builds n Test elements on the host, each with a size-byte
// buffer holding 'a'..'e', then tries to mirror them on the device and
// launches the kernel. No CUDA return value is checked anywhere in this function.
int main(void) {

    int n = 4, size = 5;
    Test *dev_test, *test;

    // Host array of structs plus one host buffer per element.
    test = (Test*)malloc(sizeof(Test)*n);
    for(int i = 0; i < n; i++)
        test[i].array = (char*)malloc(size * sizeof(char));

    // Fill every host buffer with the same five characters.
    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    // Device array of structs. This copy transfers the structs as they are,
    // i.e. with `array` still holding the pointers returned by host malloc.
    cudaMalloc((void**)&dev_test, n * sizeof(Test));
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);
    for(int i=0; i < n; i++) {
        // NOTE(review): cudaMalloc writes the new device pointer into
        // test[i].array, replacing the host pointer whose buffer was filled above.
        cudaMalloc((void**)&(test[i].array), size * sizeof(char));
        // NOTE(review): both arguments are addresses of the pointer fields, so
        // `size` bytes of the pointer variable itself are copied; the character
        // data is never transferred to the device.
        cudaMemcpy(&(dev_test[i].array), &(test[i].array), size * sizeof(char), cudaMemcpyHostToDevice);
    }

    kernel<<<1, 1>>>(dev_test);
    // Block until the kernel has finished.
    cudaDeviceSynchronize();

    //  memory free (placeholder: nothing is released here)
    return 0;
}

没有错误，但内核中显示的值不正确。我做错了什么？在此先感谢您的帮助。

Answer 1

这是在分配一个指向主机内存的新指针：

 test[i].array = (char*)malloc(size * sizeof(char));

这是将数据复制到主机内存中的那个区域：
```
 memcpy(test[i].array, temp, size * sizeof(char));
```
这是用一个指向设备内存的新指针，覆盖了先前分配的主机内存指针（来自上面的第 1 步）：
```
 cudaMalloc((void**)&(test[i].array), size * sizeof(char));
```

在第 3 步之后，您在第 2 步中设置的数据将完全丢失，并且无法再以任何方式访问。参考您链接的 question/answer 中的步骤 3 和 4：

3.Create a separate int pointer on the host, let's call it myhostptr

4.cudaMalloc int storage on the device for myhostptr

你并没有这样做。你没有创建一个单独的指针，而是重复使用（擦除、覆盖）了一个现有的指针，而该指针原本指向主机上你所关心的数据。你所链接的答案中还链接了另一个问答（This question/answer），其中几乎完整地给出了你需要遵循的步骤以及相应的代码。

下面是你的代码的修改版本，它正确实现了你所链接的问答中描述的、而你未能正确实现的第 3、4（以及第 5）步（请参见代码中标注 Step 3、4、5 的注释）：

$ cat t755.cu
#include <stdio.h>
#include <stdlib.h>

// Element type of the array passed to the kernel. The struct embeds a pointer,
// so the buffer it refers to must be allocated and copied separately from the
// struct itself (see Steps 3-5 in main()).
struct Test {
    char *array;  // per-element character buffer
};

// Debug kernel: dumps the first five characters of element 0's buffer, one
// per line, via device-side printf. It ignores thread/block indices, so it is
// meant to be launched with a single thread (<<<1, 1>>>).
__global__ void kernel(Test *dev_test) {
    const char *chars = dev_test[0].array;
    int pos = 0;
    while (pos < 5) {
        printf("Kernel[0][i]: %c \n", chars[pos]);
        ++pos;
    }
}

// Reports a failed CUDA runtime call with its source location and terminates.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Host driver: builds n Test elements on the host (each owning a size-byte
// buffer holding 'a'..'e'), mirrors the array of structs AND every embedded
// buffer on the device, launches the kernel, then releases all memory.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main(void) {

    // Compile-time constants, so temp_data below is an ordinary fixed-size
    // array rather than a variable-length array (a compiler extension in C++).
    const int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);
    if (test == NULL) {
        fprintf(stderr, "host allocation of struct array failed\n");
        return EXIT_FAILURE;
    }
    for(int i = 0; i < n; i++) {
        test[i].array = (char*)malloc(size * sizeof(char));
        if (test[i].array == NULL) {
            fprintf(stderr, "host allocation of buffer %d failed\n", i);
            return EXIT_FAILURE;
        }
    }

    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    // The structs are copied as-is, so the device-side `array` fields still
    // hold host pointers at this point; Step 5 patches them.
    CUDA_CHECK(cudaMalloc((void**)&dev_test, n * sizeof(Test)));
    CUDA_CHECK(cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice));

    // Step 3: separate host-side variables that will hold the device pointers,
    // so the host pointers in test[i].array are not overwritten.
    char *temp_data[n];
    // Step 4: allocate device storage for each embedded buffer.
    for (int i=0; i < n; i++)
      CUDA_CHECK(cudaMalloc((void**)&(temp_data[i]), size*sizeof(char)));
    // Step 5: copy the pointer VALUE (sizeof(char *) bytes) into the
    // corresponding field of the device-side struct.
    for (int i=0; i < n; i++)
      CUDA_CHECK(cudaMemcpy(&(dev_test[i].array), &(temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice));
    // now copy the embedded data:
    for (int i=0; i < n; i++)
      CUDA_CHECK(cudaMemcpy(temp_data[i], test[i].array, size*sizeof(char), cudaMemcpyHostToDevice));

    kernel<<<1, 1>>>(dev_test);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // memory free: device buffers first, then the device struct array, then
    // the host-side buffers and struct array.
    for (int i=0; i < n; i++)
      CUDA_CHECK(cudaFree(temp_data[i]));
    CUDA_CHECK(cudaFree(dev_test));
    for (int i=0; i < n; i++)
      free(test[i].array);
    free(test);
    return 0;
}

$ nvcc -o t755 t755.cu
$ cuda-memcheck ./t755
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$

由于上述方法对初学者来说可能具有挑战性，通常的建议是不要这样做，而是展平您的数据结构。扁平化一般是指重新排列数据存储，以去除必须单独分配的嵌入式指针。

扁平化此数据结构的一个简单示例是改用它：

// Flattened variant: the five characters are stored inline in the struct, so
// there is no embedded pointer that needs a separate allocation.
struct Test {
    char array[5];
};

当然，这种特定的做法并不能满足所有需求，但它应该能够说明总体的思路/意图。例如，做了这一修改之后，代码就变得简单多了：

$ cat t755.cu
#include <stdio.h>
#include <stdlib.h>

// Flattened element type: the character data lives inside the struct, so a
// single copy of the struct array also transfers the characters.
struct Test {
    char array[5];  // inline storage; main() fills it with 'a'..'e'
};

// Debug kernel: writes the five characters stored inline in the first element
// to stdout, one per line, via device-side printf. It does not use thread or
// block indices and is launched with a single thread (<<<1, 1>>>).
__global__ void kernel(Test *dev_test) {
    const Test &first = *dev_test;
    for (int k = 0; k != 5; ++k) {
        printf("Kernel[0][i]: %c \n", first.array[k]);
    }
}

// Reports a failed CUDA runtime call with its source location and terminates.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Host driver for the flattened layout: fills n Test elements on the host,
// copies the whole array to the device in one transfer, launches the kernel,
// then releases the device and host arrays.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main(void) {

    const int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);
    if (test == NULL) {
        fprintf(stderr, "host allocation of struct array failed\n");
        return EXIT_FAILURE;
    }

    // `size` matches the inline array length declared in Test (char array[5]).
    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    // No embedded pointers: one allocation and one copy move everything.
    CUDA_CHECK(cudaMalloc((void**)&dev_test, n * sizeof(Test)));
    CUDA_CHECK(cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice));

    kernel<<<1, 1>>>(dev_test);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // memory free
    CUDA_CHECK(cudaFree(dev_test));
    free(test);
    return 0;
}
$ nvcc -o t755 t755.cu
$ cuda-memcheck ./t755
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$

GPU 上动态结构数组的内存分配

Memory allocation on GPU for dynamic array of structs

c

struct

cuda

dynamic-memory-allocation