使用 Thrust 对静态分配的数组进行排序

Question

在我的代码中，我在全局内存中有一个静态分配的数组（即，使用 __device__ 分配），我想使用 thrust::sort 对其进行排序，但这是行不通的。本主题的所有示例都使用 CUDA 运行时分配的数组（使用 cudaMalloc）。有什么办法可以对静态分配的数组进行排序吗？

我想这与主机无法访问静态分配的内存有关。使用 cudaMalloc 分配的数组，它工作正常。但是，我想避免使用这种类型的分配，因为静态分配允许更容易地从设备代码访问数据（不是吗？）。

最小（非）工作示例：

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#define N 4

// Key/value record used as the sort element.
// operator< returns true when this element's key is GREATER than the other
// element's key, so a sort that uses it produces descending key order.
typedef struct element {
  int key;
  int value;

  __host__ __device__ bool operator<(element other) const {
    return other.key < key;
  }
} element;

__device__ element array[N];

// Fills the keys of the global `array` so that array[0].key == N - 1 and
// array[N - 1].key == 0 (descending keys). Only `key` is written.
// The loop is serial: every thread that runs it performs all N writes.
__global__ void init() {
  for (int slot = N - 1; slot >= 0; --slot) {
    array[slot].key = N - 1 - slot;
  }
}

// Prints the keys of the global `array` on one line, each followed by a
// space, then a newline. Uses device-side printf.
__global__ void print_array() {
  const element* const last = array + N;
  for (const element* it = array; it != last; ++it) {
    printf("%d ", it->key);
  }
  printf("\n");
}

// Minimal (non-)working example: initialises the statically allocated
// __device__ array, prints it, attempts to sort it with thrust::sort and
// prints it again.
int main(void) {
  // NOTE(review): this is the faulty line. `array` is named directly from
  // host code, which does not yield a usable device pointer; the device
  // address has to be obtained with cudaGetSymbolAddress instead.
  thrust::device_ptr<element> array_first(array);

  // Fill the keys on the device (single thread).
  init<<<1,1>>>();

  printf("unsorted: ");
  print_array<<<1, 1>>>();
  // Wait for the kernels to finish; the returned status is not checked.
  cudaDeviceSynchronize();

  // Sorts [array_first, array_first + N) using element::operator<.
  thrust::sort(array_first, array_first + N);

  printf("sorted: ");
  print_array<<<1, 1>>>();
  // Wait for the kernel to finish; the returned status is not checked.
  cudaDeviceSynchronize();
}

Answer 1

使用cudaGetSymbolAddress从__host__函数中获取array变量的地址：

// Ask the CUDA runtime for the device address of the __device__ symbol.
void* array_ptr = 0;
cudaGetSymbolAddress(&array_ptr, array);

// Wrap the raw device address so Thrust algorithms can use it.
element* array_raw = static_cast<element*>(array_ptr);
thrust::device_ptr<element> array_first(array_raw);

完整程序如下：

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#define N 4

// Key/value record used as the sort element.
// operator< returns true when this element's key is GREATER than the other
// element's key, so a sort that uses it produces descending key order.
typedef struct element {
  int key;
  int value;

  __host__ __device__ bool operator<(element other) const {
    return other.key < key;
  }
} element;

__device__ element array[N];

// Fills the keys of the global `array` so that array[0].key == N - 1 and
// array[N - 1].key == 0 (descending keys). Only `key` is written.
// The loop is serial: every thread that runs it performs all N writes.
__global__ void init() {
  for (int slot = N - 1; slot >= 0; --slot) {
    array[slot].key = N - 1 - slot;
  }
}

// Prints the keys of the global `array` on one line, each followed by a
// space, then a newline. Uses device-side printf.
__global__ void print_array() {
  const element* const last = array + N;
  for (const element* it = array; it != last; ++it) {
    printf("%d ", it->key);
  }
  printf("\n");
}

// Throws thrust::system_error when a CUDA runtime call did not succeed.
// `status` is the cudaError_t returned by the call being checked.
static void throw_on_cuda_error(cudaError_t status) {
  if (status != cudaSuccess) {
    throw thrust::system_error(status, thrust::cuda_category());
  }
}

// Sorts the statically allocated __device__ array with thrust::sort and
// prints its keys before and after.
//
// Every CUDA runtime call and every kernel launch is checked; a failure is
// reported by throwing thrust::system_error.
int main(void) {
  // The symbol `array` cannot be used as a device pointer from host code;
  // ask the runtime for its device address instead.
  void* array_ptr = 0;
  throw_on_cuda_error(cudaGetSymbolAddress(&array_ptr, array));

  thrust::device_ptr<element> array_first(static_cast<element*>(array_ptr));

  init<<<1, 1>>>();
  // A kernel launch returns nothing, so a failed launch has to be picked up
  // explicitly with cudaGetLastError().
  throw_on_cuda_error(cudaGetLastError());

  printf("unsorted: ");
  print_array<<<1, 1>>>();
  throw_on_cuda_error(cudaGetLastError());

  // Waits for both kernels; also surfaces errors raised while they ran.
  throw_on_cuda_error(cudaDeviceSynchronize());

  // Orders the elements with element::operator<, which treats a larger key
  // as "less"; init() already stores the keys in that (descending) order.
  thrust::sort(array_first, array_first + N);

  throw_on_cuda_error(cudaDeviceSynchronize());

  printf("sorted: ");
  print_array<<<1, 1>>>();
  throw_on_cuda_error(cudaGetLastError());

  throw_on_cuda_error(cudaDeviceSynchronize());

  return 0;
}

这是我系统上的输出：

$ nvcc test.cu -run
unsorted: 3 2 1 0 
sorted: 3 2 1 0

排序后的输出与未排序的输出相同，但考虑到数据的生成方式和 element::operator< 的定义，我猜这是故意的。

Answer 2

这个：

__device__ element array[N];

...

thrust::device_ptr<element> array_first(array);

是非法的。在host代码中，array是主机地址，不能传递给设备代码。改为做这样的事情：

// Receives the device address of the __device__ symbol `array`.
element* array_d;
cudaGetSymbolAddress(reinterpret_cast<void**>(&array_d), array);

// Wrap the raw device address so Thrust algorithms can use it.
thrust::device_ptr<element> array_first(array_d);

即您需要使用 cudaGetSymbolAddress 在运行时从 GPU 上下文中读取地址，然后您可以在 GPU 代码中使用该调用的结果。

使用 Thrust 对静态分配的数组进行排序

Sorting statically allocated array using Thrust

c++

sorting

cuda

thrust

最小（非）工作示例：