Thrust - 在 gpu 上对 class 对象的成员数组进行排序

Thrust - sorting member arrays of class object on gpu

目前我正在努力将一个用普通 cpu C++ 编写的分子动力学模拟程序移植到 Cuda。简而言之,程序初始化一个原子列表,将控制转移到 class CCalc 的对象,该对象计算 100(或其他次数)迭代的原子力、速度和位置,最后 returns 在屏幕上绘制原子。

我的目标是在 gpu 上的 CCalc 运行 中拥有所有计算量大的函数。为了避免必须将 CCalc 中的所有计算常量一一复制,我决定将整个 class 复制到设备内存中,由 this__d 指向。由于绘图函数是从 cpu 调用的,原子列表需要每 100 次迭代在 cpu 和 gpu 之间复制,因此它不是 CCalc 的成员。

在函数 CCalc::refreshCellList() 中,我想重新排列 atoms__d(驻留在设备内存中的原子列表),以便将同一单元中的所有原子组合在一起。换句话说,atoms__d需要以cellId为键进行排序。

因为我不想浪费时间实现自己的排序算法,所以我尝试使用 thrust::sort_by_key()。这就是我被困的地方。函数 thrust::sort_by_key() 需要 device_ptr 个对象作为参数;但是我无法访问 cellId,因为我只能将 this__d 转换为 device_ptr,我无法在 cpu.

上取消引用

有没有办法在不破坏 "class on gpu" 结构的情况下做到这一点?

这是我的代码(摘录):

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <vector>
#include <thrust\sort.h>
#include <thrust\device_ptr.h>

#define REFRESH_CELL_LISTS 20

struct Atom
{
    float pos[3];
    float vel[3];
    float force[3];
    // others
};

std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;

class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);

class CCalc
{
private:
    CCalc *this__d;
public:
    const int nAtoms = noOfAtoms;
    int *cellId;
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    CCalc()
    {
        cudaMalloc((void**)&cellId, nAtoms*sizeof(int));

        // some other stuff

        cudaMalloc((void**)&this__d, sizeof(CCalc));
        cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
    }

    // destructor

    void relaxStructure(int numOfIterations)
    {
        cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
        cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);

        for(int iter = 0; iter < numOfIterations; iter++)
        {
            // stuff
            if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
            // calculate forces; update velocities and positions
        }

        cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
        cudaFree(atom__d);
    }

    // functions for force, velocity and position calculation

    void refreshCellLists()
    {
        makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
        cudaDeviceSynchronize();
        // sort atom__d array using cellId as keys;
        // here is where I would like to use thrust::sort_by_key()
    }
};

__global__ void makeCells(CCalc *C, Atom *A)
{
    int index = blockDim.x*blockIdx.x + threadIdx.x;
    if(index < C->nAtoms)
    {
        // determine cell x, y based on position
        // for now let's use an arbitrary mapping to obtain x, y
        int X = (index * index) % C->nCellX;
        int Y = (index * index) % C->nCellY;

        C->cellId[index] = X + Y * C->nCellX;
    }
}

int main()
{
    cudaSetDevice(0);

    noOfAtoms = 1000; // normally defined by input file
    atom.resize(noOfAtoms);

    // initialise atom positions, velocities and forces

    CCalc calcObject;

    while(true) // as long as we need
    {
        // draw atoms on screen
        calcObject.relaxStructure(100);
    }
}

非常感谢。

In other words, atoms__d needs to be sorted with cellId as keys.

应该可以在 refreshCellLists 方法中指定的位置执行此操作。为简单起见,我选择直接使用原始设备指针(尽管我们也可以轻松地将这些原始设备指针包装在 thrust::device_ptr 中)结合 thrust::device 执行策略。这是一个有效的例子:

$ cat t1156.cu
#include <vector>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>

#define REFRESH_CELL_LISTS 20

struct Atom
{
    float pos[3];
    float vel[3];
    float force[3];
    // others
};

std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;

class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);

class CCalc
{
private:
    CCalc *this__d;
public:
    const int nAtoms = noOfAtoms;
    int *cellId;
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    CCalc()
    {
        cudaMalloc((void**)&cellId, nAtoms*sizeof(int));

        // some other stuff

        cudaMalloc((void**)&this__d, sizeof(CCalc));
        cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
    }

    // destructor

    void relaxStructure(int numOfIterations)
    {
        cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
        cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);

        for(int iter = 0; iter < numOfIterations; iter++)
        {
            // stuff
            if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
            // calculate forces; update velocities and positions
        }

        cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
        cudaFree(atom__d);
    }

    // functions for force, velocity and position calculation

    void refreshCellLists()
    {
        makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
        cudaDeviceSynchronize();
        // sort atom__d array using cellId as keys;
        thrust::sort_by_key(thrust::device, cellId, cellId+nAtoms, atom__d);
    }
};

__global__ void makeCells(CCalc *C, Atom *A)
{
    int index = blockDim.x*blockIdx.x + threadIdx.x;
    if(index < C->nAtoms)
    {
        // determine cell x, y based on position
        // for now let's use an arbitrary mapping to obtain x, y
        int X = (index * index) % C->nCellX;
        int Y = (index * index) % C->nCellY;

        C->cellId[index] = X + Y * C->nCellX;
    }
}

int main()
{
    cudaSetDevice(0);

    noOfAtoms = 1000; // normally defined by input file
    atom.resize(noOfAtoms);

    // initialise atom positions, velocities and forces

    CCalc calcObject;

    for (int i = 0; i < 100; i++) // as long as we need
    {
        // draw atoms on screen
        calcObject.relaxStructure(100);
    }
}
$ nvcc -std=c++11 -o t1156 t1156.cu
$ cuda-memcheck ./t1156
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

在构建推力代码时,尤其是在 windows 上,我通常会总结出一组建议 here