Thrust - 在 gpu 上对 class 对象的成员数组进行排序
Thrust - sorting member arrays of class object on gpu
目前我正在努力将一个用普通 cpu C++ 编写的分子动力学模拟程序移植到 Cuda。简而言之,程序初始化一个原子列表,将控制转移到 class CCalc
的对象,该对象计算 100
(或其他次数)迭代的原子力、速度和位置,最后 returns 在屏幕上绘制原子。
我的目标是在 gpu 上的 CCalc
运行 中拥有所有计算量大的函数。为了避免必须将 CCalc
中的所有计算常量一一复制,我决定将整个 class 复制到设备内存中,由 this__d
指向。由于绘图函数是从 cpu 调用的,原子列表需要每 100
次迭代在 cpu 和 gpu 之间复制,因此它不是 CCalc
的成员。
在函数 CCalc::refreshCellList()
中,我想重新排列 atoms__d
(驻留在设备内存中的原子列表),以便将同一单元中的所有原子组合在一起。换句话说,atoms__d
需要以cellId
为键进行排序。
因为我不想浪费时间实现自己的排序算法,所以我尝试使用 thrust::sort_by_key()
。这就是我被困的地方。函数 thrust::sort_by_key()
需要 device_ptr
个对象作为参数;但是我无法访问 cellId
,因为我只能将 this__d
转换为 device_ptr
,我无法在 cpu.
上取消引用
有没有办法在不破坏 "class on gpu" 结构的情况下做到这一点?
这是我的代码(摘录):
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <vector>
#include <thrust\sort.h>
#include <thrust\device_ptr.h>
#define REFRESH_CELL_LISTS 20
struct Atom
{
float pos[3];
float vel[3];
float force[3];
// others
};
std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
class CCalc
{
private:
CCalc *this__d;
public:
const int nAtoms = noOfAtoms;
int *cellId;
const int nCellX = 4, nCellY = 3;
// many force calculation constants
CCalc()
{
cudaMalloc((void**)&cellId, nAtoms*sizeof(int));
// some other stuff
cudaMalloc((void**)&this__d, sizeof(CCalc));
cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
}
// destructor
void relaxStructure(int numOfIterations)
{
cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);
for(int iter = 0; iter < numOfIterations; iter++)
{
// stuff
if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
// calculate forces; update velocities and positions
}
cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
cudaFree(atom__d);
}
// functions for force, velocity and position calculation
void refreshCellLists()
{
makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
cudaDeviceSynchronize();
// sort atom__d array using cellId as keys;
// here is where I would like to use thrust::sort_by_key()
}
};
__global__ void makeCells(CCalc *C, Atom *A)
{
int index = blockDim.x*blockIdx.x + threadIdx.x;
if(index < C->nAtoms)
{
// determine cell x, y based on position
// for now let's use an arbitrary mapping to obtain x, y
int X = (index * index) % C->nCellX;
int Y = (index * index) % C->nCellY;
C->cellId[index] = X + Y * C->nCellX;
}
}
int main()
{
cudaSetDevice(0);
noOfAtoms = 1000; // normally defined by input file
atom.resize(noOfAtoms);
// initialise atom positions, velocities and forces
CCalc calcObject;
while(true) // as long as we need
{
// draw atoms on screen
calcObject.relaxStructure(100);
}
}
非常感谢。
In other words, atoms__d needs to be sorted with cellId as keys.
应该可以在 refreshCellLists
方法中指定的位置执行此操作。为简单起见,我选择直接使用原始设备指针(尽管我们也可以轻松地将这些原始设备指针包装在 thrust::device_ptr
中)结合 thrust::device
执行策略。这是一个有效的例子:
$ cat t1156.cu
#include <vector>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#define REFRESH_CELL_LISTS 20
struct Atom
{
float pos[3];
float vel[3];
float force[3];
// others
};
std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
class CCalc
{
private:
CCalc *this__d;
public:
const int nAtoms = noOfAtoms;
int *cellId;
const int nCellX = 4, nCellY = 3;
// many force calculation constants
CCalc()
{
cudaMalloc((void**)&cellId, nAtoms*sizeof(int));
// some other stuff
cudaMalloc((void**)&this__d, sizeof(CCalc));
cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
}
// destructor
void relaxStructure(int numOfIterations)
{
cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);
for(int iter = 0; iter < numOfIterations; iter++)
{
// stuff
if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
// calculate forces; update velocities and positions
}
cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
cudaFree(atom__d);
}
// functions for force, velocity and position calculation
void refreshCellLists()
{
makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
cudaDeviceSynchronize();
// sort atom__d array using cellId as keys;
thrust::sort_by_key(thrust::device, cellId, cellId+nAtoms, atom__d);
}
};
__global__ void makeCells(CCalc *C, Atom *A)
{
int index = blockDim.x*blockIdx.x + threadIdx.x;
if(index < C->nAtoms)
{
// determine cell x, y based on position
// for now let's use an arbitrary mapping to obtain x, y
int X = (index * index) % C->nCellX;
int Y = (index * index) % C->nCellY;
C->cellId[index] = X + Y * C->nCellX;
}
}
int main()
{
cudaSetDevice(0);
noOfAtoms = 1000; // normally defined by input file
atom.resize(noOfAtoms);
// initialise atom positions, velocities and forces
CCalc calcObject;
for (int i = 0; i < 100; i++) // as long as we need
{
// draw atoms on screen
calcObject.relaxStructure(100);
}
}
$ nvcc -std=c++11 -o t1156 t1156.cu
$ cuda-memcheck ./t1156
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
在构建推力代码时,尤其是在 windows 上,我通常会总结出一组建议 here。
目前我正在努力将一个用普通 cpu C++ 编写的分子动力学模拟程序移植到 Cuda。简而言之,程序初始化一个原子列表,将控制转移到 class CCalc
的对象,该对象计算 100
(或其他次数)迭代的原子力、速度和位置,最后 returns 在屏幕上绘制原子。
我的目标是在 gpu 上的 CCalc
运行 中拥有所有计算量大的函数。为了避免必须将 CCalc
中的所有计算常量一一复制,我决定将整个 class 复制到设备内存中,由 this__d
指向。由于绘图函数是从 cpu 调用的,原子列表需要每 100
次迭代在 cpu 和 gpu 之间复制,因此它不是 CCalc
的成员。
在函数 CCalc::refreshCellList()
中,我想重新排列 atoms__d
(驻留在设备内存中的原子列表),以便将同一单元中的所有原子组合在一起。换句话说,atoms__d
需要以cellId
为键进行排序。
因为我不想浪费时间实现自己的排序算法,所以我尝试使用 thrust::sort_by_key()
。这就是我被困的地方。函数 thrust::sort_by_key()
需要 device_ptr
个对象作为参数;但是我无法访问 cellId
,因为我只能将 this__d
转换为 device_ptr
,我无法在 cpu.
有没有办法在不破坏 "class on gpu" 结构的情况下做到这一点?
这是我的代码(摘录):
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <vector>
#include <thrust\sort.h>
#include <thrust\device_ptr.h>
#define REFRESH_CELL_LISTS 20
struct Atom
{
float pos[3];
float vel[3];
float force[3];
// others
};
std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
class CCalc
{
private:
CCalc *this__d;
public:
const int nAtoms = noOfAtoms;
int *cellId;
const int nCellX = 4, nCellY = 3;
// many force calculation constants
CCalc()
{
cudaMalloc((void**)&cellId, nAtoms*sizeof(int));
// some other stuff
cudaMalloc((void**)&this__d, sizeof(CCalc));
cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
}
// destructor
void relaxStructure(int numOfIterations)
{
cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);
for(int iter = 0; iter < numOfIterations; iter++)
{
// stuff
if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
// calculate forces; update velocities and positions
}
cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
cudaFree(atom__d);
}
// functions for force, velocity and position calculation
void refreshCellLists()
{
makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
cudaDeviceSynchronize();
// sort atom__d array using cellId as keys;
// here is where I would like to use thrust::sort_by_key()
}
};
__global__ void makeCells(CCalc *C, Atom *A)
{
int index = blockDim.x*blockIdx.x + threadIdx.x;
if(index < C->nAtoms)
{
// determine cell x, y based on position
// for now let's use an arbitrary mapping to obtain x, y
int X = (index * index) % C->nCellX;
int Y = (index * index) % C->nCellY;
C->cellId[index] = X + Y * C->nCellX;
}
}
int main()
{
cudaSetDevice(0);
noOfAtoms = 1000; // normally defined by input file
atom.resize(noOfAtoms);
// initialise atom positions, velocities and forces
CCalc calcObject;
while(true) // as long as we need
{
// draw atoms on screen
calcObject.relaxStructure(100);
}
}
非常感谢。
In other words, atoms__d needs to be sorted with cellId as keys.
应该可以在 refreshCellLists
方法中指定的位置执行此操作。为简单起见,我选择直接使用原始设备指针(尽管我们也可以轻松地将这些原始设备指针包装在 thrust::device_ptr
中)结合 thrust::device
执行策略。这是一个有效的例子:
$ cat t1156.cu
#include <vector>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#define REFRESH_CELL_LISTS 20
struct Atom
{
float pos[3];
float vel[3];
float force[3];
// others
};
std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
class CCalc
{
private:
CCalc *this__d;
public:
const int nAtoms = noOfAtoms;
int *cellId;
const int nCellX = 4, nCellY = 3;
// many force calculation constants
CCalc()
{
cudaMalloc((void**)&cellId, nAtoms*sizeof(int));
// some other stuff
cudaMalloc((void**)&this__d, sizeof(CCalc));
cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
}
// destructor
void relaxStructure(int numOfIterations)
{
cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);
for(int iter = 0; iter < numOfIterations; iter++)
{
// stuff
if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
// calculate forces; update velocities and positions
}
cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
cudaFree(atom__d);
}
// functions for force, velocity and position calculation
void refreshCellLists()
{
makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
cudaDeviceSynchronize();
// sort atom__d array using cellId as keys;
thrust::sort_by_key(thrust::device, cellId, cellId+nAtoms, atom__d);
}
};
__global__ void makeCells(CCalc *C, Atom *A)
{
int index = blockDim.x*blockIdx.x + threadIdx.x;
if(index < C->nAtoms)
{
// determine cell x, y based on position
// for now let's use an arbitrary mapping to obtain x, y
int X = (index * index) % C->nCellX;
int Y = (index * index) % C->nCellY;
C->cellId[index] = X + Y * C->nCellX;
}
}
int main()
{
cudaSetDevice(0);
noOfAtoms = 1000; // normally defined by input file
atom.resize(noOfAtoms);
// initialise atom positions, velocities and forces
CCalc calcObject;
for (int i = 0; i < 100; i++) // as long as we need
{
// draw atoms on screen
calcObject.relaxStructure(100);
}
}
$ nvcc -std=c++11 -o t1156 t1156.cu
$ cuda-memcheck ./t1156
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
在构建推力代码时,尤其是在 windows 上,我通常会总结出一组建议 here。