在 Cuda 中启动 2d 线程块崩溃
Lauching 2d block of threads in Cuda crashes
我正在构建 n 体模拟,使用 CUDA 来提高性能。我目前正致力于进一步并行化系统,以便粒子之间的每个交互都在单独的线程上运行。这将理论复杂度降低到 1,仅受 GPU 速度的限制。
为了实现这一点,我尝试使用 (N/T, N/T)
网格和 T*T
块(其中 T是每个块的线程数)。我已经能够启动 N*N 网格,但每当我尝试使用多维块(线程)时,内核就会崩溃:
error code invalid configuration arguments
这是 T=512 和 N=5000,但将它们减少到 T=128 和 N=1000 没有效果。以下是一些规格和代码:
Cuda SDK 版本:7.5
显卡:GTX 970 4gb
CC 版本:5.2
在 Windows 7
中的 MSVS 2013 64 位中编译
内核启动代码
dim3 block(TPB, TPB);
dim3 grid;
grid.x = (numParticles + TPB - 1) / TPB;
grid.y = (numParticles + TPB - 1) / TPB;
doParticles<<< grid, block >>>(d_pos, d_vel, d_acc, d_mass, numParticles, dt);
如何更改此代码以实现我的目标?
我可以 post 一些内核代码等,但认为这并不重要,因为内核甚至没有启动。让我知道是否有任何其他信息有用。
提前致谢。
编辑:
MCVE
主要
#define TPB 32
....
unsigned int numParticles = 1000;
p_type* h_pos;
p_type* h_vel;
p_type* h_acc;
p_type* h_mass;
p_type* d_pos;
p_type* d_vel;
p_type* d_acc;
p_type* d_mass;
int pointsPerParticleVec = 3;
size_t size = sizeof(p_type) * 3 * numParticles;
h_pos = (p_type*)malloc(size);
h_vel = (p_type*)malloc(size);
h_acc = (p_type*)malloc(size);
h_mass = (p_type*)malloc(size / 3);
d_pos = NULL;
d_vel = NULL;
d_acc = NULL;
cudaError_t err = cudaSuccess;
//allocate space on GPU
err = cudaMalloc((void **)&d_pos, size);
err = cudaMalloc((void **)&d_vel, size);
err = cudaMalloc((void **)&d_acc, size);
err = cudaMalloc((void **)&d_mass, size / 3);
//nothing really matters for this example just making sure no gargage values happen
for (int partIt = 0; partIt < numParticles; partIt++)
{
int index = partIt * 3;
h_pos[index] = 0;
h_pos[index + 1] = 0;
h_pos[index + 2] = 0;
h_vel[index] = 0;
h_vel[index + 1] = 0;
h_vel[index + 2] = 0;
h_acc[index] = 0;
h_acc[index + 1] = 0;
h_acc[index + 2] = 0;
h_mass[partIt] = 0;
}
err = cudaMemcpy(d_pos, h_pos, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_vel, h_vel, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_acc, h_acc, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_mass, h_mass, size / 3, cudaMemcpyHostToDevice);
while (true) //display functionality removed for now
{
//do calculations
float dt = .1;
dim3 block(TPB, TPB);
dim3 grid;
grid.x = (numParticles + TPB - 1) / TPB;
grid.y = (numParticles + TPB - 1) / TPB;
doParticles << < grid, block >> >(d_pos, d_vel, d_acc, d_mass, numParticles, dt); //<<<<<<<<<<<<here is where it does not launch
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err)); //see the error pop up here
exit(EXIT_FAILURE);
}
cudaDeviceSynchronize();
int numBlocks2 = (numParticles * 3 + TPB - 1) / TPB;
//add acceleration to velocity
ARR_ADD << <numBlocks2, TPB >> >(d_vel, d_acc, numParticles * 3);
cudaDeviceSynchronize();
//reset acceleration vector
ARR_SET << <numBlocks2, TPB >> >(d_acc, 0.0f, numParticles * 3);
//add velocity to position
POS_ADD << <numBlocks2, TPB >> >(d_pos, d_vel, numParticles * 3, dt);
//copy vector back to cpu (until opengl-cuda gets implemented)
cudaMemcpy(h_pos, d_pos, sizeof(p_type) * 3 * numParticles, cudaMemcpyDeviceToHost);
}
内核
__device__ float fInvSqrt_D(const float& in)
{
long i;
float x2, y;
const float threehalfs = 1.5F;
x2 = in * 0.5F;
y = in;
i = *(long *)&y;
i = 0x5f3759df - (i >> 1);
y = *(float *)&i;
y = y * (threehalfs - (x2 * y * y));
y = y * (threehalfs - (x2 * y * y)); //extra precision
return abs(y);
}
__global__ void POS_ADD(p_type* getter, const p_type *giver, int N, float dt)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = getter[index] + (giver[index]*dt);
}
}
__global__ void ARR_ADD(p_type* getter, const p_type *giver, int N)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = getter[index] + giver[index];
}
}
__global__ void ARR_SET(p_type* getter, const p_type value, int N)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = value;
}
}
__global__ void doParticles(p_type* pos, p_type* vel, p_type* acc, p_type* mass, int numParticles, float tstep)
{
int blockId = blockIdx.y * gridDim.x + blockIdx.x;
int threadIndex = blockId * blockDim.x + threadIdx.x;
int cRowPos = blockId % gridDim.y;
int notInRow = blockId - cRowPos;
int Y = blockId/gridDim.x + threadIdx.y; //slower changing iterator
int X = threadIndex - ((notInRow) * TPB); //fast changing iterator
int pIndex1 = X * 3;
int pIndex2 =Y * 3;
if (pIndex1 != pIndex2 && Y < numParticles)
{
p_type diffx = (pos[pIndex1] - pos[pIndex2]); //calculating difference between points
p_type diffy = (pos[pIndex1 + 1] - pos[pIndex2 + 1]);
p_type diffz = (pos[pIndex1 + 2] - pos[pIndex2 + 2]);
p_type distsqr = diffx*diffx + diffy*diffy + diffz*diffz;
if (distsqr < 0)
{
distsqr *= -1;
}
if (distsqr < 500)
{
distsqr = 500;
}
p_type attraction = (mass[X] * mass[Y]) / (distsqr); //gravity equation
p_type invsqrt = fInvSqrt_D((float)distsqr);
p_type normx = invsqrt*diffx;
p_type normy = invsqrt*diffy;
p_type normz = invsqrt*diffz;
p_type forcex = normx * -attraction;
p_type forcey = normy * -attraction;
p_type forcez = normz * -attraction;
acc[pIndex1] += (forcex * tstep) / mass[X];
acc[pIndex1 + 1] += (forcey * tstep) / mass[X];
acc[pIndex1 + 2] += (forcez * tstep) / mass[X];
}
}
是的,我知道 doParticle 内核中的索引已损坏。我计划修复它启动的那个。 :)
再次感谢。
CUDA 线程块限制为最多 1024 个线程,块中的线程总数是线程块维度的乘积:
dim3 block(TPB, TPB);
因此,任何大于 32 的 TPB
值在这里都不起作用,当您尝试启动任何此类内核时,您将收到无效的配置参数错误。
因此将 T
或 TPB
减少到 32,您应该能够启动内核。
我正在构建 n 体模拟,使用 CUDA 来提高性能。我目前正致力于进一步并行化系统,以便粒子之间的每个交互都在单独的线程上运行。这将理论复杂度降低到 1,仅受 GPU 速度的限制。
为了实现这一点,我尝试使用 (N/T, N/T)
网格和 T*T
块(其中 T是每个块的线程数)。我已经能够启动 N*N 网格,但每当我尝试使用多维块(线程)时,内核就会崩溃:
error code invalid configuration arguments
这是 T=512 和 N=5000,但将它们减少到 T=128 和 N=1000 没有效果。以下是一些规格和代码:
Cuda SDK 版本:7.5
显卡:GTX 970 4gb
CC 版本:5.2
在 Windows 7
中的 MSVS 2013 64 位中编译内核启动代码
dim3 block(TPB, TPB);
dim3 grid;
grid.x = (numParticles + TPB - 1) / TPB;
grid.y = (numParticles + TPB - 1) / TPB;
doParticles<<< grid, block >>>(d_pos, d_vel, d_acc, d_mass, numParticles, dt);
如何更改此代码以实现我的目标?
我可以 post 一些内核代码等,但认为这并不重要,因为内核甚至没有启动。让我知道是否有任何其他信息有用。
提前致谢。
编辑:
MCVE
主要
#define TPB 32
....
unsigned int numParticles = 1000;
p_type* h_pos;
p_type* h_vel;
p_type* h_acc;
p_type* h_mass;
p_type* d_pos;
p_type* d_vel;
p_type* d_acc;
p_type* d_mass;
int pointsPerParticleVec = 3;
size_t size = sizeof(p_type) * 3 * numParticles;
h_pos = (p_type*)malloc(size);
h_vel = (p_type*)malloc(size);
h_acc = (p_type*)malloc(size);
h_mass = (p_type*)malloc(size / 3);
d_pos = NULL;
d_vel = NULL;
d_acc = NULL;
cudaError_t err = cudaSuccess;
//allocate space on GPU
err = cudaMalloc((void **)&d_pos, size);
err = cudaMalloc((void **)&d_vel, size);
err = cudaMalloc((void **)&d_acc, size);
err = cudaMalloc((void **)&d_mass, size / 3);
//nothing really matters for this example just making sure no gargage values happen
for (int partIt = 0; partIt < numParticles; partIt++)
{
int index = partIt * 3;
h_pos[index] = 0;
h_pos[index + 1] = 0;
h_pos[index + 2] = 0;
h_vel[index] = 0;
h_vel[index + 1] = 0;
h_vel[index + 2] = 0;
h_acc[index] = 0;
h_acc[index + 1] = 0;
h_acc[index + 2] = 0;
h_mass[partIt] = 0;
}
err = cudaMemcpy(d_pos, h_pos, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_vel, h_vel, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_acc, h_acc, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_mass, h_mass, size / 3, cudaMemcpyHostToDevice);
while (true) //display functionality removed for now
{
//do calculations
float dt = .1;
dim3 block(TPB, TPB);
dim3 grid;
grid.x = (numParticles + TPB - 1) / TPB;
grid.y = (numParticles + TPB - 1) / TPB;
doParticles << < grid, block >> >(d_pos, d_vel, d_acc, d_mass, numParticles, dt); //<<<<<<<<<<<<here is where it does not launch
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err)); //see the error pop up here
exit(EXIT_FAILURE);
}
cudaDeviceSynchronize();
int numBlocks2 = (numParticles * 3 + TPB - 1) / TPB;
//add acceleration to velocity
ARR_ADD << <numBlocks2, TPB >> >(d_vel, d_acc, numParticles * 3);
cudaDeviceSynchronize();
//reset acceleration vector
ARR_SET << <numBlocks2, TPB >> >(d_acc, 0.0f, numParticles * 3);
//add velocity to position
POS_ADD << <numBlocks2, TPB >> >(d_pos, d_vel, numParticles * 3, dt);
//copy vector back to cpu (until opengl-cuda gets implemented)
cudaMemcpy(h_pos, d_pos, sizeof(p_type) * 3 * numParticles, cudaMemcpyDeviceToHost);
}
内核
__device__ float fInvSqrt_D(const float& in)
{
long i;
float x2, y;
const float threehalfs = 1.5F;
x2 = in * 0.5F;
y = in;
i = *(long *)&y;
i = 0x5f3759df - (i >> 1);
y = *(float *)&i;
y = y * (threehalfs - (x2 * y * y));
y = y * (threehalfs - (x2 * y * y)); //extra precision
return abs(y);
}
__global__ void POS_ADD(p_type* getter, const p_type *giver, int N, float dt)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = getter[index] + (giver[index]*dt);
}
}
__global__ void ARR_ADD(p_type* getter, const p_type *giver, int N)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = getter[index] + giver[index];
}
}
__global__ void ARR_SET(p_type* getter, const p_type value, int N)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = value;
}
}
__global__ void doParticles(p_type* pos, p_type* vel, p_type* acc, p_type* mass, int numParticles, float tstep)
{
int blockId = blockIdx.y * gridDim.x + blockIdx.x;
int threadIndex = blockId * blockDim.x + threadIdx.x;
int cRowPos = blockId % gridDim.y;
int notInRow = blockId - cRowPos;
int Y = blockId/gridDim.x + threadIdx.y; //slower changing iterator
int X = threadIndex - ((notInRow) * TPB); //fast changing iterator
int pIndex1 = X * 3;
int pIndex2 =Y * 3;
if (pIndex1 != pIndex2 && Y < numParticles)
{
p_type diffx = (pos[pIndex1] - pos[pIndex2]); //calculating difference between points
p_type diffy = (pos[pIndex1 + 1] - pos[pIndex2 + 1]);
p_type diffz = (pos[pIndex1 + 2] - pos[pIndex2 + 2]);
p_type distsqr = diffx*diffx + diffy*diffy + diffz*diffz;
if (distsqr < 0)
{
distsqr *= -1;
}
if (distsqr < 500)
{
distsqr = 500;
}
p_type attraction = (mass[X] * mass[Y]) / (distsqr); //gravity equation
p_type invsqrt = fInvSqrt_D((float)distsqr);
p_type normx = invsqrt*diffx;
p_type normy = invsqrt*diffy;
p_type normz = invsqrt*diffz;
p_type forcex = normx * -attraction;
p_type forcey = normy * -attraction;
p_type forcez = normz * -attraction;
acc[pIndex1] += (forcex * tstep) / mass[X];
acc[pIndex1 + 1] += (forcey * tstep) / mass[X];
acc[pIndex1 + 2] += (forcez * tstep) / mass[X];
}
}
是的,我知道 doParticle 内核中的索引已损坏。我计划修复它启动的那个。 :)
再次感谢。
CUDA 线程块限制为最多 1024 个线程,块中的线程总数是线程块维度的乘积:
dim3 block(TPB, TPB);
因此,任何大于 32 的 TPB
值在这里都不起作用,当您尝试启动任何此类内核时,您将收到无效的配置参数错误。
因此将 T
或 TPB
减少到 32,您应该能够启动内核。