CUDA nbody tile calculation error code 77 when increasing array size
I can't figure out this problem in my CUDA code.
I am essentially computing the nbody problem from GPU Gems 3, but with an array whose size keeps growing.
Particles are created in a dedicated array, dev_Ionisation, inside the __global__ void ParticleAmplification() kernel, and are merged into the global array by a host function, DynamicAlloc().
In that function, empty slots are removed and the new particles are appended at the end of the new array.
Since I launch more threads than there are particles, I use an escape variable so threads don't waste time checking for particles that aren't there.
The numbers of blocks and tiles are computed dynamically, and the device array is reallocated with:
checkCudaErrors( cudaFree( dev_vector ) );
checkCudaErrors( cudaMalloc( (void**)&dev_vector, N * sizeof(ParticleProperties) ) );
Then, after a number of steps, usually once the particle count has grown to around 28000, the kernel crashes with error code 77 (cudaDeviceSynchronize() error code 77: cudaErrorIllegalAddress). This seems to point to a wrong size for the dynamically allocated shared array extern __shared__ float3 sharedPos[] in the __device__ float3 computeBodyAccel function. However, that size always seems to be passed to the kernel correctly, with the same value:
size_t sharedMemSize = ThreadsInit * sizeof(float3);
integrateBodies<<<blocksInit, ThreadsInit, sharedMemSize>>>( dev_vectorIonisation, dt, numTiles, nbodyTemp );
When I use a fixed, large array, everything works fine.
What am I doing wrong? Could shared memory be filling up because of memory that is never freed?
Here is the full compilable code:
// ----- Libraries C/C++ ----- //
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <string>
#include <math.h>
#include <time.h>
#include <dirent.h>
// ----- Libraries CUDA ----- //
#include "cuda.h"
#include <helper_cuda.h>
#include "curand_kernel.h"
// ----- Global variables ----- //
#define El_DIM 512
#define imin(a,b) (a<b?a:b)
using namespace std;
__constant__ float softening_ = 1.0e-12; // softening factor for nbody interaction
__device__ __managed__ int NewParticles = 0;
__device__ __managed__ int TotalProcesses = 0;
__device__ __managed__ bool Ampl = false;
const int ThreadsInit = 512;
const int blocksPerGrid = (int)( ( El_DIM * El_DIM + ThreadsInit -1 ) / ThreadsInit );
struct ParticleProperties{
float3 Position, Velocity, Force;
};
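// Note: a Position of (-1, -1, -1) marks an empty slot; Enumerate_Nbody and
// DynamicAlloc test Position.x > -1.0 to find live particles.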
__device__ void initVector( ParticleProperties *dev_Vect, int index ){
dev_Vect[index].Position.x = -1.0;
dev_Vect[index].Position.y = -1.0;
dev_Vect[index].Position.z = -1.0;
dev_Vect[index].Velocity.x = 0.0;
dev_Vect[index].Velocity.y = 0.0;
dev_Vect[index].Velocity.z = 0.0;
dev_Vect[index].Force.x = 0.0;
dev_Vect[index].Force.y = 0.0;
dev_Vect[index].Force.z = 0.0;
}
__device__ void SetVector( ParticleProperties *dev_Vect, float3 position, float4 v, int index ){
dev_Vect[index].Position.x = position.x;
dev_Vect[index].Position.y = position.y;
dev_Vect[index].Position.z = position.z;
dev_Vect[index].Velocity.x = v.x;
dev_Vect[index].Velocity.y = v.y;
dev_Vect[index].Velocity.z = v.z;
dev_Vect[index].Force.x = 0.0;
dev_Vect[index].Force.y = 0.0;
dev_Vect[index].Force.z = 0.0;
}
__device__ float3 bodyBodyInteraction( float3 fi, float3 bi, float3 bj ){
float3 r;
// r_ij [4 FLOPS]
r.x = ( bj.x - bi.x );
r.y = ( bj.y - bi.y );
r.z = ( bj.z - bi.z );
r.z = 0.0;
// distSqr = dot(r_ij, r_ij) + EPS^2 [7 FLOPS]
float distSqr = r.x * r.x + ( r.y * r.y + ( r.z * r.z + softening_ * softening_ ) );
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
float invDist = rsqrt(distSqr);
float invDistCube = invDist * invDist * invDist;
// s = m_j * invDistCube [2 FLOP]
float s = invDistCube;
// a_i = a_i + s * r_ij [6 FLOPS]
fi.x += r.x * s;
fi.y += r.y * s;
fi.z += r.z * s;
return fi;
}
__device__ float3 computeBodyAccel( float3 force, float3 bodyPos, ParticleProperties * positions, const int numTiles, const int nbody ){
extern __shared__ float3 sharedPos[];
int computedNbody = 0;
for( int tile = 0; tile < numTiles; tile++ ){
sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x].Position;
__syncthreads();
// This is the "tile_calculation" from the GPUG3 article.
#pragma unroll 128
for( unsigned int counter = 0; counter < blockDim.x; counter++ ){
force = bodyBodyInteraction(force, bodyPos, sharedPos[counter]);
computedNbody++;
if( computedNbody == nbody ) break;
}
__syncthreads();
}
return force;
}
__global__ void integrateBodies( ParticleProperties * __restrict__ dev_vector, float deltaTime, int numTiles, int nbody ){
int index = blockIdx.x * blockDim.x + threadIdx.x;
float3 position = {0.0, 0.0, 0.0};
float3 force = {0.0, 0.0, 0.0};
if( index < nbody ){
position = dev_vector[index].Position;
force = computeBodyAccel( force, position, dev_vector, numTiles, nbody );
// store new force
dev_vector[index].Position = position;
dev_vector[index].Force = force;
}
}
__global__ void IntegrationKernel( ParticleProperties * __restrict__ dev_vector, const float deltaT, const int nbody ){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
float3 dvel;
float3 velocity;
if( tid < nbody ){
// integrate
dvel.x = dev_vector[tid].Force.x * deltaT * 0.5;
dvel.y = dev_vector[tid].Force.y * deltaT * 0.5;
dvel.z = dev_vector[tid].Force.z * deltaT * 0.5;
velocity.x = dev_vector[tid].Velocity.x + dvel.x;
velocity.y = dev_vector[tid].Velocity.y + dvel.y;
velocity.z = dev_vector[tid].Velocity.z + dvel.z;
dev_vector[tid].Position.x += velocity.x * deltaT;
dev_vector[tid].Position.y += velocity.y * deltaT;
dev_vector[tid].Position.z += velocity.z * deltaT;
dev_vector[tid].Velocity.x = velocity.x + dvel.x;
dev_vector[tid].Velocity.y = velocity.y + dvel.y;
dev_vector[tid].Velocity.z = velocity.z + dvel.z;
}
}
__global__ void ParticleAmplification( curandState *state, ParticleProperties * __restrict__ dev_vectorIonisation,
ParticleProperties * __restrict__ dev_Ionisation,
const float dt, int numbodies ){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int LocalProcesses = 0;
float3 position = {0.0, 0.0, 0.0};
float4 v_new = {0.0, 0.0, 0.0, 0.0};
float prob = 0.0;
if( TotalProcesses >= El_DIM * El_DIM - 1 ) Ampl = false;
if( tid < numbodies ){
position.x = dev_vectorIonisation[tid].Position.x;
position.y = dev_vectorIonisation[tid].Position.y;
position.z = dev_vectorIonisation[tid].Position.z;
prob = curand_uniform( &state[tid] );
if( Ampl ){
if( prob < 1.e-3 ){
atomicAdd( &TotalProcesses, 1 );
LocalProcesses = atomicAdd( &NewParticles, 1 );
v_new.x = 0.0;
v_new.y = 0.0;
v_new.z = 0.0;
SetVector( dev_Ionisation, position, v_new, LocalProcesses );
}
}
}
}
__global__ void initCurand( curandState *state, unsigned long seed ){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, tid, 0, &state[tid]);
}
__global__ void initProcessIoni( ParticleProperties *dev_Vect ){
int x = threadIdx.x + blockIdx.x * blockDim.x;
initVector( dev_Vect, x );
}
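// Counts the live particles (slots with Position.x > -1.0) from the previous
// step into *N, accumulating through a per-block shared counter.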
__global__ void Enumerate_Nbody( ParticleProperties *dev_Vect, int *N, int PrevNbody ){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int gid = blockIdx.x;
extern __shared__ int cache[];
if( tid == 0 )
*N = 0;
if( threadIdx.x == 0 )
cache[gid] = 0;
__syncthreads();
while( tid < PrevNbody ){
if( dev_Vect[tid].Position.x > -1.0 )
atomicAdd( &(cache[gid]), 1 );
tid += blockDim.x * gridDim.x;
}
__syncthreads();
if( threadIdx.x == 0 )
atomicAdd( N, cache[gid] );
}
void DynamicAlloc( ParticleProperties **DynamicVector, const ParticleProperties *StaticVector, const int n, int nbody, const int max ){
ParticleProperties *h_vectorIonisation = new ParticleProperties [nbody];
ParticleProperties *VectTemporary = new ParticleProperties [n];
checkCudaErrors( cudaMemcpy( VectTemporary, *DynamicVector, n * sizeof(ParticleProperties), cudaMemcpyDeviceToHost ) );
checkCudaErrors( cudaFree( *DynamicVector ) );
int i = 0;
int j = 0;
for( i = 0; i < n; i++ ){
if( VectTemporary[i].Position.x > -1.0 ){
h_vectorIonisation[j] = VectTemporary[i];
j++;
}
}
delete [] VectTemporary;
if( NewParticles != 0 ){
ParticleProperties *StaticVectTemporary = new ParticleProperties [max];
checkCudaErrors( cudaMemcpy( StaticVectTemporary, StaticVector, max * sizeof(ParticleProperties), cudaMemcpyDeviceToHost ) );
int k = 0;
#pragma unroll 32
for( i = 0; i < max; i++ ){
if( StaticVectTemporary[i].Position.x > -1.0 ){
h_vectorIonisation[j + k] = StaticVectTemporary[i];
k++;
}
}
delete [] StaticVectTemporary;
}
if( nbody > 0 ){
checkCudaErrors( cudaMalloc( (void**)DynamicVector, nbody * sizeof(ParticleProperties) ) );
checkCudaErrors( cudaMemcpy( *DynamicVector, h_vectorIonisation, nbody * sizeof(ParticleProperties), cudaMemcpyHostToDevice ) );
}
delete [] h_vectorIonisation;
}
int main( int argc_, char **argv_ ){
cudaDeviceReset();
cudaDeviceProp prop;
checkCudaErrors( cudaGetDeviceProperties( &prop, 0 ) );
int Newers = 256;
int nbody = 1;
Ampl = true;
int *dev_nbody;
checkCudaErrors( cudaMalloc( (void**)&dev_nbody, sizeof(int) ) );
checkCudaErrors( cudaMemcpy( dev_nbody, &nbody, sizeof(int), cudaMemcpyHostToDevice ) );
float dt = 0.5e-13;
float3 pos;
pos.x = 1.0 / 2.0 * 1.0e-3;
pos.y = 1.0 / 2.0 * 1.0e-3;
pos.z = 1.0 / 2.0 * 1.0e-3;
float3 speed;
speed.x = 0.0;
speed.y = 0.0;
speed.z = 0.0;
ParticleProperties *dev_vectorIonisation;
checkCudaErrors( cudaMalloc( (void**)&dev_vectorIonisation, nbody * sizeof(ParticleProperties) ) );
ParticleProperties *host_vectorIonisation = new ParticleProperties [nbody];
clog << "Particles array initialisation...";
for( int i = 0; i < nbody; i++ ){
host_vectorIonisation[i].Position.x = drand48() * 1.0e-6 + pos.x;
host_vectorIonisation[i].Position.y = drand48() * 1.0e-6 + pos.y;
host_vectorIonisation[i].Position.z = 0.0;
host_vectorIonisation[i].Velocity.x = speed.x;
host_vectorIonisation[i].Velocity.y = speed.y;
host_vectorIonisation[i].Velocity.z = speed.z;
host_vectorIonisation[i].Force.x = 0.0;
host_vectorIonisation[i].Force.y = 0.0;
host_vectorIonisation[i].Force.z = 0.0;
}
checkCudaErrors( cudaMemcpy( dev_vectorIonisation, host_vectorIonisation, nbody * sizeof(ParticleProperties), cudaMemcpyHostToDevice ) );
delete [] host_vectorIonisation;
clog << "Done" << endl;
ParticleProperties *dev_Ionisation;
checkCudaErrors( cudaMalloc( (void**)&dev_Ionisation, Newers * sizeof(ParticleProperties) ) );
curandState *RndState;
checkCudaErrors( cudaMalloc( (void**)&RndState, El_DIM * El_DIM * sizeof(curandState) ) );
unsigned long seed = 1773;
clog << "cuRand array initialisation...";
initCurand<<<blocksPerGrid, ThreadsInit>>>( RndState, seed );
initProcessIoni<<<1, Newers>>>( dev_Ionisation );
clog << "Done" << endl;
clog << "Propagation of " << nbody << " primary particle(s)." << endl;
int ProcessTemp = 0;
int nbodyTemp = nbody;
int blocksInit = (nbody + ThreadsInit - 1) / ThreadsInit;
int numTiles = (nbody + ThreadsInit - 1) / ThreadsInit;
size_t sharedMemSize = ThreadsInit * sizeof(float3);
char buffer[64];
setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));
while( nbody > 0 ){
integrateBodies<<<blocksInit, ThreadsInit, sharedMemSize>>>( dev_vectorIonisation, dt, numTiles, nbodyTemp );
IntegrationKernel<<<blocksInit, ThreadsInit>>>( dev_vectorIonisation, dt, nbodyTemp );
ParticleAmplification<<<blocksInit, ThreadsInit>>>( RndState, dev_vectorIonisation, dev_Ionisation, dt, nbodyTemp );
checkCudaErrors( cudaDeviceSynchronize() );
Enumerate_Nbody<<<blocksInit, ThreadsInit, blocksInit * sizeof(int)>>>( dev_vectorIonisation, dev_nbody, nbodyTemp );
checkCudaErrors( cudaDeviceSynchronize() );
getLastCudaError("Kernel enumerate bodies execution failed");
checkCudaErrors( cudaMemcpy( &nbody, dev_nbody, sizeof(int), cudaMemcpyDeviceToHost ) );
nbody += NewParticles;
if( NewParticles > ProcessTemp ) ProcessTemp = NewParticles;
if( nbody != nbodyTemp ){
DynamicAlloc( &dev_vectorIonisation, dev_Ionisation, nbodyTemp, nbody, Newers );
numTiles = blocksInit = ( nbody + ThreadsInit - 1) / ThreadsInit;
if( NewParticles != 0 ){
initProcessIoni<<<1, Newers>>>( dev_Ionisation );
checkCudaErrors( cudaDeviceSynchronize() );
}
nbodyTemp = nbody;
NewParticles = 0;
checkCudaErrors( cudaDeviceSynchronize() );
}
printf("\r nbodies: %d", nbodyTemp);
}
checkCudaErrors( cudaFree( dev_Ionisation ) );
}
This is executed on a GTX Titan Black with compute capability 3.5.
Your problem begins with this line of code (in main):
numTiles = blocksInit = ( nbody + ThreadsInit - 1) / ThreadsInit;
This creates enough tiles to fully cover the size of nbody, but not every tile is completely full of bodies.
The trouble actually shows up in your computeBodyAccel routine, called from integrateBodies:
for( int tile = 0; tile < numTiles; tile++ ){
sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x].Position;
Your indexing into positions there has no protection, and you are assuming that every tile has a valid positions entry for every value of threadIdx.x. That is not the case, and the first manifestation of the problem can be discovered by compiling the code with -lineinfo and running it under cuda-memcheck. Because of the tighter memory checking cuda-memcheck enforces, your code failed (for me) at around 500 bodies instead of 28000. The specific failure is an invalid global read of size 4 on the last line of code shown above. (So it is not an indexing problem associated with the write to shared memory.) Fundamentally, the problem is that tile*blockDim.x + threadIdx.x can exceed nbody, at which point you index out-of-bounds reading positions. (Using -lineinfo to identify the specific failing line of kernel code is covered here.)
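To put numbers on it (using the launch parameters from the question): with ThreadsInit = 512 and nbody around 28000, numTiles = (28000 + 511) / 512 = 55, so the tile loop covers 55 * 512 = 28160 slots. The last 160 slots of the final tile have no corresponding body, yet every thread still reads positions[tile * blockDim.x + threadIdx.x] for them.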
The following limit-checking modification to your computeBodyAccel routine allowed me to run your code up to about 262,000 bodies, at which point the count stops increasing (due to the Ampl cutoff at El_DIM*El_DIM) and stays there:
__device__ float3 computeBodyAccel( float3 force, float3 bodyPos, ParticleProperties * positions, const int numTiles, const int nbody ){
extern __shared__ float3 sharedPos[];
int computedNbody = 0;
for( int tile = 0; tile < numTiles; tile++ ){
if ((tile*blockDim.x + threadIdx.x) < nbody)
sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x].Position;
__syncthreads();
// This is the "tile_calculation" from the GPUG3 article.
int limit = blockDim.x;
if (tile == (numTiles - 1)) limit -= (numTiles*blockDim.x)-nbody;
#pragma unroll 128
for( unsigned int counter = 0; counter < limit; counter++ ){
force = bodyBodyInteraction(force, bodyPos, sharedPos[counter]);
computedNbody++;
if( computedNbody == nbody ) break;
}
__syncthreads();
}
return force;
}
There appears to be one more problem in your code, even though it seems to run with the above fix. If you run the code (with the above "fix") under cuda-memcheck, using the -lineinfo method described here, you will find (due to the tighter bounds checking cuda-memcheck enforces) that once the number of bodies gets large, you eventually hit another memory access error in ParticleAmplification, when it tries to create a new particle and calls SetVector at the end. There appears to be a race condition between this line:
if( TotalProcesses >= El_DIM * El_DIM - 1 ) Ampl = false;
and the following lines, which may be incrementing TotalProcesses and NewParticles (captured in LocalProcesses) concurrently:
atomicAdd( &TotalProcesses, 1 );
LocalProcesses = atomicAdd( &NewParticles, 1 );
Because you have many threads running in parallel, this type of limit checking is useless. You need to manage the creation of new particles more carefully, by checking the actual return values of the atomicAdd operations and testing whether they exceed the limits.
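A minimal sketch of that reserve-then-test pattern (assuming the capacity of dev_Ionisation, i.e. Newers in main, is passed into ParticleAmplification as an extra parameter, here called capacity; the original signature does not have it):
// Hypothetical rework of the creation branch in ParticleAmplification().
// Reserve slots first via the atomicAdd return values, and only commit the
// particle when BOTH reservations are within their limits, so that no thread
// ever calls SetVector() with an out-of-bounds index.
if( Ampl && prob < 1.e-3 ){
    int total = atomicAdd( &TotalProcesses, 1 ); // value before my increment
    int slot  = atomicAdd( &NewParticles, 1 );   // my slot in dev_Ionisation
    if( total < El_DIM * El_DIM - 1 && slot < capacity ){
        SetVector( dev_Ionisation, position, v_new, slot );
    }
    // Threads whose reservation came back out of range simply skip the write;
    // the host should then clamp NewParticles to capacity before DynamicAlloc()
    // copies entries out of dev_Ionisation.
}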