CUDA 中的多个内核调用
Multiple kernel calls in CUDA
我试图在 CUDA 上多次调用同一个内核(使用一个不同的输入参数),但它只执行第一个内核并且不会执行其他内核调用。
假设输入数组是
new_value0=[123.814935276; 234; 100; 166; 203.0866414; 383; 186; 338; 173.0984233]
和
new_value1=[186.221113; 391; 64; 235; 195.7454998; 275; 218; 121; 118.0333872]
部分输出是:
entra
entra
entra
334
549
524
alpha1.000000
alpha1.000000
alpha1.000000
in 2 idx-j 0-0 Value 123.814934 - m=334 - k=0
mlx -1618.175171
in 1 idx-j 0-1 Value 234.000000 - m=334 k=1
mlx -571.983032
in 1 idx-j 0-2 Value 100.000000 - m=334 k=2
mlx -208.243652
in 1 idx-j 1-0 Value 166.000000 - m=549 k=3
mlx 477.821777
in 2 idx-j 1-1 Value 203.086639 - m=549 - k=4
mlx -2448.556396
in 1 idx-j 1-2 Value 383.000000 - m=549 k=5
mlx -549.565674
in 1 idx-j 2-0 Value 186.000000 - m=524 k=6
mlx 239.955444
in 1 idx-j 2-1 Value 338.000000 - m=524 k=7
mlx 1873.975708
in 2 idx-j 2-2 Value 173.098419 - m=524 - k=8
mlx -835.600220
mlx =-835.600220
bs = -835.600220 .
esci
esci
esci
它来自第一次内核调用。
这是内核:
__global__ void calculateMLpa( int N, float *bs, float *Value, float alphaxixj, float tauxi, const int sz, int dim, int *m){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
printf("entra\n");
if(idx<N){
bs[idx]=0;
int i,k=0;
float mlx = 0;
float v;
float alphaxi;
m[idx]=0;
int state[9];
int p, j, t;
int cont=0;
if(idx==0){
m[idx]=Value[idx+1]+Value[idx+2];
}
else if(idx==1){
m[idx]=Value[idx+2]+Value[idx+4];
}else{
m[idx]=Value[idx+4]+Value[idx+5];
}
printf("%d \n",m[idx]);
alphaxi = alphaxixj * (((float) sz) - 1.0);
alphaxi = alphaxixj;
printf("alpha%f \n",alphaxi);
if(idx==0){
for(i=0;i<sz;i++){
for (j = 0; j < sz; j++) {
// xi!=xj
if (i!=j){
if(j==0) {
k=i*3;
}
else if(j==1){
k=i*3+1;
}
else if(j==2) {
k=i*3+2;
}
mlx = mlx + lgamma(alphaxixj + Value[k]) - lgamma(alphaxixj);
printf("in 1 idx-j %d-%d Value %f - m=%d k=%d \n",i,j,Value[k],m[i],k);
printf(" mlx %f \n",mlx);
//k++;
}
// xi
else {
if(j==0) {
k=i*3;
}
else if(j==1){
k=i*3+1;
}
else if(j==2) {
k=i*3+2;
}
mlx = mlx + lgamma(alphaxi) - lgamma(alphaxi + m[i]);
mlx = mlx + lgamma(alphaxi + m[i] + 1.0)+ (alphaxi + 1.0) * log(tauxi);
mlx = mlx - lgamma(alphaxi + 1.0)- (alphaxi + m[i] + 1.0) * log(tauxi + Value[k]);
printf("in 2 idx-j %d-%d Value %f - m=%d - k=%d \n",i,j,Value[k],m[i],k);
printf(" mlx %f \n",mlx);
//k++;
}
}
}
printf("mlx =%f \n",mlx);
bs[idx]=mlx;
printf("bs = %f .\n",bs[idx]);
}
}
printf("esci\n");
}
代码如下:
int main (void){
printf("START");
FILE *pf;
const int N=9;
char fName[2083];
char *parents[3]={"0","1","2"};
char *traject[9]={"0-0","0-1","0-2","1-0","1-1","1-2","2-0","2-1","2-2"};
size_t parents_len;
size_t traject_len;
parents_len=sizeof(char)/sizeof(parents[0]);
traject_len=sizeof(char)/sizeof(traject[0]);
//possibile malloc
//pointer host to memory
char **parents_dev;
char **traject_dev;
//allocate on device
cudaMalloc((void **)&parents_dev,sizeof(char**)*parents_len);
cudaMalloc((void **)&traject_dev,sizeof(char**)*traject_len);
//host to Device
cudaMemcpy(parents_dev,parents,sizeof(char**)*parents_len,cudaMemcpyHostToDevice);
cudaMemcpy(traject_dev,traject,sizeof(char**)*traject_len,cudaMemcpyHostToDevice);
//Loop start
int file,Epoca;
float *bs;
float *bs_dev;
int file_size0=28;
int file_size1=55;
int file_size3=109;
//size_t size = N * sizeof(float);
bs=(float *)malloc(N * sizeof(float));
cudaMalloc((void **)&bs_dev, N * sizeof(float));
float *new_value0,*new_value0_dev;
new_value0=(float *)malloc(file_size0*N/3);
cudaMalloc((void **)&new_value0_dev, N * file_size0/3);
//
float *new_value1,*new_value1_dev;
new_value1=(float *)malloc(file_size0*N/3);
cudaMalloc((void **)&new_value1_dev, N * file_size0/3);
//
float *new_value2,*new_value2_dev;
new_value2=(float *)malloc(file_size0*N/3);
cudaMalloc((void **)&new_value2_dev, N * file_size0/3);
//
//one parent 1,2
float *new_value00,*new_value00_dev;
new_value00=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value00_dev, N * file_size1/6);
//
float *new_value01,*new_value01_dev;
new_value01=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value01_dev, N * file_size1/6);
//
float *new_value10,*new_value10_dev;
new_value10=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value10_dev, N * file_size1/6);
//
float *new_value11,*new_value11_dev;
new_value11=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value11_dev, N * file_size1/6);
//
float *new_value20,*new_value20_dev;
new_value20=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value20_dev, N * file_size1/6);
//
float *new_value21,*new_value21_dev;
new_value21=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value21_dev, N * file_size1/6);
//
//double parent
float *new_value000,*new_value000_dev;
new_value000=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value000_dev, N * file_size3/12);
//
float *new_value001,*new_value001_dev;
new_value001=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value001_dev, N * file_size3/12);
//
float *new_value010,*new_value010_dev;
new_value010=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value010_dev, N * file_size3/12);
//
float *new_value011,*new_value011_dev;
new_value011=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value011_dev, N * file_size3/12);
//
float *new_value100,*new_value100_dev;
new_value100=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value100_dev, N * file_size3/12);
//
float *new_value101,*new_value101_dev;
new_value101=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value101_dev, N * file_size3/12);
//
float *new_value110,*new_value110_dev;
new_value110=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value110_dev, N * file_size3/12);
//
float *new_value111,*new_value111_dev;
new_value111=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value111_dev, N * file_size3/12);
//
float *new_value200,*new_value200_dev;
new_value200=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value200_dev, N * file_size3/12);
//
float *new_value201,*new_value201_dev;
new_value201=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value201_dev, N * file_size3/12);
//
float *new_value210,*new_value210_dev;
new_value210=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value210_dev, N * file_size3/12);
//
float *new_value211,*new_value211_dev;
new_value211=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value211_dev, N * file_size3/12);
//int file;
for(file=0;file<4;file++){
int f, i, j, file_size=0, kk=0;
//file IO
sprintf(fName, "//home//user//prova%d.csv",file);
pf=fopen(fName,"r");
char *X;
char *PaX;
int Time;
char *pa;
char *xixj;
float val;
char buffer[BUFSIZ], *ptr;
if (pf)
{
/*
* Read each line from the file.
*/
while(fgets(buffer, sizeof buffer, pf)){
file_size++;
}
fclose(pf);
}
//variabile per kernel
float *Value, *Value_dev;
Value=(float *)malloc(file_size*N);
cudaMalloc((void **)&Value_dev, N * file_size);
//
pf=fopen(fName,"r");
if(pf)
{
printf("\nnumero righe file %d = %d\n",file,file_size);
char *state[file_size];
while(fgets(buffer, sizeof buffer, pf))
{
//printf("start csv \n");
char *token;
char *ptr = buffer;
const char end[2]=",";//fgets(buffer, sizeof buffer, pf);
token = strtok(ptr, end);
f=0;
/* walk through other tokens */
while( token != NULL )
{
if(f==0){
X=token;
// printf( "X %s\n", token );
}else if(f==1){
PaX=token;
// printf( "PaX %s\n", token );
}
else if(f==2){
Time=strtod(token,NULL);
// printf( "Time %f \n", token );
}
else if(f==3){
pa=token;
// printf( "pa %s \n", token );
}
else if(f==4){
xixj=(token);
// printf( "xixj %s \n", token );
}
else{
Value[kk]=strtod(&token[1], NULL);
// printf("Value %f \n", Value[kk]);
kk++;
}
token = strtok(NULL, end);
f++;
}
}
//
//insert in variable
if (file==0){
for (i=0;i<(file_size0-1)/3;++i){
new_value0[i]=Value[i+1];
cudaMemcpy(new_value0_dev,new_value0,N*sizeof(file_size0), cudaMemcpyHostToDevice);
new_value1[i]=Value[i + 1+((file_size0-1)/3)];
cudaMemcpy(new_value1_dev,new_value1,N*sizeof(file_size0), cudaMemcpyHostToDevice);
new_value2[i]=Value[i + (1+ 2*(file_size0-1)/3)];
cudaMemcpy(new_value2_dev,new_value2,N*sizeof(file_size0), cudaMemcpyHostToDevice);
// printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
}
}else if(file==1 || file==2){
for (i=0; i<(file_size1-1)/6;++i)
{
new_value00[i]=Value[i+1];
cudaMemcpy(new_value00_dev,new_value00,N*sizeof(file_size0), cudaMemcpyHostToDevice);
new_value01[i]=Value[i+ ((file_size0-1)/3)+1];
cudaMemcpy(new_value01_dev,new_value01,N*sizeof(file_size1), cudaMemcpyHostToDevice);
new_value10[i]=Value[i+ (2*(file_size1-1)/6)+1];
cudaMemcpy(new_value10_dev,new_value10,N*sizeof(file_size1), cudaMemcpyHostToDevice);
new_value11[i]=Value[i+ (3*(file_size1-1)/6)+1];
cudaMemcpy(new_value11_dev,new_value11,N*sizeof(file_size1), cudaMemcpyHostToDevice);
new_value20[i]=Value[i+ (4*(file_size1-1)/6)+1];
cudaMemcpy(new_value20_dev,new_value20,N*sizeof(file_size1), cudaMemcpyHostToDevice);
new_value21[i]=Value[i+ (5*(file_size1-1)/6)+1];
cudaMemcpy(new_value21_dev,new_value21,N*sizeof(file_size1), cudaMemcpyHostToDevice);
// printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
}
}else{
for (i=0; i<(file_size3-1)/12;++i)
{
new_value000[i]=Value[i+1];
cudaMemcpy(new_value000_dev,new_value000,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value001[i]=Value[i+ ((file_size3-1)/12)+1];
cudaMemcpy(new_value001_dev,new_value001,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value010[i]=Value[i+ (2*(file_size3-1)/12)+1];
cudaMemcpy(new_value010_dev,new_value010,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value011[i]=Value[i+ (3*(file_size3-1)/12)+1];
cudaMemcpy(new_value011_dev,new_value011,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value100[i]=Value[i+ (4*(file_size3-1)/12)+1];
cudaMemcpy(new_value100_dev,new_value100,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value101[i]=Value[i+ (5*(file_size3-1)/12)+1];
cudaMemcpy(new_value101_dev,new_value101,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value110[i]=Value[i+ (6*(file_size3-1)/12)+1];
cudaMemcpy(new_value110_dev,new_value110,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value111[i]=Value[i+ (7*(file_size3-1)/12)+1];
cudaMemcpy(new_value111_dev,new_value111,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value200[i]=Value[i+ (8*(file_size3-1)/12)+1];
cudaMemcpy(new_value200_dev,new_value200,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value201[i]=Value[i+ (9*(file_size3-1)/12)+1];
cudaMemcpy(new_value201_dev,new_value201,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value210[i]=Value[i+ (10*(file_size3-1)/12)+1];
cudaMemcpy(new_value210_dev,new_value210,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value211[i]=Value[i+ (11*(file_size3-1)/12)+1];
cudaMemcpy(new_value211_dev,new_value211,N*sizeof(file_size3), cudaMemcpyHostToDevice);
// printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
}
}
}
}
//cudaMemcpy(Value_dev,Value,N*sizeof(file_size), cudaMemcpyHostToDevice);
//variable of kernel
//no parent
//START computation
printf("\nPRE KERNEL\n");
const int sz=(sizeof(parents)/sizeof(*(parents)));
const int dim=(sizeof(traject)/sizeof(*(traject)));
printf("%d - %d \n",sz, dim);
//chiamata kernel
int block_size = 3;
int n_blocks =1 ;
int *m, *m_dev;
m=(int *)malloc(sz*N);
cudaMalloc((void **)&m_dev, N * sz);
float *trns_dev;
cudaMalloc((void **)&trns_dev, N * dim);
int i;
for(i=0;i<(file_size0-1)/3;i++){
printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
}
printf("\n");
for(i=0;i<(file_size1-1)/6;i++){
printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
}
printf("\n");
for(i=0;i<(file_size3-1)/12;i++){
printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
}
for(Epoca=0; Epoca<3; Epoca++){
bs=0;
float bf=0;
cudaMalloc((void **)&bf, N * sz);
cudaMemcpy(bs_dev,bs,N*sizeof(float), cudaMemcpyHostToDevice);
if(Epoca==0){
calculateMLpa<<<n_blocks, block_size >>>(N,bs_dev,new_value0_dev,1.0,0.1,sz,dim,m_dev);
cudaDeviceSynchronize();
cudaMemcpy(bs,bs_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(m,m_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
bf =+ bs[0];
printf("score= %f m0 = %d, m1 = %d, m2 = %d \n\n", bf, m[0], m[1], m[2]);
calculateMLpa<<<n_blocks, block_size >>>(N,bs_dev,new_value00_dev,1.0,0.1,sz,dim,m_dev);
cudaDeviceSynchronize();
cudaMemcpy(bs,bs_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(m,m_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
bf =+ bs[0];
printf("score= %f \n", bf);
}
printf("score %d= %f \n",Epoca, bf);
}
free(bs_dev);
}
我认为我可以将它与流并行化,但我以前从未使用过它。我看了 this 开始。
听起来你应该使用并行 CUDA streams。
一个有趣的选项:
CUDA 7 introduces a new option, the per-thread default stream, that
has two effects. First, it gives each host thread its own default
stream. This means that commands issued to the default stream by
different host threads can run concurrently.
同样值得一提的是:
As described by the CUDA C Programming Guide, asynchronous commands
return control to the calling host thread before the device has
finished the requested task (they are non-blocking). These commands
are:
Kernel launches; Memory copies between two addresses to the same
device memory; Memory copies from host to device of a memory block of
64 KB or less; Memory copies performed by functions with the Async
suffix; Memory set function calls.
我试图在 CUDA 上多次调用同一个内核(使用一个不同的输入参数),但它只执行第一个内核并且不会执行其他内核调用。
假设输入数组是
new_value0=[123.814935276; 234; 100; 166; 203.0866414; 383; 186; 338; 173.0984233]
和
new_value1=[186.221113; 391; 64; 235; 195.7454998; 275; 218; 121; 118.0333872]
部分输出是:
entra
entra
entra
334
549
524
alpha1.000000
alpha1.000000
alpha1.000000
in 2 idx-j 0-0 Value 123.814934 - m=334 - k=0
mlx -1618.175171
in 1 idx-j 0-1 Value 234.000000 - m=334 k=1
mlx -571.983032
in 1 idx-j 0-2 Value 100.000000 - m=334 k=2
mlx -208.243652
in 1 idx-j 1-0 Value 166.000000 - m=549 k=3
mlx 477.821777
in 2 idx-j 1-1 Value 203.086639 - m=549 - k=4
mlx -2448.556396
in 1 idx-j 1-2 Value 383.000000 - m=549 k=5
mlx -549.565674
in 1 idx-j 2-0 Value 186.000000 - m=524 k=6
mlx 239.955444
in 1 idx-j 2-1 Value 338.000000 - m=524 k=7
mlx 1873.975708
in 2 idx-j 2-2 Value 173.098419 - m=524 - k=8
mlx -835.600220
mlx =-835.600220
bs = -835.600220 .
esci
esci
esci
它来自第一次内核调用。
这是内核:
__global__ void calculateMLpa( int N, float *bs, float *Value, float alphaxixj, float tauxi, const int sz, int dim, int *m){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
printf("entra\n");
if(idx<N){
bs[idx]=0;
int i,k=0;
float mlx = 0;
float v;
float alphaxi;
m[idx]=0;
int state[9];
int p, j, t;
int cont=0;
if(idx==0){
m[idx]=Value[idx+1]+Value[idx+2];
}
else if(idx==1){
m[idx]=Value[idx+2]+Value[idx+4];
}else{
m[idx]=Value[idx+4]+Value[idx+5];
}
printf("%d \n",m[idx]);
alphaxi = alphaxixj * (((float) sz) - 1.0);
alphaxi = alphaxixj;
printf("alpha%f \n",alphaxi);
if(idx==0){
for(i=0;i<sz;i++){
for (j = 0; j < sz; j++) {
// xi!=xj
if (i!=j){
if(j==0) {
k=i*3;
}
else if(j==1){
k=i*3+1;
}
else if(j==2) {
k=i*3+2;
}
mlx = mlx + lgamma(alphaxixj + Value[k]) - lgamma(alphaxixj);
printf("in 1 idx-j %d-%d Value %f - m=%d k=%d \n",i,j,Value[k],m[i],k);
printf(" mlx %f \n",mlx);
//k++;
}
// xi
else {
if(j==0) {
k=i*3;
}
else if(j==1){
k=i*3+1;
}
else if(j==2) {
k=i*3+2;
}
mlx = mlx + lgamma(alphaxi) - lgamma(alphaxi + m[i]);
mlx = mlx + lgamma(alphaxi + m[i] + 1.0)+ (alphaxi + 1.0) * log(tauxi);
mlx = mlx - lgamma(alphaxi + 1.0)- (alphaxi + m[i] + 1.0) * log(tauxi + Value[k]);
printf("in 2 idx-j %d-%d Value %f - m=%d - k=%d \n",i,j,Value[k],m[i],k);
printf(" mlx %f \n",mlx);
//k++;
}
}
}
printf("mlx =%f \n",mlx);
bs[idx]=mlx;
printf("bs = %f .\n",bs[idx]);
}
}
printf("esci\n");
}
代码如下:
int main (void){
printf("START");
FILE *pf;
const int N=9;
char fName[2083];
char *parents[3]={"0","1","2"};
char *traject[9]={"0-0","0-1","0-2","1-0","1-1","1-2","2-0","2-1","2-2"};
size_t parents_len;
size_t traject_len;
parents_len=sizeof(char)/sizeof(parents[0]);
traject_len=sizeof(char)/sizeof(traject[0]);
//possibile malloc
//pointer host to memory
char **parents_dev;
char **traject_dev;
//allocate on device
cudaMalloc((void **)&parents_dev,sizeof(char**)*parents_len);
cudaMalloc((void **)&traject_dev,sizeof(char**)*traject_len);
//host to Device
cudaMemcpy(parents_dev,parents,sizeof(char**)*parents_len,cudaMemcpyHostToDevice);
cudaMemcpy(traject_dev,traject,sizeof(char**)*traject_len,cudaMemcpyHostToDevice);
//Loop start
int file,Epoca;
float *bs;
float *bs_dev;
int file_size0=28;
int file_size1=55;
int file_size3=109;
//size_t size = N * sizeof(float);
bs=(float *)malloc(N * sizeof(float));
cudaMalloc((void **)&bs_dev, N * sizeof(float));
float *new_value0,*new_value0_dev;
new_value0=(float *)malloc(file_size0*N/3);
cudaMalloc((void **)&new_value0_dev, N * file_size0/3);
//
float *new_value1,*new_value1_dev;
new_value1=(float *)malloc(file_size0*N/3);
cudaMalloc((void **)&new_value1_dev, N * file_size0/3);
//
float *new_value2,*new_value2_dev;
new_value2=(float *)malloc(file_size0*N/3);
cudaMalloc((void **)&new_value2_dev, N * file_size0/3);
//
//one parent 1,2
float *new_value00,*new_value00_dev;
new_value00=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value00_dev, N * file_size1/6);
//
float *new_value01,*new_value01_dev;
new_value01=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value01_dev, N * file_size1/6);
//
float *new_value10,*new_value10_dev;
new_value10=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value10_dev, N * file_size1/6);
//
float *new_value11,*new_value11_dev;
new_value11=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value11_dev, N * file_size1/6);
//
float *new_value20,*new_value20_dev;
new_value20=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value20_dev, N * file_size1/6);
//
float *new_value21,*new_value21_dev;
new_value21=(float *)malloc(file_size1*N/6);
cudaMalloc((void **)&new_value21_dev, N * file_size1/6);
//
//double parent
float *new_value000,*new_value000_dev;
new_value000=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value000_dev, N * file_size3/12);
//
float *new_value001,*new_value001_dev;
new_value001=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value001_dev, N * file_size3/12);
//
float *new_value010,*new_value010_dev;
new_value010=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value010_dev, N * file_size3/12);
//
float *new_value011,*new_value011_dev;
new_value011=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value011_dev, N * file_size3/12);
//
float *new_value100,*new_value100_dev;
new_value100=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value100_dev, N * file_size3/12);
//
float *new_value101,*new_value101_dev;
new_value101=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value101_dev, N * file_size3/12);
//
float *new_value110,*new_value110_dev;
new_value110=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value110_dev, N * file_size3/12);
//
float *new_value111,*new_value111_dev;
new_value111=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value111_dev, N * file_size3/12);
//
float *new_value200,*new_value200_dev;
new_value200=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value200_dev, N * file_size3/12);
//
float *new_value201,*new_value201_dev;
new_value201=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value201_dev, N * file_size3/12);
//
float *new_value210,*new_value210_dev;
new_value210=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value210_dev, N * file_size3/12);
//
float *new_value211,*new_value211_dev;
new_value211=(float *)malloc(file_size3*N/12);
cudaMalloc((void **)&new_value211_dev, N * file_size3/12);
//int file;
for(file=0;file<4;file++){
int f, i, j, file_size=0, kk=0;
//file IO
sprintf(fName, "//home//user//prova%d.csv",file);
pf=fopen(fName,"r");
char *X;
char *PaX;
int Time;
char *pa;
char *xixj;
float val;
char buffer[BUFSIZ], *ptr;
if (pf)
{
/*
* Read each line from the file.
*/
while(fgets(buffer, sizeof buffer, pf)){
file_size++;
}
fclose(pf);
}
//variabile per kernel
float *Value, *Value_dev;
Value=(float *)malloc(file_size*N);
cudaMalloc((void **)&Value_dev, N * file_size);
//
pf=fopen(fName,"r");
if(pf)
{
printf("\nnumero righe file %d = %d\n",file,file_size);
char *state[file_size];
while(fgets(buffer, sizeof buffer, pf))
{
//printf("start csv \n");
char *token;
char *ptr = buffer;
const char end[2]=",";//fgets(buffer, sizeof buffer, pf);
token = strtok(ptr, end);
f=0;
/* walk through other tokens */
while( token != NULL )
{
if(f==0){
X=token;
// printf( "X %s\n", token );
}else if(f==1){
PaX=token;
// printf( "PaX %s\n", token );
}
else if(f==2){
Time=strtod(token,NULL);
// printf( "Time %f \n", token );
}
else if(f==3){
pa=token;
// printf( "pa %s \n", token );
}
else if(f==4){
xixj=(token);
// printf( "xixj %s \n", token );
}
else{
Value[kk]=strtod(&token[1], NULL);
// printf("Value %f \n", Value[kk]);
kk++;
}
token = strtok(NULL, end);
f++;
}
}
//
//insert in variable
if (file==0){
for (i=0;i<(file_size0-1)/3;++i){
new_value0[i]=Value[i+1];
cudaMemcpy(new_value0_dev,new_value0,N*sizeof(file_size0), cudaMemcpyHostToDevice);
new_value1[i]=Value[i + 1+((file_size0-1)/3)];
cudaMemcpy(new_value1_dev,new_value1,N*sizeof(file_size0), cudaMemcpyHostToDevice);
new_value2[i]=Value[i + (1+ 2*(file_size0-1)/3)];
cudaMemcpy(new_value2_dev,new_value2,N*sizeof(file_size0), cudaMemcpyHostToDevice);
// printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
}
}else if(file==1 || file==2){
for (i=0; i<(file_size1-1)/6;++i)
{
new_value00[i]=Value[i+1];
cudaMemcpy(new_value00_dev,new_value00,N*sizeof(file_size0), cudaMemcpyHostToDevice);
new_value01[i]=Value[i+ ((file_size0-1)/3)+1];
cudaMemcpy(new_value01_dev,new_value01,N*sizeof(file_size1), cudaMemcpyHostToDevice);
new_value10[i]=Value[i+ (2*(file_size1-1)/6)+1];
cudaMemcpy(new_value10_dev,new_value10,N*sizeof(file_size1), cudaMemcpyHostToDevice);
new_value11[i]=Value[i+ (3*(file_size1-1)/6)+1];
cudaMemcpy(new_value11_dev,new_value11,N*sizeof(file_size1), cudaMemcpyHostToDevice);
new_value20[i]=Value[i+ (4*(file_size1-1)/6)+1];
cudaMemcpy(new_value20_dev,new_value20,N*sizeof(file_size1), cudaMemcpyHostToDevice);
new_value21[i]=Value[i+ (5*(file_size1-1)/6)+1];
cudaMemcpy(new_value21_dev,new_value21,N*sizeof(file_size1), cudaMemcpyHostToDevice);
// printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
}
}else{
for (i=0; i<(file_size3-1)/12;++i)
{
new_value000[i]=Value[i+1];
cudaMemcpy(new_value000_dev,new_value000,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value001[i]=Value[i+ ((file_size3-1)/12)+1];
cudaMemcpy(new_value001_dev,new_value001,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value010[i]=Value[i+ (2*(file_size3-1)/12)+1];
cudaMemcpy(new_value010_dev,new_value010,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value011[i]=Value[i+ (3*(file_size3-1)/12)+1];
cudaMemcpy(new_value011_dev,new_value011,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value100[i]=Value[i+ (4*(file_size3-1)/12)+1];
cudaMemcpy(new_value100_dev,new_value100,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value101[i]=Value[i+ (5*(file_size3-1)/12)+1];
cudaMemcpy(new_value101_dev,new_value101,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value110[i]=Value[i+ (6*(file_size3-1)/12)+1];
cudaMemcpy(new_value110_dev,new_value110,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value111[i]=Value[i+ (7*(file_size3-1)/12)+1];
cudaMemcpy(new_value111_dev,new_value111,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value200[i]=Value[i+ (8*(file_size3-1)/12)+1];
cudaMemcpy(new_value200_dev,new_value200,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value201[i]=Value[i+ (9*(file_size3-1)/12)+1];
cudaMemcpy(new_value201_dev,new_value201,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value210[i]=Value[i+ (10*(file_size3-1)/12)+1];
cudaMemcpy(new_value210_dev,new_value210,N*sizeof(file_size3), cudaMemcpyHostToDevice);
new_value211[i]=Value[i+ (11*(file_size3-1)/12)+1];
cudaMemcpy(new_value211_dev,new_value211,N*sizeof(file_size3), cudaMemcpyHostToDevice);
// printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
}
}
}
}
//cudaMemcpy(Value_dev,Value,N*sizeof(file_size), cudaMemcpyHostToDevice);
//variable of kernel
//no parent
//START computation
printf("\nPRE KERNEL\n");
const int sz=(sizeof(parents)/sizeof(*(parents)));
const int dim=(sizeof(traject)/sizeof(*(traject)));
printf("%d - %d \n",sz, dim);
//chiamata kernel
int block_size = 3;
int n_blocks =1 ;
int *m, *m_dev;
m=(int *)malloc(sz*N);
cudaMalloc((void **)&m_dev, N * sz);
float *trns_dev;
cudaMalloc((void **)&trns_dev, N * dim);
int i;
for(i=0;i<(file_size0-1)/3;i++){
printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
}
printf("\n");
for(i=0;i<(file_size1-1)/6;i++){
printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
}
printf("\n");
for(i=0;i<(file_size3-1)/12;i++){
printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
}
for(Epoca=0; Epoca<3; Epoca++){
bs=0;
float bf=0;
cudaMalloc((void **)&bf, N * sz);
cudaMemcpy(bs_dev,bs,N*sizeof(float), cudaMemcpyHostToDevice);
if(Epoca==0){
calculateMLpa<<<n_blocks, block_size >>>(N,bs_dev,new_value0_dev,1.0,0.1,sz,dim,m_dev);
cudaDeviceSynchronize();
cudaMemcpy(bs,bs_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(m,m_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
bf =+ bs[0];
printf("score= %f m0 = %d, m1 = %d, m2 = %d \n\n", bf, m[0], m[1], m[2]);
calculateMLpa<<<n_blocks, block_size >>>(N,bs_dev,new_value00_dev,1.0,0.1,sz,dim,m_dev);
cudaDeviceSynchronize();
cudaMemcpy(bs,bs_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(m,m_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
bf =+ bs[0];
printf("score= %f \n", bf);
}
printf("score %d= %f \n",Epoca, bf);
}
free(bs_dev);
}
我认为我可以将它与流并行化,但我以前从未使用过它。我看了 this 开始。
听起来你应该使用并行 CUDA streams。
一个有趣的选项:
CUDA 7 introduces a new option, the per-thread default stream, that has two effects. First, it gives each host thread its own default stream. This means that commands issued to the default stream by different host threads can run concurrently.
同样值得一提的是:
As described by the CUDA C Programming Guide, asynchronous commands return control to the calling host thread before the device has finished the requested task (they are non-blocking). These commands are:
Kernel launches; Memory copies between two addresses to the same device memory; Memory copies from host to device of a memory block of 64 KB or less; Memory copies performed by functions with the Async suffix; Memory set function calls.