What is a correct way to implement memcpy inside a CUDA kernel?
I am implementing a PDE solver (Lax-Friedrichs) in CUDA that I had previously written in C. Please find the C code below:
void solve(int M, double u[M+3][M+3], double unp1[M+3][M+3], double params[3]){
int i;
int j;
int n;
for (n=0; n<params[0]; n++){
for (i=0; i<M+2; i++)
for(j=0; j<M+2; j++){
unp1[i][j] = 0.25*(u[i+1][j] + u[i-1][j] + u[i][j+1] + u[i][j-1])
- params[1]*(u[i+1][j] - u[i-1][j])
- params[2]*(u[i][j+1] - u[i][j-1]);
}
memcpy(u, unp1, pow(M+3,2)*sizeof(double));
/*Periodic Boundary Conditions*/
for (i=0; i<M+2; i++){
u[0][i] = u[N+1][i];
u[i][0] = u[i][N+1];
u[N+2][i] = u[1][i];
u[i][N+2] = u[i][1];
}
}
}
It works fine. But when I try to implement it in CUDA, I do not get the same data. Unfortunately, I cannot pinpoint the exact problem, since I am a beginner with this whole parallel programming thing, but I think it might have to do with the u[i*(N+3) + j] = unp1[i*(N+3) + j] in the solver, because I cannot really do a memcpy inside the kernel: it did not change anything, and I do not know how to proceed. I took a look at This previous answer, but unfortunately it could not solve my problem. Here is the CUDA solver I am trying to code:
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
#include <algorithm>
/*Configuration of the grid*/
const int N = 100; //Number of nodes
const double xmin = -1.0;
const double ymin = -1.0;
const double xmax = 1.0;
const double ymax = 1.0;
const double tmax = 0.5;
/*Configuration of the simulation physics*/
const double dx = (xmax - xmin)/N;
const double dy = (ymax - ymin)/N;
const double dt = 0.009;
const double vx = 1.0;
const double vy = 1.0;
__global__ void initializeDomain(double *x, double *y){
/*Initializes the grid of size (N+3)x(N+3) to better accommodate Boundary Conditions*/
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int j=index ; j<N+3; j+=stride){
x[j] = xmin + (j-1)*dx;
y[j] = ymin + (j-1)*dy;
}
}
__global__ void initializeU(double *x, double *y, double *u0){
double sigma_x = 2.0;
double sigma_y = 6.0;
int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = blockDim.x * gridDim.x;
int index_y = blockIdx.y * blockDim.y + threadIdx.y;
int stride_y = blockDim.y * gridDim.y;
for (int i = index_x; i < N+3; i += stride_x)
for (int j = index_y; j < N+3; j+= stride_y){
u0[i*(N+3) + j] = exp(-200*(pow(x[i],2)/(2*pow(sigma_x,2)) + pow(y[j],2)/(2*pow(sigma_y,2))));
u0[i*(N+3) + j] *= 1/(2*M_PI*sigma_x*sigma_y);
//u[i*(N+3) + j] = u0[i*(N+3) + j];
//unp1[i*(N+3) + j] = u0[i*(N+3) + j];
}
}
void initializeParams(double params[3]){
params[0] = round(tmax/dt);
params[1] = vx*dt/(2*dx);
params[2] = vy*dt/(2*dy);
}
__global__ void solve(double *u, double *unp1, double params[3]){
int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = blockDim.x * gridDim.x;
int index_y = blockIdx.y * blockDim.y + threadIdx.y;
int stride_y = blockDim.y * gridDim.y;
for (int i = index_x; i < N+2; i += stride_x)
for (int j = index_y; j < N+2; j += stride_y){
unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)]) \
- params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j]) \
- params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
}
}
__global__ void bc(double *u){
int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = blockDim.x * gridDim.x;
/*Boundary conditions are also applied in parallel*/
for (int i = index_x; i < N+2; i += stride_x){
u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
}
}
int main(){
int i;
int j;
double *x = (double *)malloc((N+3)*sizeof(double));
double *y = (double *)malloc((N+3)*sizeof(double));
double *d_x, *d_y;
cudaMalloc(&d_x, (N+3)*sizeof(double));
cudaMalloc(&d_y, (N+3)*sizeof(double));
initializeDomain<<<1, 1>>>(d_x, d_y);
cudaDeviceSynchronize();
cudaMemcpy(x, d_x, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(y, d_y, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);
FILE *fout1 = fopen("data_x.csv", "w");
FILE *fout2 = fopen("data_y.csv", "w");
for (i=0; i<N+3; i++){
if (i==N+2){
fprintf(fout1, "%.5f", x[i]);
fprintf(fout2, "%.5f", y[i]);
}
else{
fprintf(fout1, "%.5f, ", x[i]);
fprintf(fout2, "%.5f, ", y[i]);
}
}
dim3 Block2D(1,1);
dim3 ThreadsPerBlock(1,1);
double *d_u0;
double *u0 = (double *)malloc((N+3)*(N+3)*sizeof(double));
cudaMalloc(&d_u0, (N+3)*(N+3)*sizeof(double));
initializeU<<<Block2D, ThreadsPerBlock>>>(d_x, d_y, d_u0);
cudaDeviceSynchronize();
cudaMemcpy(u0, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);
/*Initialize parameters*/
double params[3];
initializeParams(params);
/*Allocate memory for u and unp1 on device for the solver*/
double *d_u, *d_unp1;
cudaMalloc(&d_u, (N+3)*(N+3)*sizeof(double));
cudaMalloc(&d_unp1, (N+3)*(N+3)*sizeof(double));
cudaMemcpy(d_u, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
cudaMemcpy(d_unp1, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
/*Solve*/
for (int n=0; n<params[0]; n++){
solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
double *temp = d_u;
d_u = d_unp1;
d_unp1 = temp;
bc<<<1,1>>>(d_u);
cudaDeviceSynchronize();
}
/*Copy results on host*/
double *u = (double *)malloc((N+3)*(N+3)*sizeof(double));
cudaMemcpy(u, d_u, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);
FILE *fu = fopen("data_u.csv", "w");
for (i=0; i<N+3; i++){
for(j=0; j<N+3; j++)
if (j==N+2)
fprintf(fu, "%.5f", u[i*(N+3) + j]);
else
fprintf(fu, "%.5f, ", u[i*(N+3) + j]);
fprintf(fu, "\n");
}
fclose(fu);
free(x);
free(y);
free(u0);
free(u);
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_u0);
cudaFree(d_u);
cudaFree(d_unp1);
return 0;
}
Unfortunately, I keep running into the same problem: the data I get is all 0.0000.
One thing that may be confusing you is that your original algorithm has an ordering that is required for correctness:
1. update unp1 from u
2. copy unp1 to u
3. enforce the boundary conditions
4. repeat
Your algorithm requires that step 1 be finished, completely, before step 2 begins, and likewise that step 2 finish before step 3. Your CUDA realization (putting steps 1 and 2, or 1, 2 and 3, in a single kernel) does not preserve or guarantee that ordering. CUDA threads can execute in any order. If you apply that strictly to your code (for example, assume that the thread with index 0 executes completely before any other thread begins; that would be a valid CUDA execution), you will see that your kernel design does not preserve the required ordering.
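For instance, here is a sketch (hypothetical, simplified to 1D) of the hazard that appears when the update and the copy live in the same kernel: nothing stops thread i from overwriting u[i] while thread i+1 still needs the old value of u[i] for its own update.

__global__ void combinedUpdate(double *u, double *unp1, int n){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i > 0 && i < n - 1){
        unp1[i] = 0.5*(u[i-1] + u[i+1]); // step 1: reads the neighbours of u
        u[i] = unp1[i];                  // step 2: races with the reads done by other threads
    }
}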
So do something like this:
Create a solve kernel that does just step 1:
__global__ void solve(double *u, double *unp1, double params[3]){
int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = blockDim.x * gridDim.x;
int index_y = blockIdx.y * blockDim.y + threadIdx.y;
int stride_y = blockDim.y * gridDim.y;
for (int i = index_x; i < N+2; i += stride_x)
for (int j = index_y; j < N+2; j += stride_y){
unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)]) \
- params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j]) \
- params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
}
}
Do not bother with the memcpy operation. A better approach is to swap the pointers (in host code).
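If you really want an explicit copy rather than the pointer swap, it also works to do it as its own step between the two kernels, either with a host-side cudaMemcpy(d_u, d_unp1, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice) or with a separate grid-stride copy kernel, because kernel launches issued to the same stream execute in order. A minimal sketch of such a copy kernel (the name copyGrid and the launch configuration below are just placeholders):

__global__ void copyGrid(double *dst, const double *src, int n){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int k = idx; k < n; k += stride) // grid-stride loop over every element
        dst[k] = src[k];
}

// usage, instead of the pointer swap:
//   solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
//   copyGrid<<<64, 256>>>(d_u, d_unp1, (N+3)*(N+3));
//   bc<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);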
Create a separate kernel to enforce the boundary conditions:
__global__ void bc(double *u, double *unp1, double params[3]){
int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = blockDim.x * gridDim.x;
int index_y = blockIdx.y * blockDim.y + threadIdx.y;
int stride_y = blockDim.y * gridDim.y;
/*Boundary conditions are also applied in parallel*/
for (int i = index_x; i < N+2; i += stride_x){
u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
}
}
Modify your host code to call these kernels in sequence, swapping the pointers in between:
/*Solve*/
for(int n = 0; n<params[0]; n++){
solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
double *temp = d_u;
d_u = d_unp1;
d_unp1 = temp;
bc<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
cudaDeviceSynchronize();
}
(coded in browser, not tested)
This will enforce the ordering that your algorithm requires.
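Given the all-zeros output reported in the question, it can also be worth confirming that the launches in that loop actually succeed. A minimal sketch of basic error checking after the loop, using the standard runtime API calls:

cudaError_t err = cudaGetLastError();   // reports launch/configuration errors
if (err != cudaSuccess)
    printf("launch error: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize();          // reports errors that occur during kernel execution
if (err != cudaSuccess)
    printf("execution error: %s\n", cudaGetErrorString(err));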
NOTE: As pointed out in the comments below, the solve kernel as depicted above (and in the OP's original post, as well as in their posted CPU code version) has indexing errors associated at least with the i-1 and j-1 indexing patterns. These should be fixed, otherwise the code is broken. Fixing them requires a decision about how to handle the edge cases, which the OP has not provided guidance on, so I have left that code as-is.