Solving 1D heat equation on GPU in Numba
I am new to GPU programming, and I am trying to write a kernel in Numba to numerically solve the 1D heat equation. I also wrote a Numpy version of the PDE solver, and it turns out that the GPU kernel does not produce the correct results. Below I show a comparison of the state vectors computed by the two scripts:
Moreover, the kernel produces slightly different results on each run. This is probably some issue related to thread management, even though I synchronize the threads at every iteration. Any help would be appreciated.
from numba import cuda, void, float32
import numpy as np
import scipy.stats as stats
import time
import matplotlib.pyplot as plt
##################### Numba GPU Version
@cuda.jit(void(float32[::1], float32[::1]))
def solve_pde(u, parameters):
    # Space and time parameters
    dx = parameters[0]
    dt = parameters[1]
    t = parameters[2]
    t_end = parameters[3]
    u_size = u.size
    # Index of thread on GPU
    i = cuda.grid(1)
    # Condition to avoid threads accessing indices out of array
    if i < u_size:
        while t < t_end:
            if(i in [0, 1, u_size-2, u_size-1]):  # Natural boundary conditions
                u[i] = np.float32(0.)
            else:
                # Compute second order derivatives
                RHS = np.float32(0.005)*(u[i + 1] - 2*u[i] + u[i - 1])/(dx*dx)
                # Update state vector
                u[i] += RHS*dt
            # Update time
            t += dt
            # Wait until all threads finish computing
            cuda.syncthreads()
# Space and time parameters
dx = 0.01
dt = 0.01
t0 = 0
t_end = 200
parameters = np.array([dx, dt, t0, t_end], dtype="float32")
# Initial state vector
x = np.linspace(0, 6, int(6/dx), dtype="float32")
u = np.array(stats.norm.pdf(x, 3, 0.3), dtype="float32")
# Manage the number of threads
threads_per_block = 32
blocks_per_grid = (u.size + (threads_per_block - 1)) \
// threads_per_block
# Send the state vector and the parameters to the device
d_u = cuda.to_device(u)
d_parameters = cuda.to_device(parameters)
# Start timer
start = time.perf_counter()
# Start parallel simulations
solve_pde[blocks_per_grid, threads_per_block](d_u, d_parameters)
# Move the final state vector to the host
u_end = d_u.copy_to_host()
# Measure the time elapsed and print the result
end = time.perf_counter()
print(end - start)
# Plot the final state vector
plt.figure(figsize=(14, 10))
plt.plot(x, u_end, 'b-')
##################### Numpy Version
u = np.array(stats.norm.pdf(x, 3, 0.3), dtype="float32")
u_size = u.size
t = t0
while t < t_end:
    for i in range(u_size):
        if(i in [0, 1, u_size-2, u_size-1]):
            u[i] = 0
        else:
            RHS = 0.005*(u[i + 1] - 2*u[i] + u[i - 1])/(dx*dx)
            u[i] += RHS*dt
    t += dt
plt.figure(figsize=(14, 10))
plt.plot(x, u, 'r-')
The problem definitely comes from u being read and written at the same time by the GPU threads, which causes a race condition. You need to work on two different buffers to prevent this issue. Note that you can swap the buffers at the end of a time step.
Moreover, note that cuda.syncthreads does not "wait until all threads finish computing". It is a block-level synchronization barrier. AFAIK, if you want to wait until all threads have finished a given time step, the only way is to launch the CUDA kernel again (one launch per time step).
Note that launching kernels is quite expensive, so computing this on a GPU only pays off if the arrays to process are huge (e.g. certainly at least 100_000 items in your case). Besides this, note that 1.0/(dx*dx) can be precomputed to avoid slow divisions.
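As a minimal sketch of that last point (not part of the original answer: the extra inv_dx2 slot in parameters and the literal values are assumptions for illustration), the division can be hoisted to the host and the kernel only multiplies:
import numpy as np
# Illustrative values matching the scripts in this thread
dx, dt, D = np.float32(0.01), np.float32(0.01), np.float32(0.005)
# Precompute 1/(dx*dx) once on the host and ship it with the other parameters
inv_dx2 = np.float32(1.0)/(dx*dx)
parameters = np.array([dx, dt, D, inv_dx2], dtype="float32")
# Inside the kernel, the stencil would then use a multiplication instead of a division:
#     RHS = D*(u[i + 1] - 2*u[i] + u[i - 1])*parameters[3]
#     v[i] = u[i] + RHS*dt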
I post below the code I wrote following the suggestions above. I compared the results produced by the Numba and Numpy scripts against the analytical solution of the 1D heat equation with a Gaussian initial condition, and now everything works fine.
Here is the new code:
from numba import cuda, void, float32
import numpy as np
import scipy.stats as stats
import time
import matplotlib.pyplot as plt
##################### Numba GPU Version
@cuda.jit(void(float32[::1], float32[::1], float32[::1]))
def solve_pde(u, v, parameters):
    # Equation parameters
    dx = parameters[0]
    dt = parameters[1]
    D = parameters[2]
    u_size = u.size
    # Index of thread on GPU
    i = cuda.grid(1)
    # Condition to avoid threads accessing indices out of array
    if i < u_size:
        if(i in [0, 1, u_size-2, u_size-1]):  # Natural boundary conditions
            v[i] = np.float32(0.)
        else:
            # Compute second order derivatives
            RHS = D*(u[i + 1] - 2*u[i] + u[i - 1])/(dx*dx)
            # Update state vector
            v[i] = u[i] + RHS*dt
# Equation parameters
dx = 0.01
dt = 0.01
t0 = 0
t_end = 100
D = 0.005 # Diffusion coefficient
parameters = np.array([dx, dt, D], dtype="float32")
# Initial state vector
mu = 3
std = 0.3
x = np.linspace(0, 6, int(6/dx), dtype="float32")
u = np.array(stats.norm.pdf(x, mu, std), dtype="float32")
# Manage the number of threads
threads_per_block = 32
blocks_per_grid = (u.size + (threads_per_block - 1)) \
// threads_per_block
# Start timer
start = time.perf_counter()
# Initialize the state vector u and the buffer v on the GPU
d_u = cuda.to_device(u)
d_v = cuda.device_array(u.shape, dtype="float32")
d_parameters = cuda.to_device(parameters)
# Loop across time
for i in range(int((t_end-t0)/dt)):
    if (i % 2) == 0:
        solve_pde[blocks_per_grid, threads_per_block](d_u, d_v, d_parameters)
    else:
        solve_pde[blocks_per_grid, threads_per_block](d_v, d_u, d_parameters)
u = d_u.copy_to_host()
# Measure the time elapsed and print the result
end = time.perf_counter()
print(end - start)
# Plot the final state vector
plt.figure(figsize=(14, 10))
plt.plot(x, u, 'b-')
##################### Numpy Version
u = np.array(stats.norm.pdf(x, mu, std), dtype="float32")
v = np.empty_like(u)
u_size = u.size
start = time.perf_counter()
t = t0
while t < t_end:
    for i in range(u_size):
        if(i in [0, 1, u_size-2, u_size-1]):
            v[i] = 0
        else:
            RHS = D*(u[i + 1] - 2*u[i] + u[i - 1])/(dx*dx)
            v[i] = RHS*dt
    u += v
    t += dt
end = time.perf_counter()
print(end-start)
plt.figure(figsize=(14, 10))
plt.plot(x, u, 'r-')
##################### Analytical Solution
plt.figure(figsize=(14, 10))
plt.plot(x, (1/np.sqrt(np.pi*(2*std**2+4*D*t_end)))*np.exp(-((x-mu)**2)/(2*std**2+4*D*t_end)), 'g-')
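As an optional check (not in the original scripts; the names u, x, mu, std, D and t_end refer to the variables defined in the script above, with u holding the last computed Numpy result), the final state vector can also be compared with the analytical profile numerically rather than only visually:
# Optional sketch: quantify the difference between the computed state vector
# and the analytical solution evaluated on the same grid.
analytical = (1/np.sqrt(np.pi*(2*std**2 + 4*D*t_end))) \
             *np.exp(-((x - mu)**2)/(2*std**2 + 4*D*t_end))
print("max abs error:", np.max(np.abs(u - analytical)))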