全对角化 NumPy vs SciPy
Full diagonalization NumPy vs SciPy
我必须使用对角化例程来获取厄米特复矩阵的所有特征对。我有点受性能限制,因为我需要重复操作数千次并且我的矩阵大约为 8000x8000。我对 NumPy 和 SciPy 例程进行了一些比较,以对厄米特矩阵进行对角化,我在 6 个物理内核的机器上得到了这些时间:
我观察到对于 8000x8000 的矩阵,这个比例缩小到大约 0.8 分钟,我需要重复这个过程 50000 次。我在这里遗漏了什么,或者这些实际上是表演时间吗?总的来说,如果这需要重复几次,这一切看起来都很慢。事实上,在 30 核机器上,我观察到的性能提升很小。我在 Anaconda 发行版下使用 Python3.8,因此它与 MKL 相关联。
这是示例代码
import numpy as np
import scipy.linalg
import matplotlib.pyplot as pyt
from time import time
t_ls = []
d_ls = np.array([100, 500, 1000,2000,4000])
for N in d_ls:
A =np.random.rand( N,N ) + 1j*np.random.rand( N,N )
A = 0.5*( A + np.conj(A.T) )
ts = time()
evals, evecs = np.linalg.eigh( A )
t_np = time()-ts
ts = time()
evals2, evecs2 = scipy.linalg.eigh( A )
t_sp = time()-ts
t_ls.append(np.array([t_np, t_sp]))
t_ls = np.array(t_ls)
pyt.plot( d_ls, t_ls[:,0], marker='s' )
pyt.plot( d_ls, t_ls[:,1], marker='^')
pyt.xlabel("N")
pyt.ylabel("time(secs)")
pyt.legend(["NumPy", "SciPy"])
pyt.show()
使用 SVD 和 MP 并行化
通过 post 中的一些评论,我尝试了矩阵的 SVD 和多处理。在所有情况下,我仍然认为使用 NumPy eigh 的序列化方法是最有效的方法;这是代码:
import numpy as np
import scipy.linalg
import matplotlib.pyplot as pyt
from time import time
import psutil
def f_mp_pool(*args):
N = args[0]
A =np.random.rand( N,N ) + 1j*np.random.rand( N,N )
A = 0.5*( A + np.conj(A.T) )
evals, evecs = np.linalg.eigh(A)
return evals, evecs
nreps = 100
N = 700
ts = time()
for n in range(nreps):
A =np.random.rand( N,N ) + 1j*np.random.rand( N,N )
A = 0.5*( A + np.conj(A.T) )
res = np.linalg.eigh(A)
print("serialized:", time()-ts)
#use svd
import scipy.linalg
ts = time()
for n in range(nreps):
res = scipy.linalg.svd( A, full_matrices=True, check_finite=False )
print("SVD:", time()-ts)
import multiprocessing as mp
nproc = psutil.cpu_count(logical=False)-1
mp_pool = mp.Pool(processes=nproc)
args_ls = [ (N,) for n in range(nreps) ]
ts = time()
res = mp_pool.starmap( f_mp_pool, args_ls )
print("parallel:", time()-ts)
Pytorch 会更快,如果你有 GPU,它也可以利用它,但不是那么多,因为 QR 迭代不利于并行计算。我有一个潜在的解决方案可以在 GPU 上加速该部分,但我从未真正实施过它。
import numpy as np
import scipy.linalg
import torch
import matplotlib.pyplot as plt
from time import time
t_ls = []
d_ls = np.array([100, 500, 1000,2000,4000])
for N in d_ls:
A =np.random.rand( N,N ) + 1j*np.random.rand( N,N )
A = 0.5*( A + np.conj(A.T) )
# skipping numpy, it is slow here, you may put it back if you want
# ts = time()
# evals, evecs = np.linalg.eigh( A )
# t_np = time()-ts
ts = time()
evals2, evecs2 = scipy.linalg.eigh( A )
t_sp = time()-ts
# When using CPU torch will use intra operation
# parallelism, so if you care about latency
# this is better than using multiprocessing
A_cpu = torch.as_tensor(A)
ts = time()
evals3, evecs3 = torch.linalg.eigh(A_cpu)
t_cpu = time() - ts;
if torch.cuda.is_available():
# Using GPU will give a significant speedup for some
# operations, I guess the
A_gpu = A_cpu.to('cuda')
torch.cuda.synchronize()
ts = time()
evals4, evecs4 = torch.linalg.eigh(A_gpu)
torch.cuda.synchronize()
t_gpu = time() - ts;
else:
t_gpu = np.nan #if you don't have GPU let's skip this part
t_ls.append(np.array([np.nan, t_sp, t_cpu, t_gpu]))
print(t_ls[-1])
t_ls = np.array(t_ls)
plt.plot( d_ls, t_ls[:,0], marker='s' )
plt.plot( d_ls, t_ls[:,1], marker='^')
plt.plot( d_ls, t_ls[:,2], marker='+')
plt.plot( d_ls, t_ls[:,3], marker='d')
plt.xlabel("N")
plt.ylabel("time(secs)")
plt.legend(["NumPy", "SciPy", "PyTorch CPU", "PyTorch GPU"])
我的剧情
我必须使用对角化例程来获取厄米特复矩阵的所有特征对。我有点受性能限制,因为我需要重复操作数千次并且我的矩阵大约为 8000x8000。我对 NumPy 和 SciPy 例程进行了一些比较,以对厄米特矩阵进行对角化,我在 6 个物理内核的机器上得到了这些时间:
我观察到对于 8000x8000 的矩阵,这个比例缩小到大约 0.8 分钟,我需要重复这个过程 50000 次。我在这里遗漏了什么,或者这些实际上是表演时间吗?总的来说,如果这需要重复几次,这一切看起来都很慢。事实上,在 30 核机器上,我观察到的性能提升很小。我在 Anaconda 发行版下使用 Python3.8,因此它与 MKL 相关联。
这是示例代码
import numpy as np
import scipy.linalg
import matplotlib.pyplot as pyt
from time import time
t_ls = []
d_ls = np.array([100, 500, 1000,2000,4000])
for N in d_ls:
A =np.random.rand( N,N ) + 1j*np.random.rand( N,N )
A = 0.5*( A + np.conj(A.T) )
ts = time()
evals, evecs = np.linalg.eigh( A )
t_np = time()-ts
ts = time()
evals2, evecs2 = scipy.linalg.eigh( A )
t_sp = time()-ts
t_ls.append(np.array([t_np, t_sp]))
t_ls = np.array(t_ls)
pyt.plot( d_ls, t_ls[:,0], marker='s' )
pyt.plot( d_ls, t_ls[:,1], marker='^')
pyt.xlabel("N")
pyt.ylabel("time(secs)")
pyt.legend(["NumPy", "SciPy"])
pyt.show()
使用 SVD 和 MP 并行化
通过 post 中的一些评论,我尝试了矩阵的 SVD 和多处理。在所有情况下,我仍然认为使用 NumPy eigh 的序列化方法是最有效的方法;这是代码:
import numpy as np
import scipy.linalg
import matplotlib.pyplot as pyt
from time import time
import psutil
def f_mp_pool(*args):
N = args[0]
A =np.random.rand( N,N ) + 1j*np.random.rand( N,N )
A = 0.5*( A + np.conj(A.T) )
evals, evecs = np.linalg.eigh(A)
return evals, evecs
nreps = 100
N = 700
ts = time()
for n in range(nreps):
A =np.random.rand( N,N ) + 1j*np.random.rand( N,N )
A = 0.5*( A + np.conj(A.T) )
res = np.linalg.eigh(A)
print("serialized:", time()-ts)
#use svd
import scipy.linalg
ts = time()
for n in range(nreps):
res = scipy.linalg.svd( A, full_matrices=True, check_finite=False )
print("SVD:", time()-ts)
import multiprocessing as mp
nproc = psutil.cpu_count(logical=False)-1
mp_pool = mp.Pool(processes=nproc)
args_ls = [ (N,) for n in range(nreps) ]
ts = time()
res = mp_pool.starmap( f_mp_pool, args_ls )
print("parallel:", time()-ts)
Pytorch 会更快,如果你有 GPU,它也可以利用它,但不是那么多,因为 QR 迭代不利于并行计算。我有一个潜在的解决方案可以在 GPU 上加速该部分,但我从未真正实施过它。
import numpy as np
import scipy.linalg
import torch
import matplotlib.pyplot as plt
from time import time
t_ls = []
d_ls = np.array([100, 500, 1000,2000,4000])
for N in d_ls:
A =np.random.rand( N,N ) + 1j*np.random.rand( N,N )
A = 0.5*( A + np.conj(A.T) )
# skipping numpy, it is slow here, you may put it back if you want
# ts = time()
# evals, evecs = np.linalg.eigh( A )
# t_np = time()-ts
ts = time()
evals2, evecs2 = scipy.linalg.eigh( A )
t_sp = time()-ts
# When using CPU torch will use intra operation
# parallelism, so if you care about latency
# this is better than using multiprocessing
A_cpu = torch.as_tensor(A)
ts = time()
evals3, evecs3 = torch.linalg.eigh(A_cpu)
t_cpu = time() - ts;
if torch.cuda.is_available():
# Using GPU will give a significant speedup for some
# operations, I guess the
A_gpu = A_cpu.to('cuda')
torch.cuda.synchronize()
ts = time()
evals4, evecs4 = torch.linalg.eigh(A_gpu)
torch.cuda.synchronize()
t_gpu = time() - ts;
else:
t_gpu = np.nan #if you don't have GPU let's skip this part
t_ls.append(np.array([np.nan, t_sp, t_cpu, t_gpu]))
print(t_ls[-1])
t_ls = np.array(t_ls)
plt.plot( d_ls, t_ls[:,0], marker='s' )
plt.plot( d_ls, t_ls[:,1], marker='^')
plt.plot( d_ls, t_ls[:,2], marker='+')
plt.plot( d_ls, t_ls[:,3], marker='d')
plt.xlabel("N")
plt.ylabel("time(secs)")
plt.legend(["NumPy", "SciPy", "PyTorch CPU", "PyTorch GPU"])
我的剧情