优化 python 中的矩阵运算,numpy
optimizing matrix operations in python, numpy
这是一个优化问题。给定矩阵 E、H、Q、F 和方法 my_func_basic 中的逻辑(参见代码块),填充矩阵 V。是否有任何可能的方法(例如通过矢量化)来加速计算?谢谢
import timeit
import numpy as np
n = 20
m = 90
# E: m x n
E = np.random.randn(m,n)
# H, Q: m x m
H = np.random.randn(m,m)
Q = np.random.randn(m,m)
# F: n x n
F = np.random.randn(n,n)
# V: m x m
V = np.zeros(shape=(m,m))
def my_func_basic():
for x in range(n):
for y in range(n):
if x == y:
V[x][y] = np.nan
continue
h = H[x][y]
e = np.array([E[x,:]+h*E[y,:]])
v1 = np.dot(np.dot(e,F),np.transpose(e))[0][0]
v2 = Q[x][x]+h**2*Q[y][y]
V[x][y] = v1/np.sqrt(v2)
print(timeit.timeit(my_func_basic,number=1000),'(sec), too slow...')
这将是用 vectorized
方法解决它的一种方法 -
import numpy as np
def vectorized_approach(V,H,E,F,Q,n):
# Create a copy of V to store output values into it
V_vectorized = V.copy()
# Calculate v1 in a vectorized fashion
E1 = (E[None,:n,:]*H[:n,:n,None] + E[:n,None,:]).reshape(-1,n)
E2 = np.dot(E1,F)
v1_vectorized = np.einsum('ij,ji->i',E2,E1.T).reshape(n,n)
np.fill_diagonal(v1_vectorized, np.nan)
# Calculate v2 in a vectorized fashion
Q_diag = np.diag(Q[:n,:n])
v2_vectorized = Q_diag[:,None] + H[:n,:n]**2*Q_diag[None,:]
# Finally, get vectorized version of output V
V_vectorized[:n,:n] = v1_vectorized/np.sqrt(v2_vectorized)
return V_vectorized
测试:
1) 设置输入 -
In [314]: n = 20
...: m = 90
...: # E: m x n
...: E = np.random.randn(m,n)
...: # H, Q: m x m
...: H = np.random.randn(m,m)
...: Q = np.random.randn(m,m)
...: # F: n x n
...: F = np.random.randn(n,n)
...: # V: m x m
...: V = np.zeros(shape=(m,m))
...:
2) 验证结果 -
In [327]: out_basic_approach = my_func_basic(V,H,E,F,Q,n)
...: out_vectorized_approach = vectorized_approach(V,H,E,F,Q,n)
...:
...: mask1 = ~np.isnan(out_basic_approach)
...: mask2 = ~np.isnan(out_vectorized_approach)
...:
In [328]: np.allclose(mask1,mask2)
Out[328]: True
In [329]: np.allclose(out_basic_approach[mask1],out_vectorized_approach[mask1])
Out[329]: True
3) 运行时测试 -
In [330]: %timeit my_func_basic(V,H,E,F,Q,n)
100 loops, best of 3: 12.2 ms per loop
In [331]: %timeit vectorized_approach(V,H,E,F,Q,n)
1000 loops, best of 3: 222 µs per loop
这是一个优化问题。给定矩阵 E、H、Q、F 和方法 my_func_basic 中的逻辑(参见代码块),填充矩阵 V。是否有任何可能的方法(例如通过矢量化)来加速计算?谢谢
import timeit
import numpy as np
n = 20
m = 90
# E: m x n
E = np.random.randn(m,n)
# H, Q: m x m
H = np.random.randn(m,m)
Q = np.random.randn(m,m)
# F: n x n
F = np.random.randn(n,n)
# V: m x m
V = np.zeros(shape=(m,m))
def my_func_basic():
for x in range(n):
for y in range(n):
if x == y:
V[x][y] = np.nan
continue
h = H[x][y]
e = np.array([E[x,:]+h*E[y,:]])
v1 = np.dot(np.dot(e,F),np.transpose(e))[0][0]
v2 = Q[x][x]+h**2*Q[y][y]
V[x][y] = v1/np.sqrt(v2)
print(timeit.timeit(my_func_basic,number=1000),'(sec), too slow...')
这将是用 vectorized
方法解决它的一种方法 -
import numpy as np
def vectorized_approach(V,H,E,F,Q,n):
# Create a copy of V to store output values into it
V_vectorized = V.copy()
# Calculate v1 in a vectorized fashion
E1 = (E[None,:n,:]*H[:n,:n,None] + E[:n,None,:]).reshape(-1,n)
E2 = np.dot(E1,F)
v1_vectorized = np.einsum('ij,ji->i',E2,E1.T).reshape(n,n)
np.fill_diagonal(v1_vectorized, np.nan)
# Calculate v2 in a vectorized fashion
Q_diag = np.diag(Q[:n,:n])
v2_vectorized = Q_diag[:,None] + H[:n,:n]**2*Q_diag[None,:]
# Finally, get vectorized version of output V
V_vectorized[:n,:n] = v1_vectorized/np.sqrt(v2_vectorized)
return V_vectorized
测试:
1) 设置输入 -
In [314]: n = 20
...: m = 90
...: # E: m x n
...: E = np.random.randn(m,n)
...: # H, Q: m x m
...: H = np.random.randn(m,m)
...: Q = np.random.randn(m,m)
...: # F: n x n
...: F = np.random.randn(n,n)
...: # V: m x m
...: V = np.zeros(shape=(m,m))
...:
2) 验证结果 -
In [327]: out_basic_approach = my_func_basic(V,H,E,F,Q,n)
...: out_vectorized_approach = vectorized_approach(V,H,E,F,Q,n)
...:
...: mask1 = ~np.isnan(out_basic_approach)
...: mask2 = ~np.isnan(out_vectorized_approach)
...:
In [328]: np.allclose(mask1,mask2)
Out[328]: True
In [329]: np.allclose(out_basic_approach[mask1],out_vectorized_approach[mask1])
Out[329]: True
3) 运行时测试 -
In [330]: %timeit my_func_basic(V,H,E,F,Q,n)
100 loops, best of 3: 12.2 ms per loop
In [331]: %timeit vectorized_approach(V,H,E,F,Q,n)
1000 loops, best of 3: 222 µs per loop