加速Least_squarescipy
Speed up Least_square scipy
如何加快函数 least_square
的速度?我们有六个变量(3 个方向角和 3 个轴平移)需要优化。我们应用两组3D点,平面上的两组点和一个投影矩阵作为函数的输入。
dSeed = np.zeros(6)
optRes = least_squares(minReproj, dSeed, method='lm', max_nfev=600,
args=(points_prev, points_cur, d3d_prev, d3d_cur, Proj1))
该函数将点的前后投影误差最小化。
def minReproj(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix):
Rmat = genEulerZXZ(dof[0], dof[1], dof[2]) # my function
translationArray = np.array([[dof[3]], [dof[4]], [dof[5]]])
temp = np.hstack((Rmat, translationArray))
perspectiveProj = np.vstack((temp, [0, 0, 0, 1]))
numPoints = d2dPoints1.shape[0]
errorA = np.zeros((numPoints,3))
errorB = np.zeros((numPoints,3))
forwardProj = np.matmul(w2cMatrix, perspectiveProj)
backwardProj = np.matmul(w2cMatrix, np.linalg.inv(perspectiveProj))
for i in range(numPoints):
Ja = np.ones((3))
Jb = np.ones((3))
Wa = np.ones((4))
Wb = np.ones((4))
Ja[0:2] = d2dPoints1[i,:]
Jb[0:2] = d2dPoints2[i,:]
Wa[0:3] = d3dPoints1[i,:]
Wb[0:3] = d3dPoints2[i,:]
JaPred = np.matmul(forwardProj, Wb)
JaPred /= JaPred[-1]
e1 = Ja - JaPred
JbPred = np.matmul(backwardProj, Wa)
JbPred /= JbPred[-1]
e2 = Jb - JbPred
errorA[i,:] = e1
errorB[i,:] = e2
residual = np.vstack((errorA,errorB))
return residual.flatten()
def genEulerZXZ(psi, theta, sigma):
c1 = cos(psi)
s1 = sin(psi)
c2 = cos(theta)
s2 = sin(theta)
c3 = cos(sigma)
s3 = sin(sigma)
mat = np.zeros((3,3))
mat[0,0] = (c1 * c3) - (s1 * c2 * s3)
mat[0,1] = (-c1 * s3) - (s1 * c2 * c3)
mat[0,2] = (s1 * s2)
mat[1,0] = (s1 * c3) + (c1 * c2 * s3)
mat[1,1] = (-s1 * s3) + (c1 * c2 * c3)
mat[1,2] = (-c1 * s2)
mat[2,0] = (s2 * s3)
mat[2,1] = (s2 * c3)
mat[2,2] = c2
return mat
此优化需要 0.2 到 0.4 秒,这太多了。也许你知道如何加快这个过程?
或者也许还有另一种方法可以找到两个点集的相对旋转和平移?
对于 rpoleski:
96 0.023 0.000 19.406 0.202 /usr/local/lib/python3.7/dist-packages/scipy/optimize/_lsq/least_squares.py:240(least_squares)
4548 0.132 0.000 18.986 0.004 /usr/local/lib/python3.7/dist-packages/scipy/optimize/_lsq/least_squares.py:801(fun_wrapped)
96 0.012 0.000 18.797 0.196 /usr/local/lib/python3.7/dist-packages/scipy/optimize/_lsq/least_squares.py:42(call_minpack)
4548 11.102 0.002 18.789 0.004 /home/pi/helperFunctions.py:29(minimizeReprojection)
最有可能的是,在 scipy.optimize.least_squares()
期间,大部分时间花在了计算残差上,在您的情况下,它采用 minReproj()
中的代码形式。
但是,您在 minReproj()
中提供的代码似乎有一个次优的内存管理,可以通过预分配大大改进。这将显着提高速度。
例如,vstack()
和 hstack()
由于必须将参数复制到最终结果的内存中而遭受重大损失。考虑一下 minReproj()
的前几行,我在 gen_affine_OP()
中对其进行了重新组合。这些可以重写为 gen_affine()
,时间更好(我还重写了 gen_euler_zxz()
以不分配新内存):
import numpy as np
from math import sin, cos
def gen_euler_zxz(result, psi, theta, sigma):
c1 = cos(psi)
s1 = sin(psi)
c2 = cos(theta)
s2 = sin(theta)
c3 = cos(sigma)
s3 = sin(sigma)
result[0,0] = (c1 * c3) - (s1 * c2 * s3)
result[0,1] = (-c1 * s3) - (s1 * c2 * c3)
result[0,2] = (s1 * s2)
result[1,0] = (s1 * c3) + (c1 * c2 * s3)
result[1,1] = (-s1 * s3) + (c1 * c2 * c3)
result[1,2] = (-c1 * s2)
result[2,0] = (s2 * s3)
result[2,1] = (s2 * c3)
result[2,2] = c2
return result
def gen_affine(dof):
result = np.zeros((4, 4))
gen_euler_zxz(result[:3, :3], dof[0], dof[1], dof[2])
result[:3, 3] = dof[3:]
result[3, 3] = 1
return result
def gen_affine_OP(dof):
Rmat = gen_euler_zxz(np.empty((3, 3)), dof[0], dof[1], dof[2])
translationArray = np.array([[dof[3]], [dof[4]], [dof[5]]])
temp = np.hstack((Rmat, translationArray))
return np.vstack((temp, [0, 0, 0, 1]))
dof = 1, 2, 3, 4, 5, 6
%timeit gen_affine_OP(dof)
# 100000 loops, best of 3: 16.6 µs per loop
%timeit gen_affine(dof)
# 100000 loops, best of 3: 5.02 µs per loop
同样,可以通过定义更大的残差并为 errorA
和 errorB
提供视图来避免 residual = np.vstack((errorA,errorB))
调用。
另一个例子是当创建Ja
, Jb
, Wa
, Wb
:
def func_OP(numPoints):
for i in range(numPoints):
Ja = np.ones((3))
Jb = np.ones((3))
Wa = np.ones((4))
Wb = np.ones((4))
def func(n):
Ja = np.empty(3)
Jb = np.empty(3)
Wa = np.empty(3)
Wb = np.empty(3)
for i in range(n):
Ja[:] = 1
Jb[:] = 1
Wa[:] = 1
Wb[:] = 1
%timeit func_OP(1000)
# 100 loops, best of 3: 9.31 ms per loop
%timeit func(1000)
# 100 loops, best of 3: 2.2 ms per loop
此外,.flatten()
正在制作您不需要的副本,而 .ravel()
只会 return 一个视图,但这足以满足您的需要,而且速度更快:
a = np.ones((300, 300))
%timeit a.ravel()
# 1000000 loops, best of 3: 215 ns per loop
%timeit a.flatten()
# 10000 loops, best of 3: 113 µs per loop
最后的评论是关于加快主循环的。
我没有为此设计一个简单的矢量化重写,但您可以使用 Numba 加快速度(只要它在非对象模式下编译)。
为此,您还需要对 gen_euler_zxz()
进行 JIT 修饰,并且需要将 np.matmul()
调用替换为 np.dot()
(因为 Numba 不支持 np.matmul()
.
最终,修改后的 minReproj()
看起来像:
from math import sin, cos
import numpy as np
import numba as nb
matmul = np.dot
@nb.jit
def gen_euler_zxz(result, psi, theta, sigma):
c1 = cos(psi)
s1 = sin(psi)
c2 = cos(theta)
s2 = sin(theta)
c3 = cos(sigma)
s3 = sin(sigma)
result[0, 0] = (c1 * c3) - (s1 * c2 * s3)
result[0, 1] = (-c1 * s3) - (s1 * c2 * c3)
result[0, 2] = (s1 * s2)
result[1, 0] = (s1 * c3) + (c1 * c2 * s3)
result[1, 1] = (-s1 * s3) + (c1 * c2 * c3)
result[1, 2] = (-c1 * s2)
result[2, 0] = (s2 * s3)
result[2, 1] = (s2 * c3)
result[2, 2] = c2
return result
@nb.jit
def minReproj(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix):
perspectiveProj = np.zeros((4, 4))
gen_euler_zxz(perspectiveProj[:3, :3], dof[0], dof[1], dof[2])
perspectiveProj[:3, 3] = dof[3:]
perspectiveProj[3, 3] = 1
numPoints = d2dPoints1.shape[0]
residual = np.empty((2 * numPoints, 3))
forwardProj = matmul(w2cMatrix, perspectiveProj)
backwardProj = matmul(w2cMatrix, np.linalg.inv(perspectiveProj))
Ja = np.empty(3)
Jb = np.empty(3)
Wa = np.empty(4)
Wb = np.empty(4)
for i in range(numPoints):
j = i + numPoints
Ja[2] = Jb[2] = 1.0
Wa[3] = Wb[3] = 1.0
Ja[0:2] = d2dPoints1[i, :]
Jb[0:2] = d2dPoints2[i, :]
Wa[0:3] = d3dPoints1[i, :]
Wb[0:3] = d3dPoints2[i, :]
JaPred = matmul(forwardProj, Wb)
JaPred /= JaPred[-1]
residual[i, :] = Ja - JaPred
JbPred = matmul(backwardProj, Wa)
JbPred /= JbPred[-1]
residual[j, :] = Jb - JbPred
return residual.ravel()
在一些玩具数据上测试它显示了显着的加速,当适当使用 Numba 时这会变得非常引人注目:
n = 10000
dof = 1, 2, 3, 4, 5, 6
d2dPoints1 = np.random.random((n, 2))
d2dPoints2 = np.random.random((n, 2))
d3dPoints1 = np.random.random((n, 3))
d3dPoints2 = np.random.random((n, 3))
w2cMatrix = np.random.random((3, 4))
x = minReproj_OP(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
y = minReproj(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
z = minReproj_nb(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
print(np.allclose(x, y))
# True
print(np.allclose(x, z))
# True
%timeit minReproj_OP(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
# 1 loop, best of 3: 277 ms per loop
%timeit minReproj(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
# 10 loops, best of 3: 177 ms per loop
%timeit minReproj_nb(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
# 100 loops, best of 3: 5.69 ms per loop
如何加快函数 least_square
的速度?我们有六个变量(3 个方向角和 3 个轴平移)需要优化。我们应用两组3D点,平面上的两组点和一个投影矩阵作为函数的输入。
dSeed = np.zeros(6)
optRes = least_squares(minReproj, dSeed, method='lm', max_nfev=600,
args=(points_prev, points_cur, d3d_prev, d3d_cur, Proj1))
该函数将点的前后投影误差最小化。
def minReproj(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix):
Rmat = genEulerZXZ(dof[0], dof[1], dof[2]) # my function
translationArray = np.array([[dof[3]], [dof[4]], [dof[5]]])
temp = np.hstack((Rmat, translationArray))
perspectiveProj = np.vstack((temp, [0, 0, 0, 1]))
numPoints = d2dPoints1.shape[0]
errorA = np.zeros((numPoints,3))
errorB = np.zeros((numPoints,3))
forwardProj = np.matmul(w2cMatrix, perspectiveProj)
backwardProj = np.matmul(w2cMatrix, np.linalg.inv(perspectiveProj))
for i in range(numPoints):
Ja = np.ones((3))
Jb = np.ones((3))
Wa = np.ones((4))
Wb = np.ones((4))
Ja[0:2] = d2dPoints1[i,:]
Jb[0:2] = d2dPoints2[i,:]
Wa[0:3] = d3dPoints1[i,:]
Wb[0:3] = d3dPoints2[i,:]
JaPred = np.matmul(forwardProj, Wb)
JaPred /= JaPred[-1]
e1 = Ja - JaPred
JbPred = np.matmul(backwardProj, Wa)
JbPred /= JbPred[-1]
e2 = Jb - JbPred
errorA[i,:] = e1
errorB[i,:] = e2
residual = np.vstack((errorA,errorB))
return residual.flatten()
def genEulerZXZ(psi, theta, sigma):
c1 = cos(psi)
s1 = sin(psi)
c2 = cos(theta)
s2 = sin(theta)
c3 = cos(sigma)
s3 = sin(sigma)
mat = np.zeros((3,3))
mat[0,0] = (c1 * c3) - (s1 * c2 * s3)
mat[0,1] = (-c1 * s3) - (s1 * c2 * c3)
mat[0,2] = (s1 * s2)
mat[1,0] = (s1 * c3) + (c1 * c2 * s3)
mat[1,1] = (-s1 * s3) + (c1 * c2 * c3)
mat[1,2] = (-c1 * s2)
mat[2,0] = (s2 * s3)
mat[2,1] = (s2 * c3)
mat[2,2] = c2
return mat
此优化需要 0.2 到 0.4 秒,这太多了。也许你知道如何加快这个过程? 或者也许还有另一种方法可以找到两个点集的相对旋转和平移? 对于 rpoleski:
96 0.023 0.000 19.406 0.202 /usr/local/lib/python3.7/dist-packages/scipy/optimize/_lsq/least_squares.py:240(least_squares)
4548 0.132 0.000 18.986 0.004 /usr/local/lib/python3.7/dist-packages/scipy/optimize/_lsq/least_squares.py:801(fun_wrapped)
96 0.012 0.000 18.797 0.196 /usr/local/lib/python3.7/dist-packages/scipy/optimize/_lsq/least_squares.py:42(call_minpack)
4548 11.102 0.002 18.789 0.004 /home/pi/helperFunctions.py:29(minimizeReprojection)
最有可能的是,在 scipy.optimize.least_squares()
期间,大部分时间花在了计算残差上,在您的情况下,它采用 minReproj()
中的代码形式。
但是,您在 minReproj()
中提供的代码似乎有一个次优的内存管理,可以通过预分配大大改进。这将显着提高速度。
例如,vstack()
和 hstack()
由于必须将参数复制到最终结果的内存中而遭受重大损失。考虑一下 minReproj()
的前几行,我在 gen_affine_OP()
中对其进行了重新组合。这些可以重写为 gen_affine()
,时间更好(我还重写了 gen_euler_zxz()
以不分配新内存):
import numpy as np
from math import sin, cos
def gen_euler_zxz(result, psi, theta, sigma):
c1 = cos(psi)
s1 = sin(psi)
c2 = cos(theta)
s2 = sin(theta)
c3 = cos(sigma)
s3 = sin(sigma)
result[0,0] = (c1 * c3) - (s1 * c2 * s3)
result[0,1] = (-c1 * s3) - (s1 * c2 * c3)
result[0,2] = (s1 * s2)
result[1,0] = (s1 * c3) + (c1 * c2 * s3)
result[1,1] = (-s1 * s3) + (c1 * c2 * c3)
result[1,2] = (-c1 * s2)
result[2,0] = (s2 * s3)
result[2,1] = (s2 * c3)
result[2,2] = c2
return result
def gen_affine(dof):
result = np.zeros((4, 4))
gen_euler_zxz(result[:3, :3], dof[0], dof[1], dof[2])
result[:3, 3] = dof[3:]
result[3, 3] = 1
return result
def gen_affine_OP(dof):
Rmat = gen_euler_zxz(np.empty((3, 3)), dof[0], dof[1], dof[2])
translationArray = np.array([[dof[3]], [dof[4]], [dof[5]]])
temp = np.hstack((Rmat, translationArray))
return np.vstack((temp, [0, 0, 0, 1]))
dof = 1, 2, 3, 4, 5, 6
%timeit gen_affine_OP(dof)
# 100000 loops, best of 3: 16.6 µs per loop
%timeit gen_affine(dof)
# 100000 loops, best of 3: 5.02 µs per loop
同样,可以通过定义更大的残差并为 errorA
和 errorB
提供视图来避免 residual = np.vstack((errorA,errorB))
调用。
另一个例子是当创建Ja
, Jb
, Wa
, Wb
:
def func_OP(numPoints):
for i in range(numPoints):
Ja = np.ones((3))
Jb = np.ones((3))
Wa = np.ones((4))
Wb = np.ones((4))
def func(n):
Ja = np.empty(3)
Jb = np.empty(3)
Wa = np.empty(3)
Wb = np.empty(3)
for i in range(n):
Ja[:] = 1
Jb[:] = 1
Wa[:] = 1
Wb[:] = 1
%timeit func_OP(1000)
# 100 loops, best of 3: 9.31 ms per loop
%timeit func(1000)
# 100 loops, best of 3: 2.2 ms per loop
此外,.flatten()
正在制作您不需要的副本,而 .ravel()
只会 return 一个视图,但这足以满足您的需要,而且速度更快:
a = np.ones((300, 300))
%timeit a.ravel()
# 1000000 loops, best of 3: 215 ns per loop
%timeit a.flatten()
# 10000 loops, best of 3: 113 µs per loop
最后的评论是关于加快主循环的。 我没有为此设计一个简单的矢量化重写,但您可以使用 Numba 加快速度(只要它在非对象模式下编译)。
为此,您还需要对 gen_euler_zxz()
进行 JIT 修饰,并且需要将 np.matmul()
调用替换为 np.dot()
(因为 Numba 不支持 np.matmul()
.
最终,修改后的 minReproj()
看起来像:
from math import sin, cos
import numpy as np
import numba as nb
matmul = np.dot
@nb.jit
def gen_euler_zxz(result, psi, theta, sigma):
c1 = cos(psi)
s1 = sin(psi)
c2 = cos(theta)
s2 = sin(theta)
c3 = cos(sigma)
s3 = sin(sigma)
result[0, 0] = (c1 * c3) - (s1 * c2 * s3)
result[0, 1] = (-c1 * s3) - (s1 * c2 * c3)
result[0, 2] = (s1 * s2)
result[1, 0] = (s1 * c3) + (c1 * c2 * s3)
result[1, 1] = (-s1 * s3) + (c1 * c2 * c3)
result[1, 2] = (-c1 * s2)
result[2, 0] = (s2 * s3)
result[2, 1] = (s2 * c3)
result[2, 2] = c2
return result
@nb.jit
def minReproj(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix):
perspectiveProj = np.zeros((4, 4))
gen_euler_zxz(perspectiveProj[:3, :3], dof[0], dof[1], dof[2])
perspectiveProj[:3, 3] = dof[3:]
perspectiveProj[3, 3] = 1
numPoints = d2dPoints1.shape[0]
residual = np.empty((2 * numPoints, 3))
forwardProj = matmul(w2cMatrix, perspectiveProj)
backwardProj = matmul(w2cMatrix, np.linalg.inv(perspectiveProj))
Ja = np.empty(3)
Jb = np.empty(3)
Wa = np.empty(4)
Wb = np.empty(4)
for i in range(numPoints):
j = i + numPoints
Ja[2] = Jb[2] = 1.0
Wa[3] = Wb[3] = 1.0
Ja[0:2] = d2dPoints1[i, :]
Jb[0:2] = d2dPoints2[i, :]
Wa[0:3] = d3dPoints1[i, :]
Wb[0:3] = d3dPoints2[i, :]
JaPred = matmul(forwardProj, Wb)
JaPred /= JaPred[-1]
residual[i, :] = Ja - JaPred
JbPred = matmul(backwardProj, Wa)
JbPred /= JbPred[-1]
residual[j, :] = Jb - JbPred
return residual.ravel()
在一些玩具数据上测试它显示了显着的加速,当适当使用 Numba 时这会变得非常引人注目:
n = 10000
dof = 1, 2, 3, 4, 5, 6
d2dPoints1 = np.random.random((n, 2))
d2dPoints2 = np.random.random((n, 2))
d3dPoints1 = np.random.random((n, 3))
d3dPoints2 = np.random.random((n, 3))
w2cMatrix = np.random.random((3, 4))
x = minReproj_OP(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
y = minReproj(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
z = minReproj_nb(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
print(np.allclose(x, y))
# True
print(np.allclose(x, z))
# True
%timeit minReproj_OP(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
# 1 loop, best of 3: 277 ms per loop
%timeit minReproj(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
# 10 loops, best of 3: 177 ms per loop
%timeit minReproj_nb(dof, d2dPoints1, d2dPoints2, d3dPoints1, d3dPoints2, w2cMatrix)
# 100 loops, best of 3: 5.69 ms per loop