求根的闭式解
Close form solution for finding a root
假设我有一个 Pandas 系列 s
,其值总和为 1
,并且其值也都大于或等于 0
。我需要从所有值中减去一个常数,以使新系列的总和等于 0.6
。问题是,当我减去这个常量时,值永远不会小于零。
在数学公式中,假设我有一系列 x
,我想找到 k
MCVE
import pandas as pd
import numpy as np
from string import ascii_uppercase
np.random.seed([3, 141592653])
s = np.power(
1000, pd.Series(
np.random.rand(10),
list(ascii_uppercase[:10])
)
).pipe(lambda s: s / s.sum())
s
A 0.001352
B 0.163135
C 0.088365
D 0.010904
E 0.007615
F 0.407947
G 0.005856
H 0.198381
I 0.027455
J 0.088989
dtype: float64
总和为1
s.sum()
0.99999999999999989
我试过的
我可以使用 Scipy 的 optimize
模块中的牛顿法(以及其他方法)
from scipy.optimize import newton
def f(k):
return s.sub(k).clip(0).sum() - .6
找到这个函数的根会得到我需要的k
initial_guess = .1
k = newton(f, x0=initial_guess)
然后从s
中减去这个
new_s = s.sub(k).clip(0)
new_s
A 0.000000
B 0.093772
C 0.019002
D 0.000000
E 0.000000
F 0.338583
G 0.000000
H 0.129017
I 0.000000
J 0.019626
dtype: float64
新的总和是
new_s.sum()
0.60000000000000009
问题
我们可以在不使用求解器的情况下找到 k
吗?
更新:三种不同的实现 - 有趣的是,最不复杂的规模最好。
import numpy as np
def f_sort(A, target=0.6):
B = np.sort(A)
C = np.cumsum(np.r_[B[0], np.diff(B)] * np.arange(N, 0, -1))
idx = np.searchsorted(C, 1 - target)
return B[idx] + (1 - target - C[idx]) / (N-idx)
def f_partition(A, target=0.6):
target, l = 1 - target, len(A)
while len(A) > 1:
m = len(A) // 2
A = np.partition(A, m-1)
ls = A[:m].sum()
if ls + A[m-1] * (l-m) > target:
A = A[:m]
else:
l -= m
target -= ls
A = A[m:]
return target / l
def f_direct(A, target=0.6):
target = 1 - target
while True:
gt = A > target / len(A)
if np.all(gt):
return target / len(A)
target -= A[~gt].sum()
A = A[gt]
N = 10
A = np.random.random(N)
A /= A.sum()
print(f_sort(A), np.clip(A-f_sort(A), 0, None).sum())
print(f_partition(A), np.clip(A-f_partition(A), 0, None).sum())
print(f_direct(A), np.clip(A-f_direct(A), 0, None).sum())
from timeit import timeit
kwds = dict(globals=globals(), number=1000)
N = 100000
A = np.random.random(N)
A /= A.sum()
print(timeit('f_sort(A)', **kwds))
print(timeit('f_partition(A)', **kwds))
print(timeit('f_direct(A)', **kwds))
样本运行:
0.04813686999999732 0.5999999999999999
0.048136869999997306 0.6000000000000001
0.048136869999997306 0.6000000000000001
8.38109541599988
2.1064437470049597
1.2743922089866828
一个精确的解决方案,只要求排序,然后在 O(n) 中(好吧,更少:我们只需要与将变为零的值的数量一样多的循环):
我们尽可能将最小值变为零,然后在其余值之间共享剩余的多余部分:
l = [0.001352, 0.163135, 0.088365, 0.010904, 0.007615, 0.407947,
0.005856, 0.198381, 0.027455, 0.088989]
initial_sum = sum(l)
target_sum = 0.6
# number of values not yet turned to zero
non_zero = len(l)
# already substracted by substracting the constant where possible
substracted = 0
# what we want to substract to each value
constant = 0
for v in sorted(l):
if initial_sum - substracted - non_zero * (v - constant) >= target_sum:
substracted += non_zero * (v - constant)
constant = v
non_zero -= 1
else:
constant += (initial_sum - substracted - target_sum) / non_zero
break
l = [v - constant if v > constant else 0 for v in l]
print(l)
print(sum(l))
# [0, 0.09377160000000001, 0.019001600000000007, 0, 0, 0.3385836, 0, 0.1290176, 0, 0.019625600000000007]
# 0.6
我没想到 newton
会成功。但在大型阵列上,它确实如此。
numba.njit
灵感来自 Thierry's
在 numba
s jit
的排序数组上使用循环
import numpy as np
from numba import njit
@njit
def find_k_numba(a, t):
a = np.sort(a)
m = len(a)
s = a.sum()
to_remove = s - t
if to_remove <= 0:
k = 0
else:
for i, x in enumerate(a):
k = to_remove / (m - i)
if k < x:
break
else:
to_remove -= x
return k
numpy
灵感来自 Paul's
Numpy 肩负重任。
import numpy as np
def find_k_numpy(a, t):
a = np.sort(a)
m = len(a)
s = a.sum()
to_remove = s - t
if to_remove <= 0:
k = 0
else:
c = a.cumsum()
n = np.arange(m)[::-1]
b = n * a + c
i = np.searchsorted(b, to_remove)
k = a[i] + (to_remove - b[i]) / (m - i)
return k
scipy.optimize.newton
我通过牛顿的方法
import numpy as np
from scipy.optimize import newton
def find_k_newton(a, t):
s = a.sum()
if s <= t:
k = 0
else:
def f(k_):
return np.clip(a - k_, 0, a.max()).sum() - t
k = newton(f, (s - t) / len(a))
return k
计时赛
import pandas as pd
from timeit import timeit
res = pd.DataFrame(
np.nan, [10, 30, 100, 300, 1000, 3000, 10000, 30000],
'find_k_newton find_k_numpy find_k_numba'.split()
)
for i in res.index:
a = np.random.rand(i)
t = a.sum() * .6
for j in res.columns:
stmt = f'{j}(a, t)'
setp = f'from __main__ import {j}, a, t'
res.at[i, j] = timeit(stmt, setp, number=200)
结果
res.plot(loglog=True)
res.div(res.min(1), 0)
find_k_newton find_k_numpy find_k_numba
10 57.265421 17.552150 1.000000
30 29.221947 9.420263 1.000000
100 16.920463 5.294890 1.000000
300 10.761341 3.037060 1.000000
1000 1.455159 1.033066 1.000000
3000 1.000000 2.076484 2.550152
10000 1.000000 3.798906 4.421955
30000 1.000000 5.551422 6.784594
只是想为@piRSquared 的回答添加一个选项:find_k_hybrd
find_k_hybrd
是“numba”和“newton”解决方案的混合体。我在 NumbaMinpack 中使用 hybrd
寻根算法。 NumbaMinpack
对于这样的问题比 scipy
快,因为它的根查找方法可以在 jit-ed 函数中。
import numpy as np
from NumbaMinpack import hybrd, minpack_sig
from numba import njit, cfunc
n = 10000
np.random.seed(0)
a = np.random.rand(n)
t = a.sum()*.6
@cfunc(minpack_sig)
def func(k_, fvec, args):
t = args[n]
amax = -np.inf
for i in range(n):
if args[i] > amax:
amax = args[i]
args1 = np.empty(n)
for i in range(n):
args1[i] = args[i] - k_[0]
if args1[i] < 0.0:
args1[i] = 0.0
elif args1[i] > amax:
args1[i] = amax
argsum = args1.sum()
fvec[0] = argsum - t
funcptr = func.address
@njit
def find_k_hybrd(a, t):
s = a.sum()
if s <= t:
k = 0.0
else:
k_init = np.array([(s - t) / len(a)])
args = np.append(a, np.array([t]))
sol = hybrd(funcptr, k_init, args)
k = sol[0][0]
return k
print(find_k_hybrd(a, t))
假设我有一个 Pandas 系列 s
,其值总和为 1
,并且其值也都大于或等于 0
。我需要从所有值中减去一个常数,以使新系列的总和等于 0.6
。问题是,当我减去这个常量时,值永远不会小于零。
在数学公式中,假设我有一系列 x
,我想找到 k
MCVE
import pandas as pd
import numpy as np
from string import ascii_uppercase
np.random.seed([3, 141592653])
s = np.power(
1000, pd.Series(
np.random.rand(10),
list(ascii_uppercase[:10])
)
).pipe(lambda s: s / s.sum())
s
A 0.001352
B 0.163135
C 0.088365
D 0.010904
E 0.007615
F 0.407947
G 0.005856
H 0.198381
I 0.027455
J 0.088989
dtype: float64
总和为1
s.sum()
0.99999999999999989
我试过的
我可以使用 Scipy 的 optimize
模块中的牛顿法(以及其他方法)
from scipy.optimize import newton
def f(k):
return s.sub(k).clip(0).sum() - .6
找到这个函数的根会得到我需要的k
initial_guess = .1
k = newton(f, x0=initial_guess)
然后从s
new_s = s.sub(k).clip(0)
new_s
A 0.000000
B 0.093772
C 0.019002
D 0.000000
E 0.000000
F 0.338583
G 0.000000
H 0.129017
I 0.000000
J 0.019626
dtype: float64
新的总和是
new_s.sum()
0.60000000000000009
问题
我们可以在不使用求解器的情况下找到 k
吗?
更新:三种不同的实现 - 有趣的是,最不复杂的规模最好。
import numpy as np
def f_sort(A, target=0.6):
B = np.sort(A)
C = np.cumsum(np.r_[B[0], np.diff(B)] * np.arange(N, 0, -1))
idx = np.searchsorted(C, 1 - target)
return B[idx] + (1 - target - C[idx]) / (N-idx)
def f_partition(A, target=0.6):
target, l = 1 - target, len(A)
while len(A) > 1:
m = len(A) // 2
A = np.partition(A, m-1)
ls = A[:m].sum()
if ls + A[m-1] * (l-m) > target:
A = A[:m]
else:
l -= m
target -= ls
A = A[m:]
return target / l
def f_direct(A, target=0.6):
target = 1 - target
while True:
gt = A > target / len(A)
if np.all(gt):
return target / len(A)
target -= A[~gt].sum()
A = A[gt]
N = 10
A = np.random.random(N)
A /= A.sum()
print(f_sort(A), np.clip(A-f_sort(A), 0, None).sum())
print(f_partition(A), np.clip(A-f_partition(A), 0, None).sum())
print(f_direct(A), np.clip(A-f_direct(A), 0, None).sum())
from timeit import timeit
kwds = dict(globals=globals(), number=1000)
N = 100000
A = np.random.random(N)
A /= A.sum()
print(timeit('f_sort(A)', **kwds))
print(timeit('f_partition(A)', **kwds))
print(timeit('f_direct(A)', **kwds))
样本运行:
0.04813686999999732 0.5999999999999999
0.048136869999997306 0.6000000000000001
0.048136869999997306 0.6000000000000001
8.38109541599988
2.1064437470049597
1.2743922089866828
一个精确的解决方案,只要求排序,然后在 O(n) 中(好吧,更少:我们只需要与将变为零的值的数量一样多的循环):
我们尽可能将最小值变为零,然后在其余值之间共享剩余的多余部分:
l = [0.001352, 0.163135, 0.088365, 0.010904, 0.007615, 0.407947,
0.005856, 0.198381, 0.027455, 0.088989]
initial_sum = sum(l)
target_sum = 0.6
# number of values not yet turned to zero
non_zero = len(l)
# already substracted by substracting the constant where possible
substracted = 0
# what we want to substract to each value
constant = 0
for v in sorted(l):
if initial_sum - substracted - non_zero * (v - constant) >= target_sum:
substracted += non_zero * (v - constant)
constant = v
non_zero -= 1
else:
constant += (initial_sum - substracted - target_sum) / non_zero
break
l = [v - constant if v > constant else 0 for v in l]
print(l)
print(sum(l))
# [0, 0.09377160000000001, 0.019001600000000007, 0, 0, 0.3385836, 0, 0.1290176, 0, 0.019625600000000007]
# 0.6
我没想到 newton
会成功。但在大型阵列上,它确实如此。
numba.njit
灵感来自 Thierry's
在 numba
s jit
import numpy as np
from numba import njit
@njit
def find_k_numba(a, t):
a = np.sort(a)
m = len(a)
s = a.sum()
to_remove = s - t
if to_remove <= 0:
k = 0
else:
for i, x in enumerate(a):
k = to_remove / (m - i)
if k < x:
break
else:
to_remove -= x
return k
numpy
灵感来自 Paul's
Numpy 肩负重任。
import numpy as np
def find_k_numpy(a, t):
a = np.sort(a)
m = len(a)
s = a.sum()
to_remove = s - t
if to_remove <= 0:
k = 0
else:
c = a.cumsum()
n = np.arange(m)[::-1]
b = n * a + c
i = np.searchsorted(b, to_remove)
k = a[i] + (to_remove - b[i]) / (m - i)
return k
scipy.optimize.newton
我通过牛顿的方法
import numpy as np
from scipy.optimize import newton
def find_k_newton(a, t):
s = a.sum()
if s <= t:
k = 0
else:
def f(k_):
return np.clip(a - k_, 0, a.max()).sum() - t
k = newton(f, (s - t) / len(a))
return k
计时赛
import pandas as pd
from timeit import timeit
res = pd.DataFrame(
np.nan, [10, 30, 100, 300, 1000, 3000, 10000, 30000],
'find_k_newton find_k_numpy find_k_numba'.split()
)
for i in res.index:
a = np.random.rand(i)
t = a.sum() * .6
for j in res.columns:
stmt = f'{j}(a, t)'
setp = f'from __main__ import {j}, a, t'
res.at[i, j] = timeit(stmt, setp, number=200)
结果
res.plot(loglog=True)
res.div(res.min(1), 0)
find_k_newton find_k_numpy find_k_numba
10 57.265421 17.552150 1.000000
30 29.221947 9.420263 1.000000
100 16.920463 5.294890 1.000000
300 10.761341 3.037060 1.000000
1000 1.455159 1.033066 1.000000
3000 1.000000 2.076484 2.550152
10000 1.000000 3.798906 4.421955
30000 1.000000 5.551422 6.784594
只是想为@piRSquared 的回答添加一个选项:find_k_hybrd
find_k_hybrd
是“numba”和“newton”解决方案的混合体。我在 NumbaMinpack 中使用 hybrd
寻根算法。 NumbaMinpack
对于这样的问题比 scipy
快,因为它的根查找方法可以在 jit-ed 函数中。
import numpy as np
from NumbaMinpack import hybrd, minpack_sig
from numba import njit, cfunc
n = 10000
np.random.seed(0)
a = np.random.rand(n)
t = a.sum()*.6
@cfunc(minpack_sig)
def func(k_, fvec, args):
t = args[n]
amax = -np.inf
for i in range(n):
if args[i] > amax:
amax = args[i]
args1 = np.empty(n)
for i in range(n):
args1[i] = args[i] - k_[0]
if args1[i] < 0.0:
args1[i] = 0.0
elif args1[i] > amax:
args1[i] = amax
argsum = args1.sum()
fvec[0] = argsum - t
funcptr = func.address
@njit
def find_k_hybrd(a, t):
s = a.sum()
if s <= t:
k = 0.0
else:
k_init = np.array([(s - t) / len(a)])
args = np.append(a, np.array([t]))
sol = hybrd(funcptr, k_init, args)
k = sol[0][0]
return k
print(find_k_hybrd(a, t))