对数组中的每个元素 n 快速执行 n 次函数
Quickly performing a function n times for each element n in an array
我有一个 n_years × n_repeats 的计数数据数组。
对于每个元素 (e),我想从损失严重性数组中抽取 e 次并取抽取的总和。
以下是目前为止我能做的最好的。它比 python 中的两个嵌套 for
循环快不了多少。在我的实际用例中,我的数组是 100,000 x 1,000。
有没有人知道如何使用纯 numpy 来完成此操作?
# Claim counts: one row per year, one column per simulation repeat.
# Each entry e is the number of severity draws to make for that cell.
frequency = np.array(
[
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 1],
[1, 2, 1],
[1, 2, 1],
[2, 4, 2],
[2, 4, 2],
[3, 5, 2],
]
)
# Pool of individual loss severities that each draw samples from.
sev = np.array([1,1,2,2,1,2,3,4,5,1,1,2])
def calculate_insured_losses(frequency, severity_array):
    """For every count in `frequency`, sample that many severities and sum them.

    Returns an array with the same shape as `frequency`; zero-count cells
    contribute a zero loss.
    """
    def yearly_loss(count, draws=severity_array):
        # No claims means no loss; otherwise total `count` random severities.
        if count == 0:
            return 0
        sampled = np.random.choice(draws, size=count, replace=True)
        return sampled.sum()

    per_cell = np.vectorize(yearly_loss)(frequency.flatten())
    return per_cell.reshape(frequency.shape)
# Fix: the array defined above is named `frequency`, not `freq` (NameError as posted).
calculate_insured_losses(frequency, sev)
291 µs ± 10.6 µs 每个循环(7 次运行的平均值 ± 标准偏差,每次 1000 次循环)
编辑:带有嵌套循环的更简单代码
def calculate_insured_losses(frequency, severity):
    """Reference implementation: visit every cell of `frequency` in turn.

    Each count e becomes the sum of e draws (with replacement) from
    `severity`; a zero count yields a zero loss.
    """
    def yearly_loss(count, severity_array=severity):
        if count == 0:
            return 0
        return np.random.choice(severity_array, size=count, replace=True).sum()

    losses = np.empty(shape=frequency.shape)
    # np.ndindex walks the array in C order — same visit order as the
    # original nested year/repeat loops.
    for cell in np.ndindex(*frequency.shape):
        losses[cell] = yearly_loss(frequency[cell])
    return losses
# Fix: the array defined above is named `frequency`, not `freq` (NameError as posted).
calculate_insured_losses(frequency, sev)
你可以这样做更快:
import numpy as np
def calculate_insured_losses(frequency, severity_array):
    """Vectorised insured-loss simulation.

    For each count e in `frequency`, draws e samples (with replacement)
    from `severity_array` and sums them — one bulk draw plus
    np.add.reduceat instead of a Python-level loop.
    """
    # Flattened frequencies table
    r = frequency.ravel()
    if r.size == 0:
        # Degenerate empty input: nothing to simulate.
        return np.zeros(frequency.shape, dtype=severity_array.dtype)
    # Running total of counts; the last entry is the total sample budget.
    rcum = np.cumsum(r)
    total = int(rcum[-1])
    # Take all random samples at once
    c = np.random.choice(severity_array, total, replace=True)
    # Sentinel 0 keeps the reduceat start indices in bounds when trailing
    # counts are zero (start index == total) and when every count is zero
    # (total == 0); the sentinel only ever lands in segments masked below.
    c = np.append(c, 0)
    # Sum each cell's segment; segment i starts where the previous counts end.
    res = np.add.reduceat(c, rcum - r)
    # reduceat yields a bogus single-element "sum" for zero-count cells;
    # mask those back to 0.
    res *= r.astype(bool)
    # Return reshaped result
    return res.reshape(frequency.shape)
# For comparison
def calculate_insured_losses_loop(frequency, severity_array):
    """Baseline implementation: per-cell sampling via np.vectorize."""
    def cell_loss(count, sev=severity_array):
        return np.random.choice(sev, size=count, replace=True).sum() if count else 0

    flat_losses = np.vectorize(cell_loss)(frequency.flatten())
    return flat_losses.reshape(frequency.shape)
# Test
# Claim counts for the test: rows are years, columns are repeats.
frequency = np.array(
[
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 1],
[1, 2, 1],
[1, 2, 1],
[2, 4, 2],
[2, 4, 2],
[3, 5, 2],
]
)
# Pool of individual loss severities that each draw samples from.
sev = np.array([1, 1, 2, 2, 1, 2, 3, 4, 5, 1, 1, 2])
# Check results from functions match
# Check results from functions match
# Identical seeds make both implementations consume the same random stream.
np.random.seed(0)
res = calculate_insured_losses(frequency, sev)
np.random.seed(0)
res_loop = calculate_insured_losses_loop(frequency, sev)
print(np.all(res == res_loop))
# True
# Benchmark
# NOTE: %timeit is an IPython magic — run these lines in IPython/Jupyter.
%timeit calculate_insured_losses(frequency, sev)
# 32.4 µs ± 220 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit calculate_insured_losses_loop(frequency, sev)
# 383 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
我有一个 n_years × n_repeats 的计数数据数组。
对于每个元素 (e),我想从损失严重性数组中抽取 e 次并取抽取的总和。
以下是目前为止我能做的最好的。它比 python 中的两个嵌套 for
循环快不了多少。在我的实际用例中,我的数组是 100,000 x 1,000。
有没有人知道如何使用纯 numpy 来完成此操作?
# Claim counts: one row per year, one column per simulation repeat.
# Each entry e is the number of severity draws to make for that cell.
frequency = np.array(
[
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 1],
[1, 2, 1],
[1, 2, 1],
[2, 4, 2],
[2, 4, 2],
[3, 5, 2],
]
)
# Pool of individual loss severities that each draw samples from.
sev = np.array([1,1,2,2,1,2,3,4,5,1,1,2])
def calculate_insured_losses(frequency, severity_array):
    """For every count in `frequency`, sample that many severities and sum them.

    Returns an array with the same shape as `frequency`; zero-count cells
    contribute a zero loss.
    """
    def yearly_loss(count, draws=severity_array):
        # No claims means no loss; otherwise total `count` random severities.
        if count == 0:
            return 0
        sampled = np.random.choice(draws, size=count, replace=True)
        return sampled.sum()

    per_cell = np.vectorize(yearly_loss)(frequency.flatten())
    return per_cell.reshape(frequency.shape)
# Fix: the array defined above is named `frequency`, not `freq` (NameError as posted).
calculate_insured_losses(frequency, sev)
291 µs ± 10.6 µs 每个循环(7 次运行的平均值 ± 标准偏差,每次 1000 次循环)
编辑:带有嵌套循环的更简单代码
def calculate_insured_losses(frequency, severity):
    """Reference implementation: visit every cell of `frequency` in turn.

    Each count e becomes the sum of e draws (with replacement) from
    `severity`; a zero count yields a zero loss.
    """
    def yearly_loss(count, severity_array=severity):
        if count == 0:
            return 0
        return np.random.choice(severity_array, size=count, replace=True).sum()

    losses = np.empty(shape=frequency.shape)
    # np.ndindex walks the array in C order — same visit order as the
    # original nested year/repeat loops.
    for cell in np.ndindex(*frequency.shape):
        losses[cell] = yearly_loss(frequency[cell])
    return losses
# Fix: the array defined above is named `frequency`, not `freq` (NameError as posted).
calculate_insured_losses(frequency, sev)
你可以这样做更快:
import numpy as np
def calculate_insured_losses(frequency, severity_array):
    """Vectorised insured-loss simulation.

    For each count e in `frequency`, draws e samples (with replacement)
    from `severity_array` and sums them — one bulk draw plus
    np.add.reduceat instead of a Python-level loop.
    """
    # Flattened frequencies table
    r = frequency.ravel()
    if r.size == 0:
        # Degenerate empty input: nothing to simulate.
        return np.zeros(frequency.shape, dtype=severity_array.dtype)
    # Running total of counts; the last entry is the total sample budget.
    rcum = np.cumsum(r)
    total = int(rcum[-1])
    # Take all random samples at once
    c = np.random.choice(severity_array, total, replace=True)
    # Sentinel 0 keeps the reduceat start indices in bounds when trailing
    # counts are zero (start index == total) and when every count is zero
    # (total == 0); the sentinel only ever lands in segments masked below.
    c = np.append(c, 0)
    # Sum each cell's segment; segment i starts where the previous counts end.
    res = np.add.reduceat(c, rcum - r)
    # reduceat yields a bogus single-element "sum" for zero-count cells;
    # mask those back to 0.
    res *= r.astype(bool)
    # Return reshaped result
    return res.reshape(frequency.shape)
# For comparison
def calculate_insured_losses_loop(frequency, severity_array):
    """Baseline implementation: per-cell sampling via np.vectorize."""
    def cell_loss(count, sev=severity_array):
        return np.random.choice(sev, size=count, replace=True).sum() if count else 0

    flat_losses = np.vectorize(cell_loss)(frequency.flatten())
    return flat_losses.reshape(frequency.shape)
# Test
# Claim counts for the test: rows are years, columns are repeats.
frequency = np.array(
[
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 1],
[1, 2, 1],
[1, 2, 1],
[2, 4, 2],
[2, 4, 2],
[3, 5, 2],
]
)
# Pool of individual loss severities that each draw samples from.
sev = np.array([1, 1, 2, 2, 1, 2, 3, 4, 5, 1, 1, 2])
# Check results from functions match
# Check results from functions match
# Identical seeds make both implementations consume the same random stream.
np.random.seed(0)
res = calculate_insured_losses(frequency, sev)
np.random.seed(0)
res_loop = calculate_insured_losses_loop(frequency, sev)
print(np.all(res == res_loop))
# True
# Benchmark
# NOTE: %timeit is an IPython magic — run these lines in IPython/Jupyter.
%timeit calculate_insured_losses(frequency, sev)
# 32.4 µs ± 220 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit calculate_insured_losses_loop(frequency, sev)
# 383 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)