从由不同长度列表组成的 np.array 中查找分位数
Finding quantiles from np.array of different length lists
我正在尝试高效地计算一些可变长度直方图数据的四分位距(IQR)。我的数据存放在一个由列表组成的列表中,每个内部列表都是一个单独的直方图。大多数直方图的长度为 100,但长度可以是 50 到 150 之间的任意整数。
示例数据:
list_of_hists = [
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
[10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28]
]
我目前正在使用简单的 for 循环计算 IQR:
list_of_iqrs = []
for hist in list_of_hists:
    # A single call returns both quartiles. Linear interpolation is
    # np.quantile's default, so the deprecated `interpolation=` keyword
    # is not needed.
    q1, q3 = np.quantile(hist, [0.25, 0.75])
    list_of_iqrs.append(q3 - q1)
以上数据的预期结果:
list_of_iqrs = [10.0, 10.5, 11.5, 2.0]
鉴于这个直方图列表的长度约为 10**6 个元素,我希望找到一种使用数组计算来执行此操作的方法。不幸的是,当我试图将它转换成数组时,我只得到一个由列表组成的数组:
array([
list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42]),
list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42]),
list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43]),
list([10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28])
])
并且分位数计算没有像我预期的那样工作。
如何将这个直方图列表转换为数组并求出 IQR?
编辑:
另一个解决方案似乎是在每个直方图末尾追加 None,使它们长度相同,然后将该直方图列表转换为数组:
# Find the longest histogram once, instead of re-scanning the whole list for
# every element; default=0 keeps an empty input producing an empty result.
max_len = max(map(len, list_of_hists), default=0)
# Right-pad every histogram with None so that all rows have the same length.
list_of_hists_ = [hist + [None] * (max_len - len(hist)) for hist in list_of_hists]
np.array(list_of_hists_)
但这很慢。也许我已经找到最快的方法了?
由于数据已排序(直方图),您可以利用这一特性更高效地计算 IQR:只需分别求出前后两半的中位数,再取二者之差即可。注意,这种做法与 np.quantile 的线性插值并不总是一致(见下方输出中第三个直方图的 12.0 与 11.5)。
import numpy as np
from time import time
list_of_hists = [
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
[10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
[10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
[10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],
]
# The inner lists have different lengths, so request an object array
# explicitly: recent NumPy versions raise ValueError for ragged input
# unless dtype=object is given.
list_of_hists = np.array(list_of_hists, dtype=object)
def _sorted_quantile(data, q):
    """Return the q-quantile of ascending-sorted, non-empty data by linear interpolation."""
    position = q * (len(data) - 1)
    lower = int(position)  # position is non-negative, so int() floors it
    upper = min(lower + 1, len(data) - 1)
    fraction = position - lower
    return data[lower] + fraction * (data[upper] - data[lower])


def sortedIQR(data, *, linear=False):
    """Return the interquartile range of ``data``, which must be sorted ascending.

    By default Q1 and Q3 are the medians of the two halves
    ``data[:n // 2]`` and ``data[n // 2:]``. This is not the same estimator
    as ``np.quantile``'s linear interpolation and the two can disagree
    (e.g. 12.0 versus 11.5 for a 54-element sample).

    Args:
        data: Sliceable, indexable sequence of numbers in ascending order.
        linear: When True, read the 0.25 and 0.75 quantiles straight from
            the sorted data with linear interpolation between neighbouring
            elements, which is the rule ``np.quantile`` applies by default.

    Returns:
        Q3 - Q1. In the default mode the lower half is empty when ``data``
        has fewer than two elements, so the result is nan; in linear mode
        only empty input yields nan.
    """
    if linear:
        if len(data) == 0:
            return np.nan
        first_quartile = _sorted_quantile(data, 0.25)
        third_quartile = _sorted_quantile(data, 0.75)
        return float(third_quartile - first_quartile)

    pivot = len(data) // 2
    # For odd lengths the middle element falls into the upper half only.
    first_quartile = np.median(data[:pivot])
    third_quartile = np.median(data[pivot:])
    return third_quartile - first_quartile
def simpleIQR(hist):
    """Return the interquartile range of ``hist`` using ``np.quantile``.

    Args:
        hist: Array-like of numbers; it does not need to be sorted.

    Returns:
        The 0.75 quantile minus the 0.25 quantile, both computed with
        NumPy's default linear interpolation.
    """
    # One call computes both quartiles. The deprecated `interpolation=`
    # keyword is dropped because "linear" is already the default.
    q1, q3 = np.quantile(hist, [0.25, 0.75])
    return q3 - q1
# Time the np.quantile-based IQR over every histogram.
start = time()
answers = [simpleIQR(item) for item in list_of_hists]
end = time()
print('Elapsed Time for Simple IQR: ', round(end-start, 5))
print(answers)

# Time the median-of-halves IQR over the same histograms.
start = time()
answers = [sortedIQR(item) for item in list_of_hists]
end = time()
print('Elapsed Time for Sorted IQR: ', round(end-start, 5))
print(answers)
输出:
Elapsed Time for Simple IQR: 0.004
[10.0, 10.5, 11.5, 2.0, 10.0, 10.5, 11.5, 2.0, 10.0, 10.5, 11.5, 2.0]
Elapsed Time for Sorted IQR: 0.001
[10.0, 10.5, 12.0, 2.0, 10.0, 10.5, 12.0, 2.0, 10.0, 10.5, 12.0, 2.0]
我正在尝试高效地计算一些可变长度直方图数据的四分位距(IQR)。我的数据存放在一个由列表组成的列表中,每个内部列表都是一个单独的直方图。大多数直方图的长度为 100,但长度可以是 50 到 150 之间的任意整数。
示例数据:
list_of_hists = [
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
[10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28]
]
我目前正在使用简单的 for 循环计算 IQR:
list_of_iqrs = []
for hist in list_of_hists:
    # A single call returns both quartiles. Linear interpolation is
    # np.quantile's default, so the deprecated `interpolation=` keyword
    # is not needed.
    q1, q3 = np.quantile(hist, [0.25, 0.75])
    list_of_iqrs.append(q3 - q1)
以上数据的预期结果:
list_of_iqrs = [10.0, 10.5, 11.5, 2.0]
鉴于这个直方图列表的长度约为 10**6 个元素,我希望找到一种使用数组计算来执行此操作的方法。不幸的是,当我试图将它转换成数组时,我只得到一个由列表组成的数组:
array([
list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42]),
list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42]),
list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43]),
list([10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28])
])
并且分位数计算没有像我预期的那样工作。
如何将这个直方图列表转换为数组并求出 IQR?
编辑: 另一个解决方案似乎是在每个直方图末尾追加 None,使它们长度相同,然后将该直方图列表转换为数组:
# Find the longest histogram once, instead of re-scanning the whole list for
# every element; default=0 keeps an empty input producing an empty result.
max_len = max(map(len, list_of_hists), default=0)
# Right-pad every histogram with None so that all rows have the same length.
list_of_hists_ = [hist + [None] * (max_len - len(hist)) for hist in list_of_hists]
np.array(list_of_hists_)
但这很慢。也许我已经找到最快的方法了?
由于数据已排序(直方图),您可以利用这一特性更高效地计算 IQR:只需分别求出前后两半的中位数,再取二者之差即可。注意,这种做法与 np.quantile 的线性插值并不总是一致(见下方输出中第三个直方图的 12.0 与 11.5)。
import numpy as np
from time import time
list_of_hists = [
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
[10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
[10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
[13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
[10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],
]
# The inner lists have different lengths, so request an object array
# explicitly: recent NumPy versions raise ValueError for ragged input
# unless dtype=object is given.
list_of_hists = np.array(list_of_hists, dtype=object)
def _sorted_quantile(data, q):
    """Return the q-quantile of ascending-sorted, non-empty data by linear interpolation."""
    position = q * (len(data) - 1)
    lower = int(position)  # position is non-negative, so int() floors it
    upper = min(lower + 1, len(data) - 1)
    fraction = position - lower
    return data[lower] + fraction * (data[upper] - data[lower])


def sortedIQR(data, *, linear=False):
    """Return the interquartile range of ``data``, which must be sorted ascending.

    By default Q1 and Q3 are the medians of the two halves
    ``data[:n // 2]`` and ``data[n // 2:]``. This is not the same estimator
    as ``np.quantile``'s linear interpolation and the two can disagree
    (e.g. 12.0 versus 11.5 for a 54-element sample).

    Args:
        data: Sliceable, indexable sequence of numbers in ascending order.
        linear: When True, read the 0.25 and 0.75 quantiles straight from
            the sorted data with linear interpolation between neighbouring
            elements, which is the rule ``np.quantile`` applies by default.

    Returns:
        Q3 - Q1. In the default mode the lower half is empty when ``data``
        has fewer than two elements, so the result is nan; in linear mode
        only empty input yields nan.
    """
    if linear:
        if len(data) == 0:
            return np.nan
        first_quartile = _sorted_quantile(data, 0.25)
        third_quartile = _sorted_quantile(data, 0.75)
        return float(third_quartile - first_quartile)

    pivot = len(data) // 2
    # For odd lengths the middle element falls into the upper half only.
    first_quartile = np.median(data[:pivot])
    third_quartile = np.median(data[pivot:])
    return third_quartile - first_quartile
def simpleIQR(hist):
    """Return the interquartile range of ``hist`` using ``np.quantile``.

    Args:
        hist: Array-like of numbers; it does not need to be sorted.

    Returns:
        The 0.75 quantile minus the 0.25 quantile, both computed with
        NumPy's default linear interpolation.
    """
    # One call computes both quartiles. The deprecated `interpolation=`
    # keyword is dropped because "linear" is already the default.
    q1, q3 = np.quantile(hist, [0.25, 0.75])
    return q3 - q1
# Time the np.quantile-based IQR over every histogram.
start = time()
answers = [simpleIQR(item) for item in list_of_hists]
end = time()
print('Elapsed Time for Simple IQR: ', round(end-start, 5))
print(answers)

# Time the median-of-halves IQR over the same histograms.
start = time()
answers = [sortedIQR(item) for item in list_of_hists]
end = time()
print('Elapsed Time for Sorted IQR: ', round(end-start, 5))
print(answers)
输出:
Elapsed Time for Simple IQR: 0.004
[10.0, 10.5, 11.5, 2.0, 10.0, 10.5, 11.5, 2.0, 10.0, 10.5, 11.5, 2.0]
Elapsed Time for Sorted IQR: 0.001
[10.0, 10.5, 12.0, 2.0, 10.0, 10.5, 12.0, 2.0, 10.0, 10.5, 12.0, 2.0]