从由不同长度列表组成的 np.array 中计算分位数

Finding quantiles from np.array of different length lists

我正在尝试有效地计算一些可变长度直方图数据的四分位间距 IQR。我有列表列表中的数据。每个内部列表都是一个单独的直方图。大多数这些直方图的长度为 100,但长度可以在 50 - 150 整数之间变化。

示例数据:

list_of_hists = [
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
    [10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28]
]

我目前正在使用简单的 for 循环计算 IQR:

# Compute the interquartile range (Q3 - Q1) of every histogram, one at a time.
list_of_iqrs = []
for hist in list_of_hists:
    # Linear interpolation is np.quantile's default, so the `interpolation`
    # keyword (deprecated since NumPy 1.22 in favour of `method`) is omitted.
    # Requesting both quartiles in a single call processes each histogram once.
    q1, q3 = np.quantile(hist, [0.25, 0.75])
    iqr = q3 - q1
    list_of_iqrs.append(iqr)

以上数据的预期结果:

list_of_iqrs = [10.0, 10.5, 11.5, 2.0]

鉴于这个直方图列表的长度约为 10**6 个元素,我希望找到一种使用数组运算来完成此计算的方法。不幸的是,当我试图把它转换成数组时,只得到一个由列表组成的对象数组:

array([
       list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42]),
       list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42]),
       list([13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43]),
       list([10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28])
])

并且分位数计算没有像我预期的那样工作。

如何将这个直方图列表转换为数组并计算 IQR?

编辑: 另一个解决方案似乎是在每个直方图末尾追加填充值,使它们的长度全部相同,然后将这个直方图列表转换为数组:

# Find the longest histogram once, up front. Evaluating max() inside the
# comprehension would rescan the whole list for every histogram, which is
# quadratic in the number of histograms. default=0 keeps an empty input valid.
max_len = max(map(len, list_of_hists), default=0)
# Pad with NaN rather than None so the result is a rectangular float array
# instead of an object array; NaN-aware reductions such as
# np.nanquantile(arr, [0.25, 0.75], axis=1) can then skip the padding.
list_of_hists_ = [hist + [np.nan] * (max_len - len(hist)) for hist in list_of_hists]
np.array(list_of_hists_, dtype=float)

但这很慢。也许我已经找到最快的方法了?

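由于数据已排序(直方图),您可以利用这一特性更高效地计算 IQR:无需再次排序,四分位数可以直接按位置从数据中读取。一种简单的做法是把数据从中间分成两半,取上半部分中位数与下半部分中位数之差。请注意,这种“两半中位数”法与 np.quantile 的线性插值并不是同一种四分位数定义,两者的结果可能略有差异(例如对某个直方图会得到 12.0 而不是 11.5);如果需要与 np.quantile 完全一致,应在已排序数据的 (n-1)×0.25 和 (n-1)×0.75 位置处做线性插值。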

import numpy as np
from time import time  
list_of_hists = [
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
    [10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
    [10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 42],
    [13, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24, 24, 24, 25, 25, 25, 25, 27, 28, 28, 29, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 35, 35, 36, 36, 42, 43, 43, 43, 43],
    [10, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 19, 25, 28],

]
# The histograms have different lengths, so they cannot form a rectangular
# array. Request an object array explicitly: implicit conversion of ragged
# nested sequences raises ValueError on NumPy >= 1.24.
list_of_hists = np.array(list_of_hists, dtype=object)

def sortedIQR(data):
    """Return the interquartile range (Q3 - Q1) of already-sorted data.

    The quartiles are read straight out of the sorted input by linear
    interpolation at the fractional positions ``(n - 1) * 0.25`` and
    ``(n - 1) * 0.75``. This is the rule ``np.quantile`` uses by default,
    so the result matches ``np.quantile(data, 0.75) - np.quantile(data, 0.25)``
    without the cost of sorting or partitioning again. (Taking the medians
    of the two halves instead is a different quartile definition and can
    give a different answer.)

    Parameters
    ----------
    data : sequence of numbers
        Values sorted in ascending order. Sortedness is not checked; an
        unsorted input silently gives a meaningless result.

    Returns
    -------
    numpy.float64
        The interquartile range, or NaN when ``data`` is empty.
    """
    n = len(data)
    if n == 0:
        return np.float64("nan")

    def _quantile(q):
        # Fractional index of the q-th quantile in the sorted data.
        pos = (n - 1) * q
        lo = int(pos)  # pos >= 0, so truncation is the floor
        hi = min(lo + 1, n - 1)  # clamp for the single-element case
        return data[lo] + (pos - lo) * (data[hi] - data[lo])

    return np.float64(_quantile(0.75) - _quantile(0.25))

def simpleIQR(hist):
    """Return the interquartile range (Q3 - Q1) of ``hist`` via ``np.quantile``.

    Both quartiles are requested in a single call so the input is processed
    once. Linear interpolation is ``np.quantile``'s default, so no method
    keyword is passed; this also avoids the ``interpolation`` keyword, which
    has been deprecated since NumPy 1.22.

    Parameters
    ----------
    hist : array_like of numbers
        The sample; it does not need to be sorted.

    Returns
    -------
    numpy.float64
        The 75th percentile minus the 25th percentile.
    """
    q1, q3 = np.quantile(hist, [0.25, 0.75])
    return q3 - q1

def _time_and_report(label, iqr_fn):
    """Apply ``iqr_fn`` to every histogram, then print the elapsed time and results."""
    began = time()
    results = [iqr_fn(hist) for hist in list_of_hists]
    elapsed = time() - began
    print(label, round(elapsed, 5))
    print(results)
    return results


# Reference implementation first, then the sorted-data variant.
answers = _time_and_report('Elapsed Time for Simple IQR: ', simpleIQR)
answers = _time_and_report('Elapsed Time for Sorted IQR: ', sortedIQR)

输出:

Elapsed Time for Simple IQR:  0.004
[10.0, 10.5, 11.5, 2.0, 10.0, 10.5, 11.5, 2.0, 10.0, 10.5, 11.5, 2.0]
Elapsed Time for Sorted IQR:  0.001
[10.0, 10.5, 12.0, 2.0, 10.0, 10.5, 12.0, 2.0, 10.0, 10.5, 12.0, 2.0]