如何在 python 中的字典列表中查找项目的累计总和

How to find cumulative sum of items in a list of dictionaries in python

我有一个列表,类似于

a=[{'time':3},{'time':4},{'time':5}]

我想像这样逆序求值的累加和

b=[{'exp':3,'cumsum':12},{'exp':4,'cumsum':9},{'exp':5,'cumsum':5}]

最有效的方法是什么?我已经阅读了其他答案,其中使用 numpy 给出了

之类的解决方案
a=[1,2,3]
b=numpy.cumsum(a)

但我还需要在字典中插入 cumsum

a=[{'time':3},{'time':4},{'time':5}]
b = []
cumsum = 0
for e in a[::-1]:
    cumsum += e['time']
    b.insert(0, {'exp':e['time'], 'cumsum':cumsum})
print(b)

输出:

[{'exp': 3, 'cumsum': 12}, {'exp': 4, 'cumsum': 9}, {'exp': 5, 'cumsum': 5}]


所以事实证明,在列表的开头插入是 slow (O(n))。相反,尝试 deque (O(1)):

from collections import deque


a=[{'time':3},{'time':4},{'time':5}]
b = deque()
cumsum = 0
for e in a[::-1]:
    cumsum += e['time']
    b.appendleft({'exp':e['time'], 'cumsum':cumsum})
print(b)
print(list(b))

输出:

deque([{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}])
[{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}]


这里有一个脚本来测试ITT的每一种方法的速度,以及一个带有计时结果的图表:

from collections import deque
from copy import deepcopy
import numpy as np
import pandas as pd
from random import randint
from time import time


def Nehal_pandas(l):
    df = pd.DataFrame(l)
    df['cumsum'] = df.ix[::-1, 'time'].cumsum()[::-1]
    df.columns = ['exp', 'cumsum']
    return df.to_json(orient='records')


def Merlin_pandas(l):
    df           = pd.DataFrame(l).rename(columns={'time':'exp'})
    df["cumsum"] = df['exp'][::-1].cumsum()
    return df.to_dict(orient='records')


def RahulKP_numpy(l):
    cumsum_list = np.cumsum([i['time'] for i in l][::-1])[::-1]
    for i,j in zip(l,cumsum_list):
        i.update({'cumsum':j})


def Divakar_pandas(l):
    df = pd.DataFrame(l)
    df.columns = ['exp']
    df['cumsum'] = (df[::-1].cumsum())[::-1]
    return df.T.to_dict().values()


def cb_insert_0(l):
    b = []
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        b.insert(0, {'exp':e['time'], 'cumsum':cumsum})
    return b


def cb_deque(l):
    b = deque()
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        b.appendleft({'exp':e['time'], 'cumsum':cumsum})
    b = list(b)
    return b


def cb_deque_noconvert(l):
    b = deque()
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        b.appendleft({'exp':e['time'], 'cumsum':cumsum})
    return b


def hpaulj_gen(l, var='value'):
    cum=0
    for i in l:
        j=i[var]
        cum += j
        yield {var:j, 'sum':cum}


def hpaulj_inplace(l, var='time'):
    cum = 0
    for i in l:
        cum += i[var]
        i['sum'] = cum


def test(number_of_lists, min_list_length, max_list_length):
    test_lists = []

    for _ in range(number_of_lists):
        test_list = []
        number_of_dicts = randint(min_list_length,max_list_length)
        for __ in range(number_of_dicts):
            random_value = randint(0,50)
            test_list.append({'time':random_value})
        test_lists.append(test_list)

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        res = list(hpaulj_gen(l[::-1], 'time'))[::-1]
    elapsed_time = time() - start_time
    print('hpaulj generator:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        hpaulj_inplace(l[::-1])
    elapsed_time = time() - start_time
    print('hpaulj in place:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        res = cb_insert_0(l)
    elapsed_time = time() - start_time
    print('craig insert list at 0:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        res = cb_deque(l)
    elapsed_time = time() - start_time
    print('craig deque:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        res = cb_deque_noconvert(l)
    elapsed_time = time() - start_time
    print('craig deque no convert:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        RahulKP_numpy(l) # l changed in place
    elapsed_time = time() - start_time
    print('Rahul K P numpy:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        res = Divakar_pandas(l)
    elapsed_time = time() - start_time
    print('Divakar pandas:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        res = Nehal_pandas(l)
    elapsed_time = time() - start_time
    print('Nehal pandas:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

    lists = deepcopy(test_lists)
    start_time = time()
    for l in lists:
        res = Merlin_pandas(l)
    elapsed_time = time() - start_time
    print('Merlin pandas:'.ljust(25), '%.2f' % (number_of_lists / elapsed_time), 'lists per second')

试试这个,

cumsum_list = np.cumsum([i['time'] for i in a][::-1])[::-1]
for i,j in zip(a,cumsum_list):
     i.update({'cumsum':j})

结果

[{'cumsum': 12, 'time': 3}, {'cumsum': 9, 'time': 4}, {'cumsum': 5, 'time': 5}]

效率

改成函数,

In [49]: def convert_dict(a):
....:     cumsum_list = np.cumsum([i['time'] for i in a][::-1])[::-1]
....:     for i,j in zip(a,cumsum_list):
....:              i.update({'cumsum':j})
....:     return a

然后是结果,

In [51]: convert_dict(a)
Out[51]: [{'cumsum': 12, 'time': 3}, {'cumsum': 9, 'time': 4}, {'cumsum': 5, 'time': 5}]

终于效率了,

In [52]: %timeit convert_dict(a)
The slowest run took 12.84 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 12.1 µs per loop

使用pandas:

In [4]: df = pd.DataFrame([{'time':3},{'time':4},{'time':5}])

In [5]: df
Out[5]: 
   time
0     3
1     4
2     5

In [6]: df['cumsum'] = df.ix[::-1, 'time'].cumsum()[::-1]

In [7]: df
Out[7]: 
   time  cumsum
0     3      12
1     4       9
2     5       5

In [8]: df.columns = ['exp', 'cumsum']

In [9]: df
Out[9]: 
   exp  cumsum
0    3      12
1    4       9
2    5       5

In [10]: df.to_json(orient='records')
Out[10]: '[{"exp":3,"cumsum":12},{"exp":4,"cumsum":9},{"exp":5,"cumsum":5}]'

这是另一种使用 pandas -

的方法
df = pd.DataFrame(a)
df.columns = ['exp']
df['cumsum'] = (df[::-1].cumsum())[::-1]
out = df.T.to_dict().values()

示例输入、输出 -

In [396]: a
Out[396]: [{'time': 3}, {'time': 4}, {'time': 5}]

In [397]: out
Out[397]: [{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}

试试这个:

a            = [{'time':3},{'time':4},{'time':5}]
df           = pd.DataFrame(a).rename(columns={'time':'exp'})
df["cumsum"] = df['exp'][::-1].cumsum()
df.to_dict(orient='records')

字典未排序。

 [{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}]

基于生成器的解决方案:

def foo(a, var='value'):
    cum=0
    for i in a:
        j=i[var]
        cum += j
        yield {var:j, 'sum':cum}

In [79]: a=[{'time':i} for i in range(5)]
In [80]: list(foo(a[::-1], var='time'))[::-1]
Out[80]: 
[{'sum': 10, 'time': 0},
 {'sum': 10, 'time': 1},
 {'sum': 9, 'time': 2},
 {'sum': 7, 'time': 3},
 {'sum': 4, 'time': 4}]

在快速测试中,这与 cb_insert_0

具有竞争力

就地版本的效果更好:

def foo2(a, var='time'):
    cum = 0
    for i in a:
        cum += i[var]
        i['sum'] = cum
foo2(a[::-1])