如何在 python 中的字典列表中查找项目的累计总和
How to find cumulative sum of items in a list of dictionaries in python
我有一个列表,类似于
a=[{'time':3},{'time':4},{'time':5}]
我想像这样逆序求值的累加和
b=[{'exp':3,'cumsum':12},{'exp':4,'cumsum':9},{'exp':5,'cumsum':5}]
最有效的方法是什么?我已经阅读了其他答案,其中给出了使用 numpy 之类的解决方案:
# Example quoted from other answers: numpy.cumsum applied to a plain list of
# numbers (not to a list of dicts, which is what the question needs).
a=[1,2,3]
b=numpy.cumsum(a)
但我还需要在字典中插入 cumsum
a = [{'time': 3}, {'time': 4}, {'time': 5}]
b = []
cumsum = 0
# Walk the input back-to-front so the running total accumulates from the tail.
for e in a[::-1]:
    cumsum += e['time']
    # Insert at the front so b ends up in the same order as a.
    b.insert(0, {'exp': e['time'], 'cumsum': cumsum})
print(b)
输出:
[{'exp': 3, 'cumsum': 12}, {'exp': 4, 'cumsum': 9}, {'exp': 5, 'cumsum': 5}]
事实证明,在列表开头插入很慢(O(n))。可以改用 deque(O(1)):
from collections import deque

a = [{'time': 3}, {'time': 4}, {'time': 5}]
b = deque()
cumsum = 0
# Walk the input back-to-front so the running total accumulates from the tail.
for e in a[::-1]:
    cumsum += e['time']
    # appendleft keeps b in the same order as a.
    b.appendleft({'exp': e['time'], 'cumsum': cumsum})
print(b)
print(list(b))
输出:
deque([{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}])
[{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}]
下面是一个脚本,用来测试本帖(ITT,in this thread)中每一种方法的速度,并附有计时结果图表:
from collections import deque
from copy import deepcopy
import numpy as np
import pandas as pd
from random import randint
from time import time
def Nehal_pandas(l):
    """Return the reverse cumulative sum of ``l`` as a JSON records string.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        The string from ``DataFrame.to_json(orient='records')`` with the
        columns named ``'exp'`` and ``'cumsum'``.
    """
    df = pd.DataFrame(l)
    # DataFrame.ix was removed in pandas 1.0; slicing the column directly
    # yields the same reversed Series.
    df['cumsum'] = df['time'][::-1].cumsum()[::-1]
    df.columns = ['exp', 'cumsum']
    return df.to_json(orient='records')
def Merlin_pandas(l):
    """Return the reverse cumulative sum of ``l`` as a list of record dicts.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        The result of ``DataFrame.to_dict(orient='records')``; each record has
        the keys ``'exp'`` and ``'cumsum'``.
    """
    df = pd.DataFrame(l).rename(columns={'time': 'exp'})
    # The reversed Series keeps its original index labels, so the column
    # assignment places each total back on its own row.
    df["cumsum"] = df['exp'][::-1].cumsum()
    return df.to_dict(orient='records')
def RahulKP_numpy(l):
    """Add a reverse cumulative ``'cumsum'`` entry to every dict in ``l``.

    The dicts are modified in place and nothing is returned.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.
    """
    # Accumulate over the reversed values, then flip back to input order.
    cumsum_list = np.cumsum([i['time'] for i in l][::-1])[::-1]
    for i, j in zip(l, cumsum_list):
        i.update({'cumsum': j})
def Divakar_pandas(l):
    """Return per-row ``{'exp', 'cumsum'}`` dicts built through a DataFrame.

    Args:
        l: list of dicts; presumably each holds only a ``'time'`` key, since
            the frame's columns are replaced by the single name ``'exp'``.

    Returns:
        ``.values()`` of the dict produced by ``df.T.to_dict()``, i.e. one
        dict per input row.
    """
    df = pd.DataFrame(l)
    df.columns = ['exp']
    # Reverse, accumulate, then reverse back to the input order.
    df['cumsum'] = (df[::-1].cumsum())[::-1]
    return df.T.to_dict().values()
def cb_insert_0(l):
    """Build the reverse cumulative sum list by inserting at index 0.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        A new list of ``{'exp': time, 'cumsum': total}`` dicts in the same
        order as ``l``; ``l`` itself is not modified.
    """
    b = []
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        # Front insertion is deliberate: it is the strategy being measured.
        b.insert(0, {'exp': e['time'], 'cumsum': cumsum})
    return b
def cb_deque(l):
    """Build the reverse cumulative sum with a deque, returned as a list.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        A new list of ``{'exp': time, 'cumsum': total}`` dicts in the same
        order as ``l``.
    """
    b = deque()
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        b.appendleft({'exp': e['time'], 'cumsum': cumsum})
    # Convert so callers receive a plain list rather than a deque.
    b = list(b)
    return b
def cb_deque_noconvert(l):
    """Build the reverse cumulative sum with a deque and return the deque.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        A ``collections.deque`` of ``{'exp': time, 'cumsum': total}`` dicts in
        the same order as ``l``.
    """
    b = deque()
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        b.appendleft({'exp': e['time'], 'cumsum': cumsum})
    return b
def hpaulj_gen(l, var='value'):
    """Yield a running-total dict for each item of ``l``, in iteration order.

    Args:
        l: iterable of dicts that each contain the key ``var``.
        var: key whose values are accumulated.

    Yields:
        ``{var: value, 'sum': running_total}`` for every input dict. The
        input dicts are not modified.
    """
    cum = 0
    for i in l:
        j = i[var]
        cum += j
        yield {var: j, 'sum': cum}
def hpaulj_inplace(l, var='time'):
    """Store a running total under ``'sum'`` in every dict of ``l``, in place.

    Args:
        l: iterable of dicts that each contain the key ``var``.
        var: key whose values are accumulated.

    Returns:
        None; the dicts themselves receive the new ``'sum'`` entry.
    """
    cum = 0
    for i in l:
        cum += i[var]
        i['sum'] = cum
def test(number_of_lists, min_list_length, max_list_length):
    """Time every approach on the same random inputs and print throughput.

    Args:
        number_of_lists: how many random lists to generate.
        min_list_length: smallest number of dicts per list (inclusive).
        max_list_length: largest number of dicts per list (inclusive).

    Prints one ``<label> <rate> lists per second`` line per approach and
    returns None.
    """
    # perf_counter is the clock intended for measuring short durations.
    from time import perf_counter

    test_lists = [
        [{'time': randint(0, 50)}
         for __ in range(randint(min_list_length, max_list_length))]
        for _ in range(number_of_lists)
    ]

    # (label, callable) pairs; each callable receives one list of dicts.
    contenders = [
        ('hpaulj generator:',
         lambda l: list(hpaulj_gen(l[::-1], 'time'))[::-1]),
        ('hpaulj in place:', lambda l: hpaulj_inplace(l[::-1])),
        ('craig insert list at 0:', cb_insert_0),
        ('craig deque:', cb_deque),
        ('craig deque no convert:', cb_deque_noconvert),
        ('Rahul K P numpy:', RahulKP_numpy),  # changes each list in place
        ('Divakar pandas:', Divakar_pandas),
        ('Nehal pandas:', Nehal_pandas),
        ('Merlin pandas:', Merlin_pandas),
    ]

    for label, run in contenders:
        # Fresh copy per contender, made outside the timed region, because
        # some contenders mutate their input.
        lists = deepcopy(test_lists)
        start_time = perf_counter()
        for l in lists:
            run(l)
        elapsed_time = perf_counter() - start_time
        # Avoid ZeroDivisionError if the clock did not advance.
        if elapsed_time:
            rate = number_of_lists / elapsed_time
        else:
            rate = float('inf')
        print(label.ljust(25), '%.2f' % rate, 'lists per second')
试试这个,
# Accumulate over the reversed 'time' values, then flip back to input order.
cumsum_list = np.cumsum([i['time'] for i in a][::-1])[::-1]
# Attach each total to its dict in place.
for i, j in zip(a, cumsum_list):
    i.update({'cumsum': j})
结果
[{'cumsum': 12, 'time': 3}, {'cumsum': 9, 'time': 4}, {'cumsum': 5, 'time': 5}]
效率
改成函数,
In [49]: def convert_dict(a):
   ....:     cumsum_list = np.cumsum([i['time'] for i in a][::-1])[::-1]
   ....:     for i,j in zip(a,cumsum_list):
   ....:         i.update({'cumsum':j})
   ....:     return a
然后是结果,
In [51]: convert_dict(a)
Out[51]: [{'cumsum': 12, 'time': 3}, {'cumsum': 9, 'time': 4}, {'cumsum': 5, 'time': 5}]
最后是效率:
In [52]: %timeit convert_dict(a)
The slowest run took 12.84 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 12.1 µs per loop
使用 pandas:
In [4]: df = pd.DataFrame([{'time':3},{'time':4},{'time':5}])
In [5]: df
Out[5]:
time
0 3
1 4
2 5
In [6]: df['cumsum'] = df.ix[::-1, 'time'].cumsum()[::-1]
In [7]: df
Out[7]:
time cumsum
0 3 12
1 4 9
2 5 5
In [8]: df.columns = ['exp', 'cumsum']
In [9]: df
Out[9]:
exp cumsum
0 3 12
1 4 9
2 5 5
In [10]: df.to_json(orient='records')
Out[10]: '[{"exp":3,"cumsum":12},{"exp":4,"cumsum":9},{"exp":5,"cumsum":5}]'
这是另一种使用 pandas 的方法 -
# One-column frame built from the list of dicts, with that column renamed.
df = pd.DataFrame(a)
df.columns = ['exp']
# Accumulate over the reversed rows, then flip back to the input order.
reversed_totals = df[::-1].cumsum()
df['cumsum'] = reversed_totals[::-1]
# One dict per row, keyed by column name.
out = df.T.to_dict().values()
示例输入、输出 -
In [396]: a
Out[396]: [{'time': 3}, {'time': 4}, {'time': 5}]
In [397]: out
Out[397]: [{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}]
试试这个:
a = [{'time': 3}, {'time': 4}, {'time': 5}]
# Build the frame first, then rename its only column.
df = pd.DataFrame(a)
df = df.rename(columns={'time': 'exp'})
# The reversed Series keeps its index labels, so assignment realigns by row.
df["cumsum"] = df['exp'][::-1].cumsum()
df.to_dict(orient='records')
字典未排序。
[{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}]
基于生成器的解决方案:
def foo(a, var='value'):
    """Yield a running-total dict for each item of ``a``, in iteration order.

    Args:
        a: iterable of dicts that each contain the key ``var``.
        var: key whose values are accumulated.

    Yields:
        ``{var: value, 'sum': running_total}`` for every input dict. The
        input dicts are not modified.
    """
    cum = 0
    for i in a:
        j = i[var]
        cum += j
        yield {var: j, 'sum': cum}
In [79]: a=[{'time':i} for i in range(5)]
In [80]: list(foo(a[::-1], var='time'))[::-1]
Out[80]:
[{'sum': 10, 'time': 0},
{'sum': 10, 'time': 1},
{'sum': 9, 'time': 2},
{'sum': 7, 'time': 3},
{'sum': 4, 'time': 4}]
在快速测试中,这与 cb_insert_0 的速度相当。
就地版本的效果更好:
def foo2(a, var='time'):
    """Store a running total under ``'sum'`` in every dict of ``a``, in place.

    Args:
        a: iterable of dicts that each contain the key ``var``.
        var: key whose values are accumulated.

    Returns:
        None; the dicts themselves receive the new ``'sum'`` entry.
    """
    cum = 0
    for i in a:
        cum += i[var]
        i['sum'] = cum
# a[::-1] is a reversed copy of the list that holds the same dict objects, so
# the 'sum' entries written by foo2 appear in the dicts of a itself.
foo2(a[::-1])
我有一个列表,类似于
a=[{'time':3},{'time':4},{'time':5}]
我想像这样逆序求值的累加和
b=[{'exp':3,'cumsum':12},{'exp':4,'cumsum':9},{'exp':5,'cumsum':5}]
最有效的方法是什么?我已经阅读了其他答案,其中给出了使用 numpy 之类的解决方案:
# Example quoted from other answers: numpy.cumsum applied to a plain list of
# numbers (not to a list of dicts, which is what the question needs).
a=[1,2,3]
b=numpy.cumsum(a)
但我还需要在字典中插入 cumsum
a = [{'time': 3}, {'time': 4}, {'time': 5}]
b = []
cumsum = 0
# Walk the input back-to-front so the running total accumulates from the tail.
for e in a[::-1]:
    cumsum += e['time']
    # Insert at the front so b ends up in the same order as a.
    b.insert(0, {'exp': e['time'], 'cumsum': cumsum})
print(b)
输出:
[{'exp': 3, 'cumsum': 12}, {'exp': 4, 'cumsum': 9}, {'exp': 5, 'cumsum': 5}]
事实证明,在列表开头插入很慢(O(n))。可以改用 deque(O(1)):
from collections import deque

a = [{'time': 3}, {'time': 4}, {'time': 5}]
b = deque()
cumsum = 0
# Walk the input back-to-front so the running total accumulates from the tail.
for e in a[::-1]:
    cumsum += e['time']
    # appendleft keeps b in the same order as a.
    b.appendleft({'exp': e['time'], 'cumsum': cumsum})
print(b)
print(list(b))
输出:
deque([{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}])
[{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}]
下面是一个脚本,用来测试本帖(ITT,in this thread)中每一种方法的速度,并附有计时结果图表:
from collections import deque
from copy import deepcopy
import numpy as np
import pandas as pd
from random import randint
from time import time
def Nehal_pandas(l):
    """Return the reverse cumulative sum of ``l`` as a JSON records string.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        The string from ``DataFrame.to_json(orient='records')`` with the
        columns named ``'exp'`` and ``'cumsum'``.
    """
    df = pd.DataFrame(l)
    # DataFrame.ix was removed in pandas 1.0; slicing the column directly
    # yields the same reversed Series.
    df['cumsum'] = df['time'][::-1].cumsum()[::-1]
    df.columns = ['exp', 'cumsum']
    return df.to_json(orient='records')
def Merlin_pandas(l):
    """Return the reverse cumulative sum of ``l`` as a list of record dicts.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        The result of ``DataFrame.to_dict(orient='records')``; each record has
        the keys ``'exp'`` and ``'cumsum'``.
    """
    df = pd.DataFrame(l).rename(columns={'time': 'exp'})
    # The reversed Series keeps its original index labels, so the column
    # assignment places each total back on its own row.
    df["cumsum"] = df['exp'][::-1].cumsum()
    return df.to_dict(orient='records')
def RahulKP_numpy(l):
    """Add a reverse cumulative ``'cumsum'`` entry to every dict in ``l``.

    The dicts are modified in place and nothing is returned.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.
    """
    # Accumulate over the reversed values, then flip back to input order.
    cumsum_list = np.cumsum([i['time'] for i in l][::-1])[::-1]
    for i, j in zip(l, cumsum_list):
        i.update({'cumsum': j})
def Divakar_pandas(l):
    """Return per-row ``{'exp', 'cumsum'}`` dicts built through a DataFrame.

    Args:
        l: list of dicts; presumably each holds only a ``'time'`` key, since
            the frame's columns are replaced by the single name ``'exp'``.

    Returns:
        ``.values()`` of the dict produced by ``df.T.to_dict()``, i.e. one
        dict per input row.
    """
    df = pd.DataFrame(l)
    df.columns = ['exp']
    # Reverse, accumulate, then reverse back to the input order.
    df['cumsum'] = (df[::-1].cumsum())[::-1]
    return df.T.to_dict().values()
def cb_insert_0(l):
    """Build the reverse cumulative sum list by inserting at index 0.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        A new list of ``{'exp': time, 'cumsum': total}`` dicts in the same
        order as ``l``; ``l`` itself is not modified.
    """
    b = []
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        # Front insertion is deliberate: it is the strategy being measured.
        b.insert(0, {'exp': e['time'], 'cumsum': cumsum})
    return b
def cb_deque(l):
    """Build the reverse cumulative sum with a deque, returned as a list.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        A new list of ``{'exp': time, 'cumsum': total}`` dicts in the same
        order as ``l``.
    """
    b = deque()
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        b.appendleft({'exp': e['time'], 'cumsum': cumsum})
    # Convert so callers receive a plain list rather than a deque.
    b = list(b)
    return b
def cb_deque_noconvert(l):
    """Build the reverse cumulative sum with a deque and return the deque.

    Args:
        l: list of dicts, each with a numeric ``'time'`` value.

    Returns:
        A ``collections.deque`` of ``{'exp': time, 'cumsum': total}`` dicts in
        the same order as ``l``.
    """
    b = deque()
    cumsum = 0
    for e in l[::-1]:
        cumsum += e['time']
        b.appendleft({'exp': e['time'], 'cumsum': cumsum})
    return b
def hpaulj_gen(l, var='value'):
    """Yield a running-total dict for each item of ``l``, in iteration order.

    Args:
        l: iterable of dicts that each contain the key ``var``.
        var: key whose values are accumulated.

    Yields:
        ``{var: value, 'sum': running_total}`` for every input dict. The
        input dicts are not modified.
    """
    cum = 0
    for i in l:
        j = i[var]
        cum += j
        yield {var: j, 'sum': cum}
def hpaulj_inplace(l, var='time'):
    """Store a running total under ``'sum'`` in every dict of ``l``, in place.

    Args:
        l: iterable of dicts that each contain the key ``var``.
        var: key whose values are accumulated.

    Returns:
        None; the dicts themselves receive the new ``'sum'`` entry.
    """
    cum = 0
    for i in l:
        cum += i[var]
        i['sum'] = cum
def test(number_of_lists, min_list_length, max_list_length):
    """Time every approach on the same random inputs and print throughput.

    Args:
        number_of_lists: how many random lists to generate.
        min_list_length: smallest number of dicts per list (inclusive).
        max_list_length: largest number of dicts per list (inclusive).

    Prints one ``<label> <rate> lists per second`` line per approach and
    returns None.
    """
    # perf_counter is the clock intended for measuring short durations.
    from time import perf_counter

    test_lists = [
        [{'time': randint(0, 50)}
         for __ in range(randint(min_list_length, max_list_length))]
        for _ in range(number_of_lists)
    ]

    # (label, callable) pairs; each callable receives one list of dicts.
    contenders = [
        ('hpaulj generator:',
         lambda l: list(hpaulj_gen(l[::-1], 'time'))[::-1]),
        ('hpaulj in place:', lambda l: hpaulj_inplace(l[::-1])),
        ('craig insert list at 0:', cb_insert_0),
        ('craig deque:', cb_deque),
        ('craig deque no convert:', cb_deque_noconvert),
        ('Rahul K P numpy:', RahulKP_numpy),  # changes each list in place
        ('Divakar pandas:', Divakar_pandas),
        ('Nehal pandas:', Nehal_pandas),
        ('Merlin pandas:', Merlin_pandas),
    ]

    for label, run in contenders:
        # Fresh copy per contender, made outside the timed region, because
        # some contenders mutate their input.
        lists = deepcopy(test_lists)
        start_time = perf_counter()
        for l in lists:
            run(l)
        elapsed_time = perf_counter() - start_time
        # Avoid ZeroDivisionError if the clock did not advance.
        if elapsed_time:
            rate = number_of_lists / elapsed_time
        else:
            rate = float('inf')
        print(label.ljust(25), '%.2f' % rate, 'lists per second')
试试这个,
# Accumulate over the reversed 'time' values, then flip back to input order.
cumsum_list = np.cumsum([i['time'] for i in a][::-1])[::-1]
# Attach each total to its dict in place.
for i, j in zip(a, cumsum_list):
    i.update({'cumsum': j})
结果
[{'cumsum': 12, 'time': 3}, {'cumsum': 9, 'time': 4}, {'cumsum': 5, 'time': 5}]
效率
改成函数,
In [49]: def convert_dict(a):
   ....:     cumsum_list = np.cumsum([i['time'] for i in a][::-1])[::-1]
   ....:     for i,j in zip(a,cumsum_list):
   ....:         i.update({'cumsum':j})
   ....:     return a
然后是结果,
In [51]: convert_dict(a)
Out[51]: [{'cumsum': 12, 'time': 3}, {'cumsum': 9, 'time': 4}, {'cumsum': 5, 'time': 5}]
最后是效率:
In [52]: %timeit convert_dict(a)
The slowest run took 12.84 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 12.1 µs per loop
使用 pandas:
In [4]: df = pd.DataFrame([{'time':3},{'time':4},{'time':5}])
In [5]: df
Out[5]:
time
0 3
1 4
2 5
In [6]: df['cumsum'] = df.ix[::-1, 'time'].cumsum()[::-1]
In [7]: df
Out[7]:
time cumsum
0 3 12
1 4 9
2 5 5
In [8]: df.columns = ['exp', 'cumsum']
In [9]: df
Out[9]:
exp cumsum
0 3 12
1 4 9
2 5 5
In [10]: df.to_json(orient='records')
Out[10]: '[{"exp":3,"cumsum":12},{"exp":4,"cumsum":9},{"exp":5,"cumsum":5}]'
这是另一种使用 pandas 的方法 -
# One-column frame built from the list of dicts, with that column renamed.
df = pd.DataFrame(a)
df.columns = ['exp']
# Accumulate over the reversed rows, then flip back to the input order.
reversed_totals = df[::-1].cumsum()
df['cumsum'] = reversed_totals[::-1]
# One dict per row, keyed by column name.
out = df.T.to_dict().values()
示例输入、输出 -
In [396]: a
Out[396]: [{'time': 3}, {'time': 4}, {'time': 5}]
In [397]: out
Out[397]: [{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}]
试试这个:
a = [{'time': 3}, {'time': 4}, {'time': 5}]
# Build the frame first, then rename its only column.
df = pd.DataFrame(a)
df = df.rename(columns={'time': 'exp'})
# The reversed Series keeps its index labels, so assignment realigns by row.
df["cumsum"] = df['exp'][::-1].cumsum()
df.to_dict(orient='records')
字典未排序。
[{'cumsum': 12, 'exp': 3}, {'cumsum': 9, 'exp': 4}, {'cumsum': 5, 'exp': 5}]
基于生成器的解决方案:
def foo(a, var='value'):
    """Yield a running-total dict for each item of ``a``, in iteration order.

    Args:
        a: iterable of dicts that each contain the key ``var``.
        var: key whose values are accumulated.

    Yields:
        ``{var: value, 'sum': running_total}`` for every input dict. The
        input dicts are not modified.
    """
    cum = 0
    for i in a:
        j = i[var]
        cum += j
        yield {var: j, 'sum': cum}
In [79]: a=[{'time':i} for i in range(5)]
In [80]: list(foo(a[::-1], var='time'))[::-1]
Out[80]:
[{'sum': 10, 'time': 0},
{'sum': 10, 'time': 1},
{'sum': 9, 'time': 2},
{'sum': 7, 'time': 3},
{'sum': 4, 'time': 4}]
在快速测试中,这与 cb_insert_0 的速度相当。
就地版本的效果更好:
def foo2(a, var='time'):
    """Store a running total under ``'sum'`` in every dict of ``a``, in place.

    Args:
        a: iterable of dicts that each contain the key ``var``.
        var: key whose values are accumulated.

    Returns:
        None; the dicts themselves receive the new ``'sum'`` entry.
    """
    cum = 0
    for i in a:
        cum += i[var]
        i['sum'] = cum
# a[::-1] is a reversed copy of the list that holds the same dict objects, so
# the 'sum' entries written by foo2 appear in the dicts of a itself.
foo2(a[::-1])