加速 Pandas cummin/cummax
Speed up Pandas cummin/cummax
Pandas cummin
and cummax
函数对于我有很多组的用例来说似乎真的很慢。我怎样才能加快他们的速度?
更新
import pandas as pd
import numpy as np
from collections import defaultdict
def cummax(g, v):
df1 = pd.DataFrame(g, columns=['group'])
df2 = pd.DataFrame(v)
df = pd.concat([df1, df2], axis=1)
result = df.groupby('group').cummax()
result = result.values
return result
def transform(g, v):
df1 = pd.DataFrame(g, columns=['group'])
df2 = pd.DataFrame(v)
df = pd.concat([df1, df2], axis=1)
result = df.groupby('group').transform(lambda x: x.cummax())
result = result.values
return result
def itertuples(g, v):
df1 = pd.DataFrame(g, columns=['group'])
df2 = pd.DataFrame(v)
df = pd.concat([df1, df2], axis=1)
d = defaultdict(list)
result = [np.nan] * len(g)
def d_(g, v):
d[g].append(v)
if len(d[g]) > 1:
d[g][-1] = tuple(max(a,b) for (a,b) in zip(d[g][-2], d[g][-1]))
return d[g][-1]
for row in df.itertuples(index=True):
index = row[0]
result[index] = d_(row[1], row[2:])
result = np.asarray(result)
return result
def numpy(g, v):
d = defaultdict(list)
result = [np.nan] * len(g)
def d_(g, v):
d[g].append(v)
if len(d[g]) > 1:
d[g][-1] = np.maximum(d[g][-2], d[g][-1])
return d[g][-1]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
result = np.asarray(result)
return result
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH, 40)
%timeit r1 = cummax(g, v)
%timeit r2 = transform(g, v)
%timeit r3 = itertuples(g, v)
%timeit r4 = numpy(g, v)
给予
1 loop, best of 3: 22.5 s per loop
1 loop, best of 3: 18.4 s per loop
1 loop, best of 3: 1.56 s per loop
1 loop, best of 3: 325 ms per loop
您对我如何改进我的代码有任何进一步的建议吗?
旧
import pandas as pd
import numpy as np
LENGTH = 100000
df = pd.DataFrame(
np.random.randint(low=0, high=LENGTH/2, size=(LENGTH,2)),
columns=['group', 'value'])
df.groupby('group').cummax()
我们将使用 defaultdict
,其中默认值为 -np.inf
,因为我将采用最大值并且我想要一个所有值都大于的默认值。
解决方案
给定一组数组 g
和要累积最大值的值 v
def pir1(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
示范[=49=]
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH)
准确性
vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
vm.eq(pir1(g, v)).all()
True
深入探讨
与 Divakar 的回答比较
头条图表
代码
我对 Divakar 的函数进行了一些修改以使其准确。
%%cython
import numpy as np
from collections import defaultdict
# this is a cythonized version of the next function
def pir1(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
def pir2(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
def argsort_unique(idx):
# Original idea :
n = idx.size
sidx = np.empty(n,dtype=int)
sidx[idx] = np.arange(n)
return sidx
def div1(groupby, value):
sidx = np.argsort(groupby,kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]
# Get shifts to be used for shifting each group
mx = sorted_value.max() + 1
shifts = sorted_groupby * mx
# Shift and get max accumlate along value col.
# Those shifts helping out in keeping cummulative max within each group.
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
return group_cummaxed[argsort_unique(sidx)]
def div2(groupby, value):
sidx = np.argsort(groupby, kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]
# factorize groups to integers
sorted_groupby = np.append(
0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()
# Get shifts to be used for shifting each group
mx = sorted_value.max() + 1
shifts = (sorted_groupby - sorted_groupby.min()) * mx
# Shift and get max accumlate along value col.
# Those shifts helping out in keeping cummulative max within each group.
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
return group_cummaxed[argsort_unique(sidx)]
备注:
- 有必要对 Divakar 解中的组进行因式分解以对其进行概括
准确性
整数组
在基于整数的组中,div1
和 div2
产生相同的结果
np.isclose(div1(g, v), pir1(g, v)).all()
True
np.isclose(div2(g, v), pir1(g, v)).all()
True
普通组
基于字符串和浮点数的组 div1
变得不准确但很容易修复
g = g / 1000000
np.isclose(div1(g, v), pir1(g, v)).all()
False
np.isclose(div2(g, v), pir1(g, v)).all()
True
时间测试
results = pd.DataFrame(
index=pd.Index(['pir1', 'pir2', 'div1', 'div2'], name='method'),
columns=pd.Index(['Large', 'Medium', 'Small'], name='group size'))
size_map = dict(Large=100, Medium=10, Small=1)
from timeit import timeit
for i in results.index:
for j in results.columns:
results.set_value(
i, j,
timeit(
'{}(g // size_map[j], v)'.format(i),
'from __main__ import {}, size_map, g, v, j'.format(i),
number=100
)
)
results
results.T.plot.barh()
提议的方法
让我们为 table 带来一些 NumPy 魔法!好吧,我们将利用 np.maximum.accumulate
.
说明
要了解 maximum.accumulate
如何帮助我们,假设我们按顺序排列了这些组。
让我们考虑一个样本 grouby :
grouby column : [0, 0, 0, 1, 1, 2, 2, 2, 2, 2]
让我们考虑一个示例值:
value column : [3, 1, 4, 1, 3, 3, 1, 5, 2, 4]
仅在 value
上使用 maximum.accumulate
不会给我们想要的输出,因为我们只需要在每个组内进行这些累加。为此,一个技巧是将每个组与其之前的组相抵消。
可能没有什么方法可以进行抵消工作。一种简单的方法是用 value
+ 1 的最大偏移量来偏移每个组。对于示例,该偏移量将为 6
。因此,对于第二组,我们将添加 6
,第三组添加 12
,依此类推。因此,修改后的 value
将是 -
value column : [3, 1, 4, 7, 9, 15, 13, 17, 14, 16]
现在,我们可以使用maximum.accumulate
,累积将限制在每个组内-
value cummaxed: [3, 3, 4, 7, 9, 15, 15, 17, 17, 17])
要返回原始值,请减去之前添加的偏移量。
value cummaxed: [3, 3, 4, 1, 3, 3, 3, 5, 5, 5])
这是我们想要的结果!
一开始,我们假设分组是连续的。要获取该格式的数据,我们将使用 np.argsort(groupby,kind='mergesort')
获取排序索引,以便它保持相同数字的顺序,然后使用这些索引索引到 groupby
列。
为了解释负 groupby 元素,我们只需要偏移 max() - min()
而不是 max()
.
因此,实现看起来像这样 -
def argsort_unique(idx):
# Original idea :
n = idx.size
sidx = np.empty(n,dtype=int)
sidx[idx] = np.arange(n)
return sidx
def numpy_cummmax(groupby, value, factorize_groupby=0):
# Inputs : 1D arrays.
# Get sorted indices keeping the order. Sort groupby and value cols.
sidx = np.argsort(groupby,kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]
if factorize_groupby==1:
sf = np.concatenate(([0], np.flatnonzero(sorted_groupby[1:] != \
sorted_groupby[:-1])+1, [sorted_groupby.size] ))
sorted_groupby = np.repeat(np.arange(sf.size-1), sf[1:] - sf[:-1])
# Get shifts to be used for shifting each group
mx = sorted_groupby.max()-sorted_groupby.min()+1
shifts = sorted_groupby*mx
# Shift and get max accumlate along value col.
# Those shifts helping out in keeping cummulative max within each group.
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
return group_cummaxed[argsort_unique(sidx)]
运行时测试和验证
验证
1) Groupby 作为整数:
In [58]: # Setup with groupby as ints
...: LENGTH = 1000
...: g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
...: v = np.random.rand(LENGTH)
...:
In [59]: df = pd.DataFrame(np.column_stack((g,v)),columns=['group', 'value'])
In [60]: # Verify results
...: out1 = df.groupby('group').cummax()
...: out2 = numpy_cummmax(df['group'].values, df['value'].values)
...: print np.allclose(out1.values.ravel(), out2, atol=1e-5)
...:
True
2) Groupby 作为浮点数:
In [10]: # Setup with groupby as floats
...: LENGTH = 100000
...: df = pd.DataFrame(np.random.randint(0,LENGTH//2,(LENGTH,2))/10.0, \
...: columns=['group', 'value'])
In [18]: # Verify results
...: out1 = df.groupby('group').cummax()
...: out2 = numpy_cummmax(df['group'].values, df['value'].values, factorize_groupby=1)
...: print np.allclose(out1.values.ravel(), out2, atol=1e-5)
...:
True
计时 -
1) Groupby as ints(与问题中用于计时的设置相同):
In [24]: LENGTH = 100000
...: g = np.random.randint(0,LENGTH//2,(LENGTH))/10.0
...: v = np.random.rand(LENGTH)
...:
In [25]: %timeit numpy(g, v) # Best solution from posted question
1 loops, best of 3: 373 ms per loop
In [26]: %timeit pir1(g, v) # @piRSquared's solution-1
1 loops, best of 3: 165 ms per loop
In [27]: %timeit pir2(g, v) # @piRSquared's solution-2
1 loops, best of 3: 157 ms per loop
In [28]: %timeit numpy_cummmax(g, v)
100 loops, best of 3: 18.3 ms per loop
2) Groupby 作为浮点数:
In [29]: LENGTH = 100000
...: g = np.random.randint(0,LENGTH//2,(LENGTH))/10.0
...: v = np.random.rand(LENGTH)
...:
In [30]: %timeit pir1(g, v) # @piRSquared's solution-1
1 loops, best of 3: 157 ms per loop
In [31]: %timeit pir2(g, v) # @piRSquared's solution-2
1 loops, best of 3: 156 ms per loop
In [32]: %timeit numpy_cummmax(g, v, factorize_groupby=1)
10 loops, best of 3: 20.8 ms per loop
In [34]: np.allclose(pir1(g, v),numpy_cummmax(g, v, factorize_groupby=1),atol=1e-5)
Out[34]: True
Pandas cummin
and cummax
函数对于我有很多组的用例来说似乎真的很慢。我怎样才能加快他们的速度?
更新
import pandas as pd
import numpy as np
from collections import defaultdict
def cummax(g, v):
df1 = pd.DataFrame(g, columns=['group'])
df2 = pd.DataFrame(v)
df = pd.concat([df1, df2], axis=1)
result = df.groupby('group').cummax()
result = result.values
return result
def transform(g, v):
df1 = pd.DataFrame(g, columns=['group'])
df2 = pd.DataFrame(v)
df = pd.concat([df1, df2], axis=1)
result = df.groupby('group').transform(lambda x: x.cummax())
result = result.values
return result
def itertuples(g, v):
df1 = pd.DataFrame(g, columns=['group'])
df2 = pd.DataFrame(v)
df = pd.concat([df1, df2], axis=1)
d = defaultdict(list)
result = [np.nan] * len(g)
def d_(g, v):
d[g].append(v)
if len(d[g]) > 1:
d[g][-1] = tuple(max(a,b) for (a,b) in zip(d[g][-2], d[g][-1]))
return d[g][-1]
for row in df.itertuples(index=True):
index = row[0]
result[index] = d_(row[1], row[2:])
result = np.asarray(result)
return result
def numpy(g, v):
d = defaultdict(list)
result = [np.nan] * len(g)
def d_(g, v):
d[g].append(v)
if len(d[g]) > 1:
d[g][-1] = np.maximum(d[g][-2], d[g][-1])
return d[g][-1]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
result = np.asarray(result)
return result
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH, 40)
%timeit r1 = cummax(g, v)
%timeit r2 = transform(g, v)
%timeit r3 = itertuples(g, v)
%timeit r4 = numpy(g, v)
给予
1 loop, best of 3: 22.5 s per loop
1 loop, best of 3: 18.4 s per loop
1 loop, best of 3: 1.56 s per loop
1 loop, best of 3: 325 ms per loop
您对我如何改进我的代码有任何进一步的建议吗?
旧
import pandas as pd
import numpy as np
LENGTH = 100000
df = pd.DataFrame(
np.random.randint(low=0, high=LENGTH/2, size=(LENGTH,2)),
columns=['group', 'value'])
df.groupby('group').cummax()
我们将使用 defaultdict
,其中默认值为 -np.inf
,因为我将采用最大值并且我想要一个所有值都大于的默认值。
解决方案
给定一组数组 g
和要累积最大值的值 v
def pir1(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
示范[=49=]
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH)
准确性
vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
vm.eq(pir1(g, v)).all()
True
深入探讨
与 Divakar 的回答比较
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH)
vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
vm.eq(pir1(g, v)).all()
True
头条图表
代码
我对 Divakar 的函数进行了一些修改以使其准确。
%%cython
import numpy as np
from collections import defaultdict
# this is a cythonized version of the next function
def pir1(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
def pir2(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
def argsort_unique(idx):
# Original idea :
n = idx.size
sidx = np.empty(n,dtype=int)
sidx[idx] = np.arange(n)
return sidx
def div1(groupby, value):
sidx = np.argsort(groupby,kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]
# Get shifts to be used for shifting each group
mx = sorted_value.max() + 1
shifts = sorted_groupby * mx
# Shift and get max accumlate along value col.
# Those shifts helping out in keeping cummulative max within each group.
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
return group_cummaxed[argsort_unique(sidx)]
def div2(groupby, value):
sidx = np.argsort(groupby, kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]
# factorize groups to integers
sorted_groupby = np.append(
0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()
# Get shifts to be used for shifting each group
mx = sorted_value.max() + 1
shifts = (sorted_groupby - sorted_groupby.min()) * mx
# Shift and get max accumlate along value col.
# Those shifts helping out in keeping cummulative max within each group.
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
return group_cummaxed[argsort_unique(sidx)]
备注:
- 有必要对 Divakar 解中的组进行因式分解以对其进行概括
准确性
整数组
在基于整数的组中,div1
和 div2
产生相同的结果
np.isclose(div1(g, v), pir1(g, v)).all()
True
np.isclose(div2(g, v), pir1(g, v)).all()
True
普通组
基于字符串和浮点数的组 div1
变得不准确但很容易修复
g = g / 1000000
np.isclose(div1(g, v), pir1(g, v)).all()
False
np.isclose(div2(g, v), pir1(g, v)).all()
True
时间测试
results = pd.DataFrame(
index=pd.Index(['pir1', 'pir2', 'div1', 'div2'], name='method'),
columns=pd.Index(['Large', 'Medium', 'Small'], name='group size'))
size_map = dict(Large=100, Medium=10, Small=1)
from timeit import timeit
for i in results.index:
for j in results.columns:
results.set_value(
i, j,
timeit(
'{}(g // size_map[j], v)'.format(i),
'from __main__ import {}, size_map, g, v, j'.format(i),
number=100
)
)
results
results.T.plot.barh()
提议的方法
让我们为 table 带来一些 NumPy 魔法!好吧,我们将利用 np.maximum.accumulate
.
说明
要了解 maximum.accumulate
如何帮助我们,假设我们按顺序排列了这些组。
让我们考虑一个样本 grouby :
grouby column : [0, 0, 0, 1, 1, 2, 2, 2, 2, 2]
让我们考虑一个示例值:
value column : [3, 1, 4, 1, 3, 3, 1, 5, 2, 4]
仅在 value
上使用 maximum.accumulate
不会给我们想要的输出,因为我们只需要在每个组内进行这些累加。为此,一个技巧是将每个组与其之前的组相抵消。
可能没有什么方法可以进行抵消工作。一种简单的方法是用 value
+ 1 的最大偏移量来偏移每个组。对于示例,该偏移量将为 6
。因此,对于第二组,我们将添加 6
,第三组添加 12
,依此类推。因此,修改后的 value
将是 -
value column : [3, 1, 4, 7, 9, 15, 13, 17, 14, 16]
现在,我们可以使用maximum.accumulate
,累积将限制在每个组内-
value cummaxed: [3, 3, 4, 7, 9, 15, 15, 17, 17, 17])
要返回原始值,请减去之前添加的偏移量。
value cummaxed: [3, 3, 4, 1, 3, 3, 3, 5, 5, 5])
这是我们想要的结果!
一开始,我们假设分组是连续的。要获取该格式的数据,我们将使用 np.argsort(groupby,kind='mergesort')
获取排序索引,以便它保持相同数字的顺序,然后使用这些索引索引到 groupby
列。
为了解释负 groupby 元素,我们只需要偏移 max() - min()
而不是 max()
.
因此,实现看起来像这样 -
def argsort_unique(idx):
# Original idea :
n = idx.size
sidx = np.empty(n,dtype=int)
sidx[idx] = np.arange(n)
return sidx
def numpy_cummmax(groupby, value, factorize_groupby=0):
# Inputs : 1D arrays.
# Get sorted indices keeping the order. Sort groupby and value cols.
sidx = np.argsort(groupby,kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]
if factorize_groupby==1:
sf = np.concatenate(([0], np.flatnonzero(sorted_groupby[1:] != \
sorted_groupby[:-1])+1, [sorted_groupby.size] ))
sorted_groupby = np.repeat(np.arange(sf.size-1), sf[1:] - sf[:-1])
# Get shifts to be used for shifting each group
mx = sorted_groupby.max()-sorted_groupby.min()+1
shifts = sorted_groupby*mx
# Shift and get max accumlate along value col.
# Those shifts helping out in keeping cummulative max within each group.
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
return group_cummaxed[argsort_unique(sidx)]
运行时测试和验证
验证
1) Groupby 作为整数:
In [58]: # Setup with groupby as ints
...: LENGTH = 1000
...: g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
...: v = np.random.rand(LENGTH)
...:
In [59]: df = pd.DataFrame(np.column_stack((g,v)),columns=['group', 'value'])
In [60]: # Verify results
...: out1 = df.groupby('group').cummax()
...: out2 = numpy_cummmax(df['group'].values, df['value'].values)
...: print np.allclose(out1.values.ravel(), out2, atol=1e-5)
...:
True
2) Groupby 作为浮点数:
In [10]: # Setup with groupby as floats
...: LENGTH = 100000
...: df = pd.DataFrame(np.random.randint(0,LENGTH//2,(LENGTH,2))/10.0, \
...: columns=['group', 'value'])
In [18]: # Verify results
...: out1 = df.groupby('group').cummax()
...: out2 = numpy_cummmax(df['group'].values, df['value'].values, factorize_groupby=1)
...: print np.allclose(out1.values.ravel(), out2, atol=1e-5)
...:
True
计时 -
1) Groupby as ints(与问题中用于计时的设置相同):
In [24]: LENGTH = 100000
...: g = np.random.randint(0,LENGTH//2,(LENGTH))/10.0
...: v = np.random.rand(LENGTH)
...:
In [25]: %timeit numpy(g, v) # Best solution from posted question
1 loops, best of 3: 373 ms per loop
In [26]: %timeit pir1(g, v) # @piRSquared's solution-1
1 loops, best of 3: 165 ms per loop
In [27]: %timeit pir2(g, v) # @piRSquared's solution-2
1 loops, best of 3: 157 ms per loop
In [28]: %timeit numpy_cummmax(g, v)
100 loops, best of 3: 18.3 ms per loop
2) Groupby 作为浮点数:
In [29]: LENGTH = 100000
...: g = np.random.randint(0,LENGTH//2,(LENGTH))/10.0
...: v = np.random.rand(LENGTH)
...:
In [30]: %timeit pir1(g, v) # @piRSquared's solution-1
1 loops, best of 3: 157 ms per loop
In [31]: %timeit pir2(g, v) # @piRSquared's solution-2
1 loops, best of 3: 156 ms per loop
In [32]: %timeit numpy_cummmax(g, v, factorize_groupby=1)
10 loops, best of 3: 20.8 ms per loop
In [34]: np.allclose(pir1(g, v),numpy_cummmax(g, v, factorize_groupby=1),atol=1e-5)
Out[34]: True