Groupby cumulative mean with wide/long pivoting
I have a DataFrame that looks like this (code to reproduce it is at the bottom):
date id_ val
0 2017-01-08 a; b 9.3
1 2017-01-07 a; b; c 7.9
2 2017-01-07 a 7.3
3 2017-01-06 b 9.0
4 2017-01-06 c 8.1
5 2017-01-05 c 7.4
6 2017-01-05 d 7.1
7 2017-01-05 a 7.0
8 2017-01-04 b; a 7.7
9 2017-01-04 c; a 5.3
10 2017-01-04 a; c 8.0
I want to group on the individual (semicolon-separated) elements of id_ and take a cumulative mean of val, exclusive of each date. This should give NaN for the first occurrence of any id_, which I then fill with some arbitrary value (6.0 here).
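Put differently: for each id_, the value at a given row should be the mean of all earlier values, i.e. an expanding mean shifted down one row. A tiny illustration on a plain Series (values borrowed from the 'c' group):

import pandas as pd

s = pd.Series([7.9, 8.1, 7.4])
print(s.expanding().mean().shift(1))
# 0     NaN   <- first occurrence; I fill this with 6.0
# 1    7.90
# 2    8.00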
Desired output:
id_
a 0 6.0000
1 9.3000
2 8.6000
7 8.1667
8 7.8750
9 7.8400
10 7.4167
b 0 6.0000
1 9.3000
3 8.6000
8 8.7333
c 1 6.0000 # fill value
4 7.9000 # first actual occurrence of id_='c'
5 8.0000 # cumulative mean of the first two 'c'
9 7.8000
10 7.1750
d 6 6.0000
Name: val, dtype: float64
Here is my current process, which is fairly slow. Can it be improved? And secondly, can I keep the date column in the final result?
# seems like `pd.melt` might be more direct here
df.sort_values('date', inplace=True)
stacked = df.id_.str.split('; ', expand=True).stack()
stacked.index = stacked.index.droplevel(1)  # drop the split-position level
stacked = (stacked.to_frame()
           .merge(df, left_index=True, right_index=True)
           .drop('id_', axis=1)
           .rename(columns={0: 'id_'}))

def trend_scorer(s: pd.DataFrame, fillvalue=6.):
    return s['val'].expanding().mean().shift(1).fillna(fillvalue)

stacked.groupby('id_').apply(trend_scorer)
DataFrame creation:
import pandas as pd
data = \
{'id_': {0: 'a; b',
1: 'a; b; c',
2: 'a',
3: 'b',
4: 'c',
5: 'c',
6: 'd',
7: 'a',
8: 'b; a',
9: 'c; a',
10: 'a; c'},
'date': {0: '1/8/17',
1: '1/7/17',
2: '1/7/17',
3: '1/6/17',
4: '1/6/17',
5: '1/5/17',
6: '1/5/17',
7: '1/5/17',
8: '1/4/17',
9: '1/4/17',
10: '1/4/17'},
'val': {0: 9.3,
1: 7.9,
2: 7.3,
3: 9.0,
4: 8.1,
5: 7.4,
6: 7.1,
7: 7.0,
8: 7.7,
9: 5.3,
10: 8.0}}
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
groupby/apply is generally a relatively slow operation (compared with Pandas' Cythonized operations or NumPy's vectorized ones), since it requires one Python function call per group. Avoid it when you can.

In this case you can gain a modest edge by using groupby/expanding:
result = stacked.groupby('id_').expanding()['val'].mean()
result = result.groupby(level='id_').shift(1).fillna(fillvalue)
Note that the expanding mean comes back with a MultiIndex of (id_, original row label), which is why the shift groups on level='id_'.
To rejoin this result with stacked you can use DataFrame.join; the main wrinkle is that the two objects must share the same index levels before you join them (the Series also needs a name that does not collide with the existing val column, hence the rename):

result = result.rename('expanding mean')
result = stacked.set_index('id_', append=True).swaplevel().join(result)
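This join also answers your second question: since stacked still carries the date column, the joined result keeps it. A minimal sketch of flattening back to ordinary columns (the reset_index step is my addition):

final = result.reset_index()
# columns: id_, the original row label, date, val, 'expanding mean'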
On your small example DataFrame, alt is about 1.3x faster than orig:
In [500]: %timeit orig(df)
100 loops, best of 3: 12.5 ms per loop
In [501]: %timeit alt(df)
100 loops, best of 3: 9.49 ms per loop
On a larger DataFrame of 10K rows and 1000 groups, alt's speed advantage is about the same:
In [504]: %timeit orig(df)
1 loop, best of 3: 2.34 s per loop
In [505]: %timeit alt(df)
1 loop, best of 3: 1.95 s per loop
(Fixed costs, such as stacked.set_index('id_', append=True).swaplevel().join(result), swamp the relatively small benefit of using groupby/expanding instead of groupby/apply.)
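One way to sanity-check that is to time the join step on its own. A rough sketch, assuming stacked and the renamed expanding-mean Series (result, just before the final join in alt() below) are in scope:

import timeit

# time just the index alignment + join (the suspected fixed cost)
per_join = timeit.timeit(
    lambda: stacked.set_index('id_', append=True).swaplevel().join(result),
    number=10) / 10
print(per_join, 'seconds per join')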
Here is the code used to run the benchmarks above:
import pandas as pd
import numpy as np
def trend_scorer(s: pd.DataFrame, fillvalue=6.):
    return s['val'].expanding().mean().shift(1).fillna(fillvalue)
def orig(df):
stacked = df.id_.str.split('; ', expand=True).stack()
stacked.index = stacked.index.droplevel(1)
stacked = (stacked.to_frame()
.merge(df, left_index=True, right_index=True)
.drop('id_', axis=1)
.rename(columns={0: 'id_'}))
result = stacked.groupby('id_').apply(trend_scorer)
result = result.rename('expanding mean')
result = stacked.set_index('id_', append=True).swaplevel().join(result)
return result
def alt(df, fillvalue=6.0):
stacked = df['id_'].str.split('; ', expand=True).stack()
stacked.index = stacked.index.droplevel(1)
stacked = (df.drop('id_', axis=1)
.join(stacked.rename('id_')))
result = stacked.groupby('id_').expanding()['val'].mean()
result = result.groupby(level='id_').shift(1).fillna(fillvalue)
result = result.rename('expanding mean')
result = stacked.set_index('id_', append=True).swaplevel().join(result)
return result
data = {'id_': {0: 'a; b', 1: 'a; b; c', 2: 'a', 3: 'b', 4: 'c', 5: 'c', 6: 'd', 7: 'a', 8: 'b; a', 9: 'c; a', 10: 'a; c'}, 'date': {0: '1/8/17', 1: '1/7/17', 2: '1/7/17', 3: '1/6/17', 4: '1/6/17', 5: '1/5/17', 6: '1/5/17', 7: '1/5/17', 8: '1/4/17', 9: '1/4/17', 10: '1/4/17'}, 'val': {0: 9.3, 1: 7.9, 2: 7.3, 3: 9.0, 4: 8.1, 5: 7.4, 6: 7.1, 7: 7.0, 8: 7.7, 9: 5.3, 10: 8.0}}
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
df = df.sort_values('date')
assert alt(df).equals(orig(df))
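As an aside on the reshaping itself: on pandas 0.25+, DataFrame.explode can replace the split/stack/droplevel dance. A minimal sketch (my variation; not part of the benchmark):

# one row per semicolon-separated id, repeating date/val and the original index
stacked = (df.assign(id_=df['id_'].str.split('; '))
             .explode('id_'))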
Here is how I created the larger test DataFrame for the benchmarks above:
import numpy as np
def make_df(N=10000, seed=2018):
    np.random.seed(seed)
    data = []
    for date in pd.date_range('2017-1-1', periods=N):
        # 1-9 rows per date; each row gets 1-9 ids drawn (with replacement) from 0-999
        for i in range(np.random.randint(1, 10)):
            ids = '; '.join(np.random.choice(1000, size=np.random.randint(1, 10)).astype(str))
            data.append((date, ids))
    df = pd.DataFrame(data, columns=['date', 'id_'])
    df['val'] = np.random.uniform(1, 10, size=len(df))
    return df
df = make_df()