结合 Pandas GroupBy 并使用多索引数据帧进行转换
Combining Pandas GroupBy and transform with multiindex dataframes
我是 Python Pandas 的新手,我在将 Pandas GroupBy
与 transform
结合使用时遇到问题.我一直无法找到已经发布的答案,但我可能遗漏了一些东西。
我有一个包含大量条目的 DataFrame,结构如下:
GLT_City = pd.read_csv('GlobalLandTemperaturesByCity.csv', sep=',')
GLT_City.head()
AvgTemp AvgTempUncert City Country Lat Long year month day
0 6.068 1.737 Århus Denmark 57.05N 10.33E 1743 11 01
5 5.788 3.624 Århus Denmark 57.05N 10.33E 1744 04 01
6 10.644 1.283 Århus Denmark 57.05N 10.33E 1744 05 01
7 14.051 1.347 Århus Denmark 57.05N 10.33E 1744 06 01
8 16.082 1.396 Århus Denmark 57.05N 10.33E 1744 07 01
10 12.781 1.454 Århus Denmark 57.05N 10.33E 1744 09 01
11 7.950 1.630 Århus Denmark 57.05N 10.33E 1744 10 01
12 4.639 1.302 Århus Denmark 57.05N 10.33E 1744 11 01
我想计算每个城市每个月的加权平均温度,并使用 transform()
以最平滑的方式将其作为新列添加到我的原始数据框中,原因如下这条线。
首先,我定义了一个函数来计算加权平均值:
def wavg(group,data_name,weight_name, sigma=None):
data = group[data_name]
weight = group[weight_name]
#Check whether we have actual weights or measurement uncertainties
if sigma=='sigma':
weight = 1./weight
try:
return (data * weight).sum() / weight.sum()
except ZeroDivisionError:
return data.mean()
然后我想结合 GroupBy
和 transform()
将此函数应用于我的数据框并将结果添加为新列,例如:
GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')
现在这会导致复制粘贴下面的非常冗长的错误消息
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-61-cef679f52b5f> in <module>()
----> 1 GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg,
'AvgTemp','AvgTemp', sigma='sigma')
~/anaconda/envs/python36/lib/python3.6/site-
packages/pandas/core/groupby.py in transform(self, func, *args, **kwargs)
3814 result = getattr(self, func)(*args, **kwargs)
3815 else:
-> 3816 return self._transform_general(func, *args, **kwargs)
3817
3818 # a reduction transform
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _transform_general(self, func, *args, **kwargs)
3765 # Try slow path and fast path.
3766 try:
-> 3767 path, res = self._choose_path(fast_path, slow_path, group)
3768 except TypeError:
3769 return self._transform_item_by_item(obj, fast_path)
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _choose_path(self, fast_path, slow_path, group)
3861 def _choose_path(self, fast_path, slow_path, group):
3862 path = slow_path
-> 3863 res = slow_path(group)
3864
3865 # if we make it here, test if we can use the fast path
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(group)
3856 fast_path = lambda group: func(group, *args, **kwargs)
3857 slow_path = lambda group: group.apply(
-> 3858 lambda x: func(x, *args, **kwargs), axis=self.axis)
3859 return fast_path, slow_path
3860
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4260 f, axis,
4261 reduce=reduce,
-> 4262 ignore_failures=ignore_failures)
4263 else:
4264 return self._apply_broadcast(f, axis)
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4356 try:
4357 for i, v in enumerate(series_gen):
-> 4358 results[i] = func(v)
4359 keys.append(v.name)
4360 except Exception as e:
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
3856 fast_path = lambda group: func(group, *args, **kwargs)
3857 slow_path = lambda group: group.apply(
-> 3858 lambda x: func(x, *args, **kwargs), axis=self.axis)
3859 return fast_path, slow_path
3860
<ipython-input-58-181ef4bb1f30> in wavg(group, data_name, weight_name, sigma)
10
11 #Extracting data and weights.
---> 12 data = group[data_name]
13 weight = group[weight_name]
14 #Check whether we have actual weights, or measurement uncertainties
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
~/anaconda/envs/python36/lib/python3.6/site-
packages/pandas/core/indexes/base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477
tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: ('AvgTemp', 'occurred at index AvgTemp')
所以这显然不起作用,但我不清楚为什么。欢迎任何 pointers/solutions。
我可以使用 apply()
方法来获得所需的输出,但是由于我对组进行平均,所以我无法真正将其与原始数据框合并,因为 [=20] 生成的系列=] 将具有不同的大小。
transform
函数分别应用于每个组列。
在 wavg
中放入一个 print
语句将帮助您看到问题:
def wavg(group,data_name,weight_name, sigma=None):
print(group)
...
df['WeightedMonthlyMean'] = df.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')
打印
1 5.788
Name: AvgTemp, dtype: object
在提高 KeyError
之前。这表明 group
只是一个系列,而不是整个(组)DataFrame。
因此,改用 apply
,然后将 result
合并回 df
:
result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)
例如,
import pandas as pd
df = pd.DataFrame({'AvgTemp': [6.068, 5.787999999999999, 10.644, 14.050999999999998, 16.082, 12.780999999999999, 7.95, 4.638999999999999], 'AvgTempUncert': [1.7369999999999999, 3.6239999999999997, 1.2830000000000001, 1.347, 1.396, 1.454, 1.63, 1.3019999999999998], 'City': ['Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus'], 'Country': ['Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark'], 'Lat': ['57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N'], 'Long': ['10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E'], 'day': [1, 1, 1, 1, 1, 1, 1, 1], 'month': [11, 4, 5, 6, 7, 9, 10, 11], 'year': [1743, 1744, 1744, 1744, 1744, 1744, 1744, 1744]})
def wavg(group,data_name,weight_name, sigma=None):
data = group[data_name]
weight = group[weight_name]
#Check whether we have actual weights or measurement uncertainties
if sigma=='sigma':
weight = 1./weight
try:
return (data * weight).sum() / weight.sum()
except ZeroDivisionError:
return data.mean()
result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)
print(result)
产量
AvgTemp AvgTempUncert City Country Lat Long day month year wavg
0 6.068 1.737 Århus Denmark 57.05N 10.33E 1 11 1743 5.251227
1 4.639 1.302 Århus Denmark 57.05N 10.33E 1 11 1744 5.251227
2 5.788 3.624 Århus Denmark 57.05N 10.33E 1 4 1744 5.788000
3 10.644 1.283 Århus Denmark 57.05N 10.33E 1 5 1744 10.644000
4 14.051 1.347 Århus Denmark 57.05N 10.33E 1 6 1744 14.051000
5 16.082 1.396 Århus Denmark 57.05N 10.33E 1 7 1744 16.082000
6 12.781 1.454 Århus Denmark 57.05N 10.33E 1 9 1744 12.781000
7 7.950 1.630 Århus Denmark 57.05N 10.33E 1 10 1744 7.950000
如何使用 apply
然后 merge
将其放入相同的 DataFrame
?示例:
import numpy as np
import pandas as pd
data = pd.DataFrame({'City': np.random.randint(0, 4, 1000), 'Month': np.random.randint(1, 12, 1000), 'T': np.random.randn(1000)})
pd.merge(data, data.groupby(['City', 'Month']).apply(lambda x: x['T']*2).reset_index()[['City', 'Month', 'T']].rename(columns={'T': 'WeightedT'}), left_on=['City', 'Month'], right_on=['City', 'Month'])
我是 Python Pandas 的新手,我在将 Pandas GroupBy
与 transform
结合使用时遇到问题.我一直无法找到已经发布的答案,但我可能遗漏了一些东西。
我有一个包含大量条目的 DataFrame,结构如下:
GLT_City = pd.read_csv('GlobalLandTemperaturesByCity.csv', sep=',')
GLT_City.head()
AvgTemp AvgTempUncert City Country Lat Long year month day
0 6.068 1.737 Århus Denmark 57.05N 10.33E 1743 11 01
5 5.788 3.624 Århus Denmark 57.05N 10.33E 1744 04 01
6 10.644 1.283 Århus Denmark 57.05N 10.33E 1744 05 01
7 14.051 1.347 Århus Denmark 57.05N 10.33E 1744 06 01
8 16.082 1.396 Århus Denmark 57.05N 10.33E 1744 07 01
10 12.781 1.454 Århus Denmark 57.05N 10.33E 1744 09 01
11 7.950 1.630 Århus Denmark 57.05N 10.33E 1744 10 01
12 4.639 1.302 Århus Denmark 57.05N 10.33E 1744 11 01
我想计算每个城市每个月的加权平均温度,并使用 transform()
以最平滑的方式将其作为新列添加到我的原始数据框中,原因如下这条线。
首先,我定义了一个函数来计算加权平均值:
def wavg(group,data_name,weight_name, sigma=None):
data = group[data_name]
weight = group[weight_name]
#Check whether we have actual weights or measurement uncertainties
if sigma=='sigma':
weight = 1./weight
try:
return (data * weight).sum() / weight.sum()
except ZeroDivisionError:
return data.mean()
然后我想结合 GroupBy
和 transform()
将此函数应用于我的数据框并将结果添加为新列,例如:
GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')
现在这会导致复制粘贴下面的非常冗长的错误消息
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-61-cef679f52b5f> in <module>()
----> 1 GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg,
'AvgTemp','AvgTemp', sigma='sigma')
~/anaconda/envs/python36/lib/python3.6/site-
packages/pandas/core/groupby.py in transform(self, func, *args, **kwargs)
3814 result = getattr(self, func)(*args, **kwargs)
3815 else:
-> 3816 return self._transform_general(func, *args, **kwargs)
3817
3818 # a reduction transform
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _transform_general(self, func, *args, **kwargs)
3765 # Try slow path and fast path.
3766 try:
-> 3767 path, res = self._choose_path(fast_path, slow_path, group)
3768 except TypeError:
3769 return self._transform_item_by_item(obj, fast_path)
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _choose_path(self, fast_path, slow_path, group)
3861 def _choose_path(self, fast_path, slow_path, group):
3862 path = slow_path
-> 3863 res = slow_path(group)
3864
3865 # if we make it here, test if we can use the fast path
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(group)
3856 fast_path = lambda group: func(group, *args, **kwargs)
3857 slow_path = lambda group: group.apply(
-> 3858 lambda x: func(x, *args, **kwargs), axis=self.axis)
3859 return fast_path, slow_path
3860
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4260 f, axis,
4261 reduce=reduce,
-> 4262 ignore_failures=ignore_failures)
4263 else:
4264 return self._apply_broadcast(f, axis)
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4356 try:
4357 for i, v in enumerate(series_gen):
-> 4358 results[i] = func(v)
4359 keys.append(v.name)
4360 except Exception as e:
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
3856 fast_path = lambda group: func(group, *args, **kwargs)
3857 slow_path = lambda group: group.apply(
-> 3858 lambda x: func(x, *args, **kwargs), axis=self.axis)
3859 return fast_path, slow_path
3860
<ipython-input-58-181ef4bb1f30> in wavg(group, data_name, weight_name, sigma)
10
11 #Extracting data and weights.
---> 12 data = group[data_name]
13 weight = group[weight_name]
14 #Check whether we have actual weights, or measurement uncertainties
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
~/anaconda/envs/python36/lib/python3.6/site-
packages/pandas/core/indexes/base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477
tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: ('AvgTemp', 'occurred at index AvgTemp')
所以这显然不起作用,但我不清楚为什么。欢迎任何 pointers/solutions。
我可以使用 apply()
方法来获得所需的输出,但是由于我对组进行平均,所以我无法真正将其与原始数据框合并,因为 [=20] 生成的系列=] 将具有不同的大小。
transform
函数分别应用于每个组列。
在 wavg
中放入一个 print
语句将帮助您看到问题:
def wavg(group,data_name,weight_name, sigma=None):
print(group)
...
df['WeightedMonthlyMean'] = df.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')
打印
1 5.788
Name: AvgTemp, dtype: object
在提高 KeyError
之前。这表明 group
只是一个系列,而不是整个(组)DataFrame。
因此,改用 apply
,然后将 result
合并回 df
:
result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)
例如,
import pandas as pd
df = pd.DataFrame({'AvgTemp': [6.068, 5.787999999999999, 10.644, 14.050999999999998, 16.082, 12.780999999999999, 7.95, 4.638999999999999], 'AvgTempUncert': [1.7369999999999999, 3.6239999999999997, 1.2830000000000001, 1.347, 1.396, 1.454, 1.63, 1.3019999999999998], 'City': ['Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus'], 'Country': ['Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark'], 'Lat': ['57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N'], 'Long': ['10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E'], 'day': [1, 1, 1, 1, 1, 1, 1, 1], 'month': [11, 4, 5, 6, 7, 9, 10, 11], 'year': [1743, 1744, 1744, 1744, 1744, 1744, 1744, 1744]})
def wavg(group,data_name,weight_name, sigma=None):
data = group[data_name]
weight = group[weight_name]
#Check whether we have actual weights or measurement uncertainties
if sigma=='sigma':
weight = 1./weight
try:
return (data * weight).sum() / weight.sum()
except ZeroDivisionError:
return data.mean()
result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)
print(result)
产量
AvgTemp AvgTempUncert City Country Lat Long day month year wavg
0 6.068 1.737 Århus Denmark 57.05N 10.33E 1 11 1743 5.251227
1 4.639 1.302 Århus Denmark 57.05N 10.33E 1 11 1744 5.251227
2 5.788 3.624 Århus Denmark 57.05N 10.33E 1 4 1744 5.788000
3 10.644 1.283 Århus Denmark 57.05N 10.33E 1 5 1744 10.644000
4 14.051 1.347 Århus Denmark 57.05N 10.33E 1 6 1744 14.051000
5 16.082 1.396 Århus Denmark 57.05N 10.33E 1 7 1744 16.082000
6 12.781 1.454 Århus Denmark 57.05N 10.33E 1 9 1744 12.781000
7 7.950 1.630 Århus Denmark 57.05N 10.33E 1 10 1744 7.950000
如何使用 apply
然后 merge
将其放入相同的 DataFrame
?示例:
import numpy as np
import pandas as pd
data = pd.DataFrame({'City': np.random.randint(0, 4, 1000), 'Month': np.random.randint(1, 12, 1000), 'T': np.random.randn(1000)})
pd.merge(data, data.groupby(['City', 'Month']).apply(lambda x: x['T']*2).reset_index()[['City', 'Month', 'T']].rename(columns={'T': 'WeightedT'}), left_on=['City', 'Month'], right_on=['City', 'Month'])