如何提取指定列值组合重复的数据框行?
How to extract the rows of a dataframe where a combination of specified column values is duplicated?
假设我有以下数据框:
import pandas as pd
data = {'Year':[2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018],
'Month':[1,1,1,2,2,3,3,3],
'ID':['A', 'A', 'B', 'A', 'B', 'A', 'B', 'B'],
'Fruit':['Apple', 'Banana', 'Apple', 'Pear', 'Mango', 'Banana', 'Apple', 'Mango']}
df = pd.DataFrame(data, columns=['Year', 'Month', 'ID', 'Fruit'])
df = df.astype(str)
df
我想提取'Year'、'Month'和'ID'的组合出现重复的那些行。因此,对于上述数据框,预期结果是此数据框:
Year Month ID Fruit
0 2018 1 A Apple
1 2018 1 A Banana
6 2018 3 B Apple
7 2018 3 B Mango
我的做法是先做一个 groupby,计算 Year、Month 和 ID 的组合出现的次数:
df2 = df.groupby(['Year', 'Month'])['ID'].value_counts().to_frame(name = 'Count').reset_index()
df2 = df2[df2.Count>1]
df2
然后,我的想法是遍历 groupby 结果数据框中的 Year、Month 和 ID 组合,并把原始数据框中与这些组合匹配的行提取到一个新的数据框中:
df_new = pd.DataFrame(columns=df.columns, index=range(sum(df2.Count)))
count = 0
for i in df2.index:
temp = df[(df.ID==df2.ID[i]) & (df.Year==df2.Year[i]) & (df.Month==df2.Month[i])]
temp.reset_index(drop=True, inplace=True)
for j in range(len(temp)):
df_new.iloc[count] = temp.iloc[j]
count+=1
df_new
但这会产生以下错误:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-38-7f2d95d71270> in <module>()
6 temp.reset_index(drop=True, inplace=True)
7 for j in range(len(temp)):
----> 8 df_new.iloc[count] = temp.iloc[j]
9 count+=1
10 df_new
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 else:
188 key = com.apply_if_callable(key, self.obj)
--> 189 indexer = self._get_setitem_indexer(key)
190 self._setitem_with_indexer(indexer, value)
191
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in _get_setitem_indexer(self, key)
173
174 try:
--> 175 return self._convert_to_indexer(key, is_setter=True)
176 except TypeError as e:
177
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
2245
2246 try:
-> 2247 self._validate_key(obj, axis)
2248 return obj
2249 except ValueError:
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in _validate_key(self, key, axis)
2068 return
2069 elif is_integer(key):
-> 2070 self._validate_integer(key, axis)
2071 elif isinstance(key, tuple):
2072 # a tuple should already have been caught by this point
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in _validate_integer(self, key, axis)
2137 len_axis = len(self.obj._get_axis(axis))
2138 if key >= len_axis or key < -len_axis:
-> 2139 raise IndexError("single positional indexer is out-of-bounds")
2140
2141 def _getitem_tuple(self, tup):
IndexError: single positional indexer is out-of-bounds
错误是什么?我想不通。
当我将内层 for 循环的内容更改为以下内容时,错误消失了,并产生了预期的结果:
for j in range(len(temp)):
df_new.ID[count] = temp.ID[j]
df_new.Year[count] = temp.Year[j]
df_new.Month[count] = temp.Month[j]
df_new.Fruit[count] = temp.Fruit[j]
count+=1
但这是一个繁琐的解决方法:原始数据框有 n 列,就需要为每一列各写一行,总共要写 n 行。
对任意一列使用 GroupBy.transform,并通过 GroupBy.size 计数,可以得到与原始数据框长度相同的 Series,因此可以用布尔索引(boolean indexing)进行筛选:
df1 = df[df.groupby(['Year','Month','ID'])['ID'].transform('size') > 1]
如果 DataFrame 较小,或者性能不重要,也可以使用 DataFrameGroupBy.filter:
df1 = df.groupby(['Year','Month','ID']).filter(lambda x: len(x) > 1)
print (df1)
Year Month ID Fruit
0 2018 1 A Apple
1 2018 1 A Banana
6 2018 3 B Apple
7 2018 3 B Mango
您可以使用 duplicated 方法,并传入参数 keep=False,来选出所有重复项:
df[df.duplicated(subset=['Year', 'Month', 'ID'], keep=False)]
输出:
Year Month ID Fruit
0 2018 1 A Apple
1 2018 1 A Banana
6 2018 3 B Apple
7 2018 3 B Mango
假设我有以下数据框:
import pandas as pd
data = {'Year':[2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018],
'Month':[1,1,1,2,2,3,3,3],
'ID':['A', 'A', 'B', 'A', 'B', 'A', 'B', 'B'],
'Fruit':['Apple', 'Banana', 'Apple', 'Pear', 'Mango', 'Banana', 'Apple', 'Mango']}
df = pd.DataFrame(data, columns=['Year', 'Month', 'ID', 'Fruit'])
df = df.astype(str)
df
我想提取'Year'、'Month'和'ID'的组合出现重复的那些行。因此,对于上述数据框,预期结果是此数据框:
Year Month ID Fruit
0 2018 1 A Apple
1 2018 1 A Banana
6 2018 3 B Apple
7 2018 3 B Mango
我的做法是先做一个 groupby,计算 Year、Month 和 ID 的组合出现的次数:
df2 = df.groupby(['Year', 'Month'])['ID'].value_counts().to_frame(name = 'Count').reset_index()
df2 = df2[df2.Count>1]
df2
然后,我的想法是遍历 groupby 结果数据框中的 Year、Month 和 ID 组合,并把原始数据框中与这些组合匹配的行提取到一个新的数据框中:
df_new = pd.DataFrame(columns=df.columns, index=range(sum(df2.Count)))
count = 0
for i in df2.index:
temp = df[(df.ID==df2.ID[i]) & (df.Year==df2.Year[i]) & (df.Month==df2.Month[i])]
temp.reset_index(drop=True, inplace=True)
for j in range(len(temp)):
df_new.iloc[count] = temp.iloc[j]
count+=1
df_new
但这会产生以下错误:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-38-7f2d95d71270> in <module>()
6 temp.reset_index(drop=True, inplace=True)
7 for j in range(len(temp)):
----> 8 df_new.iloc[count] = temp.iloc[j]
9 count+=1
10 df_new
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 else:
188 key = com.apply_if_callable(key, self.obj)
--> 189 indexer = self._get_setitem_indexer(key)
190 self._setitem_with_indexer(indexer, value)
191
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in _get_setitem_indexer(self, key)
173
174 try:
--> 175 return self._convert_to_indexer(key, is_setter=True)
176 except TypeError as e:
177
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
2245
2246 try:
-> 2247 self._validate_key(obj, axis)
2248 return obj
2249 except ValueError:
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in _validate_key(self, key, axis)
2068 return
2069 elif is_integer(key):
-> 2070 self._validate_integer(key, axis)
2071 elif isinstance(key, tuple):
2072 # a tuple should already have been caught by this point
c:\users\h473\appdata\local\programs\python\python35\lib\site-packages\pandas\core\indexing.py in _validate_integer(self, key, axis)
2137 len_axis = len(self.obj._get_axis(axis))
2138 if key >= len_axis or key < -len_axis:
-> 2139 raise IndexError("single positional indexer is out-of-bounds")
2140
2141 def _getitem_tuple(self, tup):
IndexError: single positional indexer is out-of-bounds
错误是什么?我想不通。
当我将内层 for 循环的内容更改为以下内容时,错误消失了,并产生了预期的结果:
for j in range(len(temp)):
df_new.ID[count] = temp.ID[j]
df_new.Year[count] = temp.Year[j]
df_new.Month[count] = temp.Month[j]
df_new.Fruit[count] = temp.Fruit[j]
count+=1
但这是一个繁琐的解决方法:原始数据框有 n 列,就需要为每一列各写一行,总共要写 n 行。
对任意一列使用 GroupBy.transform,并通过 GroupBy.size 计数,可以得到与原始数据框长度相同的 Series,因此可以用布尔索引(boolean indexing)进行筛选:
df1 = df[df.groupby(['Year','Month','ID'])['ID'].transform('size') > 1]
如果 DataFrame 较小,或者性能不重要,也可以使用 DataFrameGroupBy.filter:
df1 = df.groupby(['Year','Month','ID']).filter(lambda x: len(x) > 1)
print (df1)
Year Month ID Fruit
0 2018 1 A Apple
1 2018 1 A Banana
6 2018 3 B Apple
7 2018 3 B Mango
您可以使用 duplicated 方法,并传入参数 keep=False,来选出所有重复项:
df[df.duplicated(subset=['Year', 'Month', 'ID'], keep=False)]
输出:
Year Month ID Fruit
0 2018 1 A Apple
1 2018 1 A Banana
6 2018 3 B Apple
7 2018 3 B Mango