使用 pandas 和 python 写入数据帧
using pandas with python to write to a dataframe
import pandas as pd
import re
import numpy as np
# Sample df2: only the Department text varies per row; the other three
# columns start out holding the 'Empty' placeholder string.
data = [
    ['Empty', department, 'Empty', 'Empty']
    for department in (
        'CMI-General Liability | 05-9362',
        'Central Operations',
        'Alarm Central 05-8642',
        'Market 466',
        'Talent, Experience',
        'Food Division',
        'Quality WMCC',
        'Modular Execution Team | 01-9700',
        'US Central Operations',
        'CE - Engineering - US',
        'Fresh, Freezer & Cooler - 18-8110',
        '9701',
        'Contact Center',
        'Central Operations',
        'US Central Operations',
        'Private Brands GM - 01-8683',
    )
]
df2 = pd.DataFrame(
    data,
    columns=['JobTitle', 'Department', 'TrueDepartment', 'Dept_Function'],
)

# Sample df5: the department lookup table, one row per Dept_Nbr.
data5 = [
    [1, 'TRUCKING, MARCY, NY', 'Empty', 'Empty'],
    [2, 'TRUCKING-GREENVILLE,TN', 'Empty', 'Empty'],
    [3, 'DC 40, HOPE MILLS, NC', 'Empty', 'Empty'],
    [4, 'TRUCKING, SHARON SPRINGS', 'Empty', 'Empty'],
    [5, 'DISP PAULS VALLEY OK FDC', 'Empty', 'Empty'],
    [6, 'COLDWATER, MI', 'Empty', 'Empty'],
    [7, 'AMERICOLD LOGISTICS', 'Empty', 'Empty'],
    [8, 'DFW3N FORT WORTH FC WHS.COM', 'Empty', 'Empty'],
    [9, 'PCCC CURRENTLY BEING REVIEWED', 'Empty', 'Empty'],
    [466, 'Springfield, MO', 'Empty', 'Empty'],
    [8110, 'Fresh Dept', 'Empty', 'Empty'],
    [8642, 'Security', 'Security & Compliance', 'Empty'],
    [8683, 'General Merchandise', 'Empty', 'Empty'],
    [9362, 'General Liability', 'Empty', 'Empty'],
    [9700, 'Execution Team', 'Empty', 'Empty'],
    [9701, 'Produce TN', 'Empty', 'Empty'],
]
df5 = pd.DataFrame(
    data5,
    columns=['Dept_Nbr', 'Dept_Desc_good', 'Dept_Desc_better', 'Dept_Abrv'],
)
df2 是数据帧 2
JobTitle Department TrueDepartment Dept_Function
CMI-General Liability | 05-9362
Central Operations
Alarm Central 05-8642
Market 466
Talent, Experience
Food Division
Quality WMCC
Modular Execution Team | 01-9700
US Central Operations
CE - Engineering - US
Fresh, Freezer & Cooler - 18-8110
9701
Contact Center
Central Operations
US Central Operations
Private Brands GM - 01-8683
df5 是 dataframe5
Dept_Nbr Dept_Desc_good Dept_Desc_better Dept_Abrv
1 TRUCKING, MARCY, NY
2 TRUCKING-GREENVILLE,TN
3 DC 40, HOPE MILLS, NC
4 TRUCKING, SHARON SPRINGS
5 DISP PAULS VALLEY OK FDC
6 COLDWATER, MI
7 AMERICOLD LOGISTICS
8 DFW3N FORT WORTH FC - WHS.COM
9 PCCC CURRENTLY BEING REVIEWED
466 Springfield, MO
8110 Fresh Dept
8642 Security Security & Compliance
8683 General Merchandise
9362 General Liability
9700 Execution Team
9701 Produce TN
运行代码后的期望结果:
JobTitle Department TrueDepartment
CMI-General Liability | 05-9362 General Liability
Central Operations
Alarm Central 05-8642 Security & Compliance
Market 466
Talent, Experience
Food Division
Quality WMCC
Modular Execution Team | 01-9700 Execution Team
US Central Operations
CE - Engineering - US
Fresh, Freezer & Cooler - 18-8110 Fresh Dept
9701 Produce TN
Contact Center
Central Operations
US Central Operations
Private Brands GM - 01-8683 General Merchandise
当前代码:
import pandas as pd
import re
# The asker's original attempt, kept exactly as posted (the paste lost its
# indentation). Intent: for every Dept_Nbr in df5, find the df2 rows whose
# Department text contains that number and copy the df5 description into
# df2['TrueDepartment'].
numbers = df5['Dept_Nbr'].tolist()
df5['Dept_Nbr'] = [int(i) for i in df5['Dept_Nbr']]
# NOTE(review): set_index returns a new DataFrame; the result is discarded
# here, so df5 keeps its original index.
df5.set_index('Dept_Nbr')
for n in numbers:
# NOTE(review): i is taken from df5.index but is used to address rows of df2.
for i in df5.index:
# n is an int while the Department cell is a str, so this `in` test raises
# the TypeError reported right after this snippet.
if n in df2.loc[i, 'Department']:
if df5.at[int(n), 'Dept_Desc_better']: # if a value exists -- NOTE(review): the sample data holds the string 'Empty', which is truthy
# NOTE(review): .at is subscripted with [] on the line above but called with
# () on the two assignments below -- presumably a typo.
df2.at[i, 'TrueDepartment'] = df5.at(int(n), 'Dept_Desc_better')
else:
df2.at[i, 'TrueDepartment'] = df5.at(int(n), 'Dept_Desc_good')
运行时出现错误:TypeError: 'in <string>' requires string as left operand, not int
我想我应该尝试将 n 更改为字符串类型?
此外,我还必须弄清楚如何在 df2 的 "Department" 列中找到这样的子字符串:它要么跟在连字符后面,要么是单元格中唯一的数字(例如 9701)。我可能需要为此使用正则表达式(re)。对于 df2 中的第一个部门,代码应找到字符串 "9362",将其与 df5 中的 Dept_Nbr 匹配,并把 "General Liability" 写入 TrueDepartment 列。df5 中的 Dept_Nbr 实际上是从 1 到 10000 以上的连续编号。
根据阿姆斯特朗先生的建议对我的代码进行更改后的最新错误...只有在我实际的完整数据帧上使用时才会出现错误,而不是在我提供的示例数据帧上使用时出现错误。
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3062 try:
-> 3063 return self._engine.get_loc(key)
3064 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Dept_Nbr'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-14-89dd44993593> in <module>()
----> 1 numbers = df5['Dept_Nbr'].tolist()
2 df5['Dept_Nbr'] = [int(i) for i in df5['Dept_Nbr']]
3 df5 = df5.set_index('Dept_Nbr') #<-- need to actually set df5 to the new index
4
5 for n in numbers:
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in
__getitem__(self, key)
2683 return self._getitem_multilevel(key)
2684 else:
-> 2685 return self._getitem_column(key)
2686
2687 def _getitem_column(self, key):
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in
_getitem_column(self, key)
2690 # get column
2691 if self.columns.is_unique:
-> 2692 return self._get_item_cache(key)
2693
2694 # duplicate columns & possible reduce dimensionality
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in
_get_item_cache(self, item)
2484 res = cache.get(item)
2485 if res is None:
-> 2486 values = self._data.get(item)
2487 res = self._box_item_values(item, values)
2488 cache[item] = res
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3063 return self._engine.get_loc(key)
3064 except KeyError:
-> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key))
3066
3067 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Dept_Nbr'
首先 - 您上面的数据框与您的数据框构造不匹配。我花了很长时间才弄明白为什么 9362 != 9362
:-)
这里有一些要考虑的事情:
# Fill df2['TrueDepartment'] with the df5 description of the department number
# found in each df2['Department'] string. Dept_Desc_better is preferred;
# Dept_Desc_good is the fallback when the former is the "Empty" placeholder.
if df5.index.name != 'Dept_Nbr':
    # Skipped when the snippet is re-run: after set_index, Dept_Nbr is the
    # index rather than a column, and df5['Dept_Nbr'] raises KeyError.
    df5['Dept_Nbr'] = [int(i) for i in df5['Dept_Nbr']]
    df5 = df5.set_index('Dept_Nbr')  # set_index returns a new frame, so rebind df5
numbers = df5.index.tolist()

# The number either follows a hyphen at the end of the cell ("... 05-9362")
# or is the whole cell ("9701"). A trailing number after a space
# ("Market 466") is deliberately not matched, as in the desired output.
# No fixed digit count is assumed, since Dept_Nbr runs from 1 to over 10000.
dept_nbr_pattern = re.compile(r'(?:-|^)\s*(\d+)\s*$')

# Walk df2's own rows once; the lookup into df5 goes through its index.
for i, department in df2['Department'].items():
    if not isinstance(department, str):
        continue  # missing / non-text cell: nothing to look up
    match = dept_nbr_pattern.search(department)
    if match is None:
        continue
    n = int(match.group(1))
    if n not in df5.index:
        continue  # number has no entry in the lookup table
    if df5.loc[n, 'Dept_Desc_better'] != "Empty":  # placeholder is the string "Empty", not NaN
        df2.loc[i, 'TrueDepartment'] = df5.loc[n, 'Dept_Desc_better']
    else:
        df2.loc[i, 'TrueDepartment'] = df5.loc[n, 'Dept_Desc_good']
df2 = df2.replace(to_replace="Empty", value="")  # the desired output shows '' rather than 'Empty'
这里假设 df2 的 Department 列中的描述始终以 Dept_Nbr 结尾,并且该编号的长度始终为 4 个字符。输出如下:
df2
JobTitle Department TrueDepartment Dept_Function
0 CMI-General Liability | 05-9632
1 Central Operations
2 Alarm Central 05-8642 Security & Compliance
3 Market 466
4 Talent, Experience
5 Food Division
6 Quality WMCC
7 Modular Execution Team | 01-9700 Execution Team
8 US Central Operations
9 CE - Engineering - US
10 Fresh, Freezer & Cooler - 18-8110 Fresh Dept
11 9701 Produce TN
12 Contact Center
13 Central Operations
14 US Central Operations
15 Private Brands GM - 01-8683 General Merchandise
顺便说一句 - 代码确实有效,索引行 0 中没有错误 - 这是您的数据帧不同的地方。我也尽可能多地保留了您的代码,但我猜想还有更好的迭代方法。
import pandas as pd
import re
import numpy as np
# Sample df2: only the Department text varies per row; the other three
# columns start out holding the 'Empty' placeholder string.
data = [
    ['Empty', department, 'Empty', 'Empty']
    for department in (
        'CMI-General Liability | 05-9362',
        'Central Operations',
        'Alarm Central 05-8642',
        'Market 466',
        'Talent, Experience',
        'Food Division',
        'Quality WMCC',
        'Modular Execution Team | 01-9700',
        'US Central Operations',
        'CE - Engineering - US',
        'Fresh, Freezer & Cooler - 18-8110',
        '9701',
        'Contact Center',
        'Central Operations',
        'US Central Operations',
        'Private Brands GM - 01-8683',
    )
]
df2 = pd.DataFrame(
    data,
    columns=['JobTitle', 'Department', 'TrueDepartment', 'Dept_Function'],
)

# Sample df5: the department lookup table, one row per Dept_Nbr.
data5 = [
    [1, 'TRUCKING, MARCY, NY', 'Empty', 'Empty'],
    [2, 'TRUCKING-GREENVILLE,TN', 'Empty', 'Empty'],
    [3, 'DC 40, HOPE MILLS, NC', 'Empty', 'Empty'],
    [4, 'TRUCKING, SHARON SPRINGS', 'Empty', 'Empty'],
    [5, 'DISP PAULS VALLEY OK FDC', 'Empty', 'Empty'],
    [6, 'COLDWATER, MI', 'Empty', 'Empty'],
    [7, 'AMERICOLD LOGISTICS', 'Empty', 'Empty'],
    [8, 'DFW3N FORT WORTH FC WHS.COM', 'Empty', 'Empty'],
    [9, 'PCCC CURRENTLY BEING REVIEWED', 'Empty', 'Empty'],
    [466, 'Springfield, MO', 'Empty', 'Empty'],
    [8110, 'Fresh Dept', 'Empty', 'Empty'],
    [8642, 'Security', 'Security & Compliance', 'Empty'],
    [8683, 'General Merchandise', 'Empty', 'Empty'],
    [9362, 'General Liability', 'Empty', 'Empty'],
    [9700, 'Execution Team', 'Empty', 'Empty'],
    [9701, 'Produce TN', 'Empty', 'Empty'],
]
df5 = pd.DataFrame(
    data5,
    columns=['Dept_Nbr', 'Dept_Desc_good', 'Dept_Desc_better', 'Dept_Abrv'],
)
df2 是数据帧 2
JobTitle Department TrueDepartment Dept_Function
CMI-General Liability | 05-9362
Central Operations
Alarm Central 05-8642
Market 466
Talent, Experience
Food Division
Quality WMCC
Modular Execution Team | 01-9700
US Central Operations
CE - Engineering - US
Fresh, Freezer & Cooler - 18-8110
9701
Contact Center
Central Operations
US Central Operations
Private Brands GM - 01-8683
df5 是 dataframe5
Dept_Nbr Dept_Desc_good Dept_Desc_better Dept_Abrv
1 TRUCKING, MARCY, NY
2 TRUCKING-GREENVILLE,TN
3 DC 40, HOPE MILLS, NC
4 TRUCKING, SHARON SPRINGS
5 DISP PAULS VALLEY OK FDC
6 COLDWATER, MI
7 AMERICOLD LOGISTICS
8 DFW3N FORT WORTH FC - WHS.COM
9 PCCC CURRENTLY BEING REVIEWED
466 Springfield, MO
8110 Fresh Dept
8642 Security Security & Compliance
8683 General Merchandise
9362 General Liability
9700 Execution Team
9701 Produce TN
运行代码后的期望结果:
JobTitle Department TrueDepartment
CMI-General Liability | 05-9362 General Liability
Central Operations
Alarm Central 05-8642 Security & Compliance
Market 466
Talent, Experience
Food Division
Quality WMCC
Modular Execution Team | 01-9700 Execution Team
US Central Operations
CE - Engineering - US
Fresh, Freezer & Cooler - 18-8110 Fresh Dept
9701 Produce TN
Contact Center
Central Operations
US Central Operations
Private Brands GM - 01-8683 General Merchandise
当前代码:
import pandas as pd
import re
# The asker's original attempt, kept exactly as posted (the paste lost its
# indentation). Intent: for every Dept_Nbr in df5, find the df2 rows whose
# Department text contains that number and copy the df5 description into
# df2['TrueDepartment'].
numbers = df5['Dept_Nbr'].tolist()
df5['Dept_Nbr'] = [int(i) for i in df5['Dept_Nbr']]
# NOTE(review): set_index returns a new DataFrame; the result is discarded
# here, so df5 keeps its original index.
df5.set_index('Dept_Nbr')
for n in numbers:
# NOTE(review): i is taken from df5.index but is used to address rows of df2.
for i in df5.index:
# n is an int while the Department cell is a str, so this `in` test raises
# the TypeError reported right after this snippet.
if n in df2.loc[i, 'Department']:
if df5.at[int(n), 'Dept_Desc_better']: # if a value exists -- NOTE(review): the sample data holds the string 'Empty', which is truthy
# NOTE(review): .at is subscripted with [] on the line above but called with
# () on the two assignments below -- presumably a typo.
df2.at[i, 'TrueDepartment'] = df5.at(int(n), 'Dept_Desc_better')
else:
df2.at[i, 'TrueDepartment'] = df5.at(int(n), 'Dept_Desc_good')
运行时出现错误:TypeError: 'in <string>' requires string as left operand, not int
我想我应该尝试将 n 更改为字符串类型?
此外,我还必须弄清楚如何在 df2 的 "Department" 列中找到这样的子字符串:它要么跟在连字符后面,要么是单元格中唯一的数字(例如 9701)。我可能需要为此使用正则表达式(re)。对于 df2 中的第一个部门,代码应找到字符串 "9362",将其与 df5 中的 Dept_Nbr 匹配,并把 "General Liability" 写入 TrueDepartment 列。df5 中的 Dept_Nbr 实际上是从 1 到 10000 以上的连续编号。
根据阿姆斯特朗先生的建议对我的代码进行更改后的最新错误...只有在我实际的完整数据帧上使用时才会出现错误,而不是在我提供的示例数据帧上使用时出现错误。
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3062 try:
-> 3063 return self._engine.get_loc(key)
3064 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Dept_Nbr'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-14-89dd44993593> in <module>()
----> 1 numbers = df5['Dept_Nbr'].tolist()
2 df5['Dept_Nbr'] = [int(i) for i in df5['Dept_Nbr']]
3 df5 = df5.set_index('Dept_Nbr') #<-- need to actually set df5 to the new index
4
5 for n in numbers:
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in
__getitem__(self, key)
2683 return self._getitem_multilevel(key)
2684 else:
-> 2685 return self._getitem_column(key)
2686
2687 def _getitem_column(self, key):
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in
_getitem_column(self, key)
2690 # get column
2691 if self.columns.is_unique:
-> 2692 return self._get_item_cache(key)
2693
2694 # duplicate columns & possible reduce dimensionality
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in
_get_item_cache(self, item)
2484 res = cache.get(item)
2485 if res is None:
-> 2486 values = self._data.get(item)
2487 res = self._box_item_values(item, values)
2488 cache[item] = res
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3063 return self._engine.get_loc(key)
3064 except KeyError:
-> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key))
3066
3067 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Dept_Nbr'
首先 - 您上面的数据框与您的数据框构造不匹配。我花了很长时间才弄明白为什么 9362 != 9362
:-)
这里有一些要考虑的事情:
# Fill df2['TrueDepartment'] with the df5 description of the department number
# found in each df2['Department'] string. Dept_Desc_better is preferred;
# Dept_Desc_good is the fallback when the former is the "Empty" placeholder.
if df5.index.name != 'Dept_Nbr':
    # Skipped when the snippet is re-run: after set_index, Dept_Nbr is the
    # index rather than a column, and df5['Dept_Nbr'] raises KeyError.
    df5['Dept_Nbr'] = [int(i) for i in df5['Dept_Nbr']]
    df5 = df5.set_index('Dept_Nbr')  # set_index returns a new frame, so rebind df5
numbers = df5.index.tolist()

# The number either follows a hyphen at the end of the cell ("... 05-9362")
# or is the whole cell ("9701"). A trailing number after a space
# ("Market 466") is deliberately not matched, as in the desired output.
# No fixed digit count is assumed, since Dept_Nbr runs from 1 to over 10000.
dept_nbr_pattern = re.compile(r'(?:-|^)\s*(\d+)\s*$')

# Walk df2's own rows once; the lookup into df5 goes through its index.
for i, department in df2['Department'].items():
    if not isinstance(department, str):
        continue  # missing / non-text cell: nothing to look up
    match = dept_nbr_pattern.search(department)
    if match is None:
        continue
    n = int(match.group(1))
    if n not in df5.index:
        continue  # number has no entry in the lookup table
    if df5.loc[n, 'Dept_Desc_better'] != "Empty":  # placeholder is the string "Empty", not NaN
        df2.loc[i, 'TrueDepartment'] = df5.loc[n, 'Dept_Desc_better']
    else:
        df2.loc[i, 'TrueDepartment'] = df5.loc[n, 'Dept_Desc_good']
df2 = df2.replace(to_replace="Empty", value="")  # the desired output shows '' rather than 'Empty'
这里假设 df2 的 Department 列中的描述始终以 Dept_Nbr 结尾,并且该编号的长度始终为 4 个字符。输出如下:
df2
JobTitle Department TrueDepartment Dept_Function
0 CMI-General Liability | 05-9632
1 Central Operations
2 Alarm Central 05-8642 Security & Compliance
3 Market 466
4 Talent, Experience
5 Food Division
6 Quality WMCC
7 Modular Execution Team | 01-9700 Execution Team
8 US Central Operations
9 CE - Engineering - US
10 Fresh, Freezer & Cooler - 18-8110 Fresh Dept
11 9701 Produce TN
12 Contact Center
13 Central Operations
14 US Central Operations
15 Private Brands GM - 01-8683 General Merchandise
顺便说一句 - 代码确实有效,索引行 0 中没有错误 - 这是您的数据帧不同的地方。我也尽可能多地保留了您的代码,但我猜想还有更好的迭代方法。