迭代两个 pandas 数据帧并逐行比较名称相似的列
Iterate over two pandas dataframes and compare similarly named columns by row
我有两个 DataFrame(df1 和 df2),我想创建第三个 DataFrame:根据 df2 中相同位置(行 × 列)的值,给 df1 中对应的值追加一个字符串。我的 for 循环似乎没有正常工作,没有按预期追加字符串。我的 for 循环哪里出错了?我试过 .iterrows() 和 .iteritems() 的几种写法,都无济于事。
# Question snippet: builds final_df by appending '*' markers to the string
# values of df1, based on the numeric values of df2.
# NOTE(review): pd and np are used here without an import in this snippet;
# presumably `import pandas as pd` / `import numpy as np` ran earlier.
# NOTE(review): the loop bodies below are shown without indentation.
data1 = {'col1' :['2.28', '-0.38', '0.16', '-0.00', '-0.11', '0.00', '-0.00','0.92', '0.58', '0.90', '0.80'],
'col2': ['2.23', '-0.38', '0.17', '-0.00', '-0.10', '0.00', '-0.00','0.89', '0.57', '0.89', '0.77'],
'col3': ['2.25', '-0.31', '0.17', '0.00', '-0.10', '0.00', '-0.00','0.88', '0.55', '0.89', '0.78'],
'col4': ['2.22', '-0.16', '0.17', '0.00', '-0.08', '0.00', '-0.00','0.85', '0.52', '0.85', '0.76']}
df1 = pd.DataFrame(data1)
data2 = {'col1' :[0.0043463 , 0.02835221, 0.01998397, 0.95802428, 0.30099018,
0.88572679, 0.49670165, 0.24806438, 0.46116773, 0.26479583,
0.30652824],
'col2': [0.00557248, 0.14322473, 0.01656108, 0.99548637, 0.32824552,
0.78092598, 0.45962261, 0.26562723, 0.47006285, 0.27588199,
0.32995383],
'col3': [0.00516395, 0.1432596 , 0.01855883, 0.95806069, 0.33487223,
0.90526158, 0.373449 , 0.27230896, 0.48722896, 0.27608198,
0.323059 ],
'col4': [0.00476079, 0.08205838, 0.03080842, 0.8642895 , 0.46951708,
0.75946821, 0.25630978, 0.30713967, 0.52637885, 0.29876842,
0.36094742]}
df2 = pd.DataFrame(data2)
final_df = pd.DataFrame()
# Iterating a DataFrame yields its column labels; idx1 and idx2 are never used.
for idx1,c in enumerate(df1):
# NOTE(review): this inner loop pairs column c of df1 with EVERY column p of
# df2 and assigns final_df[c] each time, so only the result for the last
# column of df2 survives instead of the same-named column.
for idx2,p in enumerate(df2):
# NOTE(review): the `< .05` test is evaluated first, so any value below .01 or
# .001 already matches it and gets a single '*'. The nested '**' / '***'
# branches are only consulted where df2[p] >= .05, where their conditions
# cannot hold, so they never take effect.
final_df[c] = np.where(df2[p] < .05, df1[c]+'*',
np.where(df2[p] <.01,df1[c]+'**',
np.where(df2[p] <.001 , df1[c]+'***',df1[c])))
# final_df is already a DataFrame at this point; it is wrapped again here.
final_df = pd.DataFrame(final_df)
final_df
期望的输出:
col1 col2 col3 col4
0 2.28** 2.23** 2.25** 2.22**
1 -0.38* -0.38 -0.31 -0.16
2 0.16* 0.17* 0.17* 0.17*
3 -0.00 -0.00 0.00 0.00
4 -0.11 -0.10 -0.10 -0.08
我认为这不是最佳答案,但我相信它可以达到您想要的结果。
首先,我看不出有任何理由遍历 df2,因为您正在使用 np.where 比较两个数据帧中的同一列。
import pandas as pd
import numpy as np
# Answer snippet: append one '*' to each string value of df1 for every
# threshold (.05, .01, .001) that the value at the same row/column of df2
# falls below. df2 presumably holds p-values; the code only relies on it
# being numeric with the same column names as df1.
data1 = {
    'col1': ['2.28', '-0.38', '0.16', '-0.00', '-0.11', '0.00', '-0.00', '0.92', '0.58', '0.90', '0.80'],
    'col2': ['2.23', '-0.38', '0.17', '-0.00', '-0.10', '0.00', '-0.00', '0.89', '0.57', '0.89', '0.77'],
    'col3': ['2.25', '-0.31', '0.17', '0.00', '-0.10', '0.00', '-0.00', '0.88', '0.55', '0.89', '0.78'],
    'col4': ['2.22', '-0.16', '0.17', '0.00', '-0.08', '0.00', '-0.00', '0.85', '0.52', '0.85', '0.76'],
}
df1 = pd.DataFrame(data1)

data2 = {
    'col1': [0.0043463, 0.02835221, 0.01998397, 0.95802428, 0.30099018,
             0.88572679, 0.49670165, 0.24806438, 0.46116773, 0.26479583,
             0.30652824],
    'col2': [0.00557248, 0.14322473, 0.01656108, 0.99548637, 0.32824552,
             0.78092598, 0.45962261, 0.26562723, 0.47006285, 0.27588199,
             0.32995383],
    'col3': [0.00516395, 0.1432596, 0.01855883, 0.95806069, 0.33487223,
             0.90526158, 0.373449, 0.27230896, 0.48722896, 0.27608198,
             0.323059],
    'col4': [0.00476079, 0.08205838, 0.03080842, 0.8642895, 0.46951708,
             0.75946821, 0.25630978, 0.30713967, 0.52637885, 0.29876842,
             0.36094742],
}
df2 = pd.DataFrame(data2)

final_df = pd.DataFrame()
# Iterating a DataFrame yields its column labels, so no enumerate() is needed;
# the same label c selects the matching column in both frames.
for c in df1:
    # Add one '*' per level. After the first modification, build on
    # final_df[c] (not df1[c]) so stars that were already added are kept.
    final_df[c] = np.where(df2[c] < .05, df1[c] + '*', df1[c])
    final_df[c] = np.where(df2[c] < .01, final_df[c] + '*', final_df[c])
    final_df[c] = np.where(df2[c] < .001, final_df[c] + '*', final_df[c])

# Bare expression so a notebook/REPL displays the result.
final_df
输出:
col1 col2 col3 col4
[0: 2.28** 2.23** 2.25** 2.22**],
[1: -0.38* -0.38 -0.31 -0.16],
[2: 0.16* 0.17* 0.17* 0.17*],
[3: -0.00 -0.00 0.00 0.00],
[4: -0.11 -0.10 -0.10 -0.08],
[5: 0.00 0.00 0.00 0.00],
[6: -0.00 -0.00 -0.00 -0.00],
[7: 0.92 0.89 0.88 0.85]...
我有两个 DataFrame(df1 和 df2),我想创建第三个 DataFrame:根据 df2 中相同位置(行 × 列)的值,给 df1 中对应的值追加一个字符串。我的 for 循环似乎没有正常工作,没有按预期追加字符串。我的 for 循环哪里出错了?我试过 .iterrows() 和 .iteritems() 的几种写法,都无济于事。
# Question snippet: builds final_df by appending '*' markers to the string
# values of df1, based on the numeric values of df2.
# NOTE(review): pd and np are used here without an import in this snippet;
# presumably `import pandas as pd` / `import numpy as np` ran earlier.
# NOTE(review): the loop bodies below are shown without indentation.
data1 = {'col1' :['2.28', '-0.38', '0.16', '-0.00', '-0.11', '0.00', '-0.00','0.92', '0.58', '0.90', '0.80'],
'col2': ['2.23', '-0.38', '0.17', '-0.00', '-0.10', '0.00', '-0.00','0.89', '0.57', '0.89', '0.77'],
'col3': ['2.25', '-0.31', '0.17', '0.00', '-0.10', '0.00', '-0.00','0.88', '0.55', '0.89', '0.78'],
'col4': ['2.22', '-0.16', '0.17', '0.00', '-0.08', '0.00', '-0.00','0.85', '0.52', '0.85', '0.76']}
df1 = pd.DataFrame(data1)
data2 = {'col1' :[0.0043463 , 0.02835221, 0.01998397, 0.95802428, 0.30099018,
0.88572679, 0.49670165, 0.24806438, 0.46116773, 0.26479583,
0.30652824],
'col2': [0.00557248, 0.14322473, 0.01656108, 0.99548637, 0.32824552,
0.78092598, 0.45962261, 0.26562723, 0.47006285, 0.27588199,
0.32995383],
'col3': [0.00516395, 0.1432596 , 0.01855883, 0.95806069, 0.33487223,
0.90526158, 0.373449 , 0.27230896, 0.48722896, 0.27608198,
0.323059 ],
'col4': [0.00476079, 0.08205838, 0.03080842, 0.8642895 , 0.46951708,
0.75946821, 0.25630978, 0.30713967, 0.52637885, 0.29876842,
0.36094742]}
df2 = pd.DataFrame(data2)
final_df = pd.DataFrame()
# Iterating a DataFrame yields its column labels; idx1 and idx2 are never used.
for idx1,c in enumerate(df1):
# NOTE(review): this inner loop pairs column c of df1 with EVERY column p of
# df2 and assigns final_df[c] each time, so only the result for the last
# column of df2 survives instead of the same-named column.
for idx2,p in enumerate(df2):
# NOTE(review): the `< .05` test is evaluated first, so any value below .01 or
# .001 already matches it and gets a single '*'. The nested '**' / '***'
# branches are only consulted where df2[p] >= .05, where their conditions
# cannot hold, so they never take effect.
final_df[c] = np.where(df2[p] < .05, df1[c]+'*',
np.where(df2[p] <.01,df1[c]+'**',
np.where(df2[p] <.001 , df1[c]+'***',df1[c])))
# final_df is already a DataFrame at this point; it is wrapped again here.
final_df = pd.DataFrame(final_df)
final_df
期望的输出:
col1 col2 col3 col4
0 2.28** 2.23** 2.25** 2.22**
1 -0.38* -0.38 -0.31 -0.16
2 0.16* 0.17* 0.17* 0.17*
3 -0.00 -0.00 0.00 0.00
4 -0.11 -0.10 -0.10 -0.08
我认为这不是最佳答案,但我相信它可以达到您想要的结果。
首先,我看不出有任何理由遍历 df2,因为您正在使用 np.where 比较两个数据帧中的同一列。
import pandas as pd
import numpy as np
# Answer snippet: append one '*' to each string value of df1 for every
# threshold (.05, .01, .001) that the value at the same row/column of df2
# falls below. df2 presumably holds p-values; the code only relies on it
# being numeric with the same column names as df1.
data1 = {
    'col1': ['2.28', '-0.38', '0.16', '-0.00', '-0.11', '0.00', '-0.00', '0.92', '0.58', '0.90', '0.80'],
    'col2': ['2.23', '-0.38', '0.17', '-0.00', '-0.10', '0.00', '-0.00', '0.89', '0.57', '0.89', '0.77'],
    'col3': ['2.25', '-0.31', '0.17', '0.00', '-0.10', '0.00', '-0.00', '0.88', '0.55', '0.89', '0.78'],
    'col4': ['2.22', '-0.16', '0.17', '0.00', '-0.08', '0.00', '-0.00', '0.85', '0.52', '0.85', '0.76'],
}
df1 = pd.DataFrame(data1)

data2 = {
    'col1': [0.0043463, 0.02835221, 0.01998397, 0.95802428, 0.30099018,
             0.88572679, 0.49670165, 0.24806438, 0.46116773, 0.26479583,
             0.30652824],
    'col2': [0.00557248, 0.14322473, 0.01656108, 0.99548637, 0.32824552,
             0.78092598, 0.45962261, 0.26562723, 0.47006285, 0.27588199,
             0.32995383],
    'col3': [0.00516395, 0.1432596, 0.01855883, 0.95806069, 0.33487223,
             0.90526158, 0.373449, 0.27230896, 0.48722896, 0.27608198,
             0.323059],
    'col4': [0.00476079, 0.08205838, 0.03080842, 0.8642895, 0.46951708,
             0.75946821, 0.25630978, 0.30713967, 0.52637885, 0.29876842,
             0.36094742],
}
df2 = pd.DataFrame(data2)

final_df = pd.DataFrame()
# Iterating a DataFrame yields its column labels, so no enumerate() is needed;
# the same label c selects the matching column in both frames.
for c in df1:
    # Add one '*' per level. After the first modification, build on
    # final_df[c] (not df1[c]) so stars that were already added are kept.
    final_df[c] = np.where(df2[c] < .05, df1[c] + '*', df1[c])
    final_df[c] = np.where(df2[c] < .01, final_df[c] + '*', final_df[c])
    final_df[c] = np.where(df2[c] < .001, final_df[c] + '*', final_df[c])

# Bare expression so a notebook/REPL displays the result.
final_df
输出:
col1 col2 col3 col4
[0: 2.28** 2.23** 2.25** 2.22**],
[1: -0.38* -0.38 -0.31 -0.16],
[2: 0.16* 0.17* 0.17* 0.17*],
[3: -0.00 -0.00 0.00 0.00],
[4: -0.11 -0.10 -0.10 -0.08],
[5: 0.00 0.00 0.00 0.00],
[6: -0.00 -0.00 -0.00 -0.00],
[7: 0.92 0.89 0.88 0.85]...