合并或连接具有不均匀行的 df - python
Merge or concat df's with uneven rows - python
我有三个独立的数据框，希望将它们合并或连接在一起。每个数据框中都有一列参考值，分别标记为 ValueX、ValueY、ValueZ。但这些列中没有可用于合并的唯一键值。各个参考值对应的行数在大多数情况下是相同的。
使用下面的 df，我希望按 ValueX、ValueY、ValueZ 的分组对齐这些数据框。如果直接按列连接，大部分分组都能对齐；但有时某个分组有 4 行或 6 行而不是 5 行，于是其后的数据就会错位。
import pandas as pd
# Sample frames. Each one has a reference column (ValueX / ValueY / ValueZ)
# whose runs of equal values are mostly five rows long, but a few runs have
# six rows, so the frames do not line up row by row.
df1 = pd.DataFrame(
    {
        "ValueX": [0] * 6 + [2] * 5 + [4] * 5,
        "Item3": list("aedgfccfbcjgmsck"),
        "Item4": list("fshkfdcgbkjnrdxd"),
    }
)
df2 = pd.DataFrame(
    {
        "ValueY": [1] * 5 + [3] * 6 + [5] * 6,
        "Item1": list("accgaccgaccgaccgk"),
        "Item2": list("aedgfccfbcjgmsckk"),
    }
)
df3 = pd.DataFrame(
    {
        "ValueZ": [6] * 5 + [7] * 6 + [8] * 5,
        "Item5": list("aedgfccfbcjgmsck"),
        "Item6": list("fshkfdcgbkjnrdxd"),
    }
)

# Naive attempt: a column-wise concat matches rows purely by index label,
# so the shorter frames are padded with NaN at the bottom.
final_df = pd.concat([df1, df2, df3], axis="columns")
预期输出:
ValueX Item3 Item4 ValueY Item1 Item2 ValueZ Item5 Item6
0 0.0 a f 1.0 a a 6.0 a f
1 0.0 e s 1.0 c e 6.0 e s
2 0.0 d h 1.0 c d 6.0 d h
3 0.0 g k 1.0 g g 6.0 g k
4 0.0 f f 1.0 a f 6.0 f f
5 0.0 c d NaN NaN NaN NaN NaN NaN
6 2.0 c c 3.0 c c 7.0 c d
7 2.0 f g 3.0 c c 7.0 c c
8 2.0 b b 3.0 g f 7.0 f g
9 2.0 c k 3.0 a b 7.0 b b
10 2.0 j j 3.0 c c 7.0 c k
11 NaN NaN NaN 3.0 c j 7.0 j j
12 4.0 g n 5.0 g g 8.0 g n
13 4.0 m r 5.0 a m 8.0 m r
14 4.0 s d 5.0 c s 8.0 s d
15 4.0 c x 5.0 c c 8.0 c x
16 4.0 k d 5.0 g k 8.0 k d
17 NaN NaN NaN 5.0 k k NaN NaN NaN
您可以使用 itertools.zip_longest 按分组逐组对齐：
from itertools import zip_longest
# Split each frame into groups on its reference column. Iterating a GroupBy
# yields (key, sub-frame) pairs.
g1 = df1.groupby('ValueX')
g2 = df2.groupby('ValueY')
g3 = df3.groupby('ValueZ')

dfs = []
# Walk the three group sequences in lockstep (1st with 1st, 2nd with 2nd, ...).
# The fillvalue mimics a (key, frame) pair with an empty frame, so the tuple
# unpacking still works if one frame runs out of groups before the others.
for (_, a), (_, b), (_, c) in zip_longest(g1, g2, g3, fillvalue=('', pd.DataFrame())):
    # Renumber every group from 0 so the column-wise concat matches rows by
    # their position inside the group; shorter groups are padded with NaN.
    dfs.append(
        pd.concat([a.reset_index(drop=True),
                   b.reset_index(drop=True),
                   c.reset_index(drop=True)], axis=1))

# Stack the aligned chunks and give the result one continuous index.
final = pd.concat(dfs).reset_index(drop=True)
print(final)
打印:
ValueX Item3 Item4 ValueY Item1 Item2 ValueZ Item5 Item6
0 0.0 a f 1.0 a a 6.0 a f
1 0.0 e s 1.0 c e 6.0 e s
2 0.0 d h 1.0 c d 6.0 d h
3 0.0 g k 1.0 g g 6.0 g k
4 0.0 f f 1.0 a f 6.0 f f
5 0.0 c d NaN NaN NaN NaN NaN NaN
6 2.0 c c 3.0 c c 7.0 c d
7 2.0 f g 3.0 c c 7.0 c c
8 2.0 b b 3.0 g f 7.0 f g
9 2.0 c k 3.0 a b 7.0 b b
10 2.0 j j 3.0 c c 7.0 c k
11 NaN NaN NaN 3.0 c j 7.0 j j
12 4.0 g n 5.0 g g 8.0 g n
13 4.0 m r 5.0 a m 8.0 m r
14 4.0 s d 5.0 c s 8.0 s d
15 4.0 c x 5.0 c c 8.0 c x
16 4.0 k d 5.0 g k 8.0 k d
17 NaN NaN NaN 5.0 k k NaN NaN NaN
d = [df1, df2, df3]  # Put DataFrames in a list
for x in d:
    # Pass the column names as lists. Comma-joining them produces a single
    # label such as 'Item3,Item4', which is not a column of x.
    id_cols = list(x.filter(regex="^Value", axis=1).columns)
    item_cols = list(x.filter(regex="^Item", axis=1).columns)
    # NOTE(review): the melted frame is neither assigned nor used below, so
    # this call has no effect on final_df.
    pd.melt(x, id_vars=id_cols, value_vars=item_cols)  # Melt the dataframes
# Column-wise concat of the original frames; rows are matched by index label.
final_df = pd.concat([df1, df2, df3], axis=1)  # Concat the dataframes
print(final_df)
ValueX Item3 Item4 ValueY Item1 Item2 ValueZ Item5 Item6
0 0.0 a f 1 a a 6.0 a f
1 0.0 e s 1 c e 6.0 e s
2 0.0 d h 1 c d 6.0 d h
3 0.0 g k 1 g g 6.0 g k
4 0.0 f f 1 a f 6.0 f f
5 0.0 c d 3 c c 7.0 c d
6 2.0 c c 3 c c 7.0 c c
7 2.0 f g 3 g f 7.0 f g
8 2.0 b b 3 a b 7.0 b b
9 2.0 c k 3 c c 7.0 c k
10 2.0 j j 3 c j 7.0 j j
11 4.0 g n 5 g g 8.0 g n
12 4.0 m r 5 a m 8.0 m r
13 4.0 s d 5 c s 8.0 s d
14 4.0 c x 5 c c 8.0 c x
15 4.0 k d 5 g k 8.0 k d
16 NaN NaN NaN 5 k k NaN NaN NaN
我有三个独立的数据框，希望将它们合并或连接在一起。每个数据框中都有一列参考值，分别标记为 ValueX、ValueY、ValueZ。但这些列中没有可用于合并的唯一键值。各个参考值对应的行数在大多数情况下是相同的。
使用下面的 df，我希望按 ValueX、ValueY、ValueZ 的分组对齐这些数据框。如果直接按列连接，大部分分组都能对齐；但有时某个分组有 4 行或 6 行而不是 5 行，于是其后的数据就会错位。
import pandas as pd
# Sample frames. Each one has a reference column (ValueX / ValueY / ValueZ)
# whose runs of equal values are mostly five rows long, but a few runs have
# six rows, so the frames do not line up row by row.
df1 = pd.DataFrame(
    {
        "ValueX": [0] * 6 + [2] * 5 + [4] * 5,
        "Item3": list("aedgfccfbcjgmsck"),
        "Item4": list("fshkfdcgbkjnrdxd"),
    }
)
df2 = pd.DataFrame(
    {
        "ValueY": [1] * 5 + [3] * 6 + [5] * 6,
        "Item1": list("accgaccgaccgaccgk"),
        "Item2": list("aedgfccfbcjgmsckk"),
    }
)
df3 = pd.DataFrame(
    {
        "ValueZ": [6] * 5 + [7] * 6 + [8] * 5,
        "Item5": list("aedgfccfbcjgmsck"),
        "Item6": list("fshkfdcgbkjnrdxd"),
    }
)

# Naive attempt: a column-wise concat matches rows purely by index label,
# so the shorter frames are padded with NaN at the bottom.
final_df = pd.concat([df1, df2, df3], axis="columns")
预期输出:
ValueX Item3 Item4 ValueY Item1 Item2 ValueZ Item5 Item6
0 0.0 a f 1.0 a a 6.0 a f
1 0.0 e s 1.0 c e 6.0 e s
2 0.0 d h 1.0 c d 6.0 d h
3 0.0 g k 1.0 g g 6.0 g k
4 0.0 f f 1.0 a f 6.0 f f
5 0.0 c d NaN NaN NaN NaN NaN NaN
6 2.0 c c 3.0 c c 7.0 c d
7 2.0 f g 3.0 c c 7.0 c c
8 2.0 b b 3.0 g f 7.0 f g
9 2.0 c k 3.0 a b 7.0 b b
10 2.0 j j 3.0 c c 7.0 c k
11 NaN NaN NaN 3.0 c j 7.0 j j
12 4.0 g n 5.0 g g 8.0 g n
13 4.0 m r 5.0 a m 8.0 m r
14 4.0 s d 5.0 c s 8.0 s d
15 4.0 c x 5.0 c c 8.0 c x
16 4.0 k d 5.0 g k 8.0 k d
17 NaN NaN NaN 5.0 k k NaN NaN NaN
您可以使用 itertools.zip_longest 按分组逐组对齐：
from itertools import zip_longest
# Split each frame into groups on its reference column. Iterating a GroupBy
# yields (key, sub-frame) pairs.
g1 = df1.groupby('ValueX')
g2 = df2.groupby('ValueY')
g3 = df3.groupby('ValueZ')

dfs = []
# Walk the three group sequences in lockstep (1st with 1st, 2nd with 2nd, ...).
# The fillvalue mimics a (key, frame) pair with an empty frame, so the tuple
# unpacking still works if one frame runs out of groups before the others.
for (_, a), (_, b), (_, c) in zip_longest(g1, g2, g3, fillvalue=('', pd.DataFrame())):
    # Renumber every group from 0 so the column-wise concat matches rows by
    # their position inside the group; shorter groups are padded with NaN.
    dfs.append(
        pd.concat([a.reset_index(drop=True),
                   b.reset_index(drop=True),
                   c.reset_index(drop=True)], axis=1))

# Stack the aligned chunks and give the result one continuous index.
final = pd.concat(dfs).reset_index(drop=True)
print(final)
打印:
ValueX Item3 Item4 ValueY Item1 Item2 ValueZ Item5 Item6
0 0.0 a f 1.0 a a 6.0 a f
1 0.0 e s 1.0 c e 6.0 e s
2 0.0 d h 1.0 c d 6.0 d h
3 0.0 g k 1.0 g g 6.0 g k
4 0.0 f f 1.0 a f 6.0 f f
5 0.0 c d NaN NaN NaN NaN NaN NaN
6 2.0 c c 3.0 c c 7.0 c d
7 2.0 f g 3.0 c c 7.0 c c
8 2.0 b b 3.0 g f 7.0 f g
9 2.0 c k 3.0 a b 7.0 b b
10 2.0 j j 3.0 c c 7.0 c k
11 NaN NaN NaN 3.0 c j 7.0 j j
12 4.0 g n 5.0 g g 8.0 g n
13 4.0 m r 5.0 a m 8.0 m r
14 4.0 s d 5.0 c s 8.0 s d
15 4.0 c x 5.0 c c 8.0 c x
16 4.0 k d 5.0 g k 8.0 k d
17 NaN NaN NaN 5.0 k k NaN NaN NaN
d = [df1, df2, df3]  # Put DataFrames in a list
for x in d:
    # Pass the column names as lists. Comma-joining them produces a single
    # label such as 'Item3,Item4', which is not a column of x.
    id_cols = list(x.filter(regex="^Value", axis=1).columns)
    item_cols = list(x.filter(regex="^Item", axis=1).columns)
    # NOTE(review): the melted frame is neither assigned nor used below, so
    # this call has no effect on final_df.
    pd.melt(x, id_vars=id_cols, value_vars=item_cols)  # Melt the dataframes
# Column-wise concat of the original frames; rows are matched by index label.
final_df = pd.concat([df1, df2, df3], axis=1)  # Concat the dataframes
print(final_df)
ValueX Item3 Item4 ValueY Item1 Item2 ValueZ Item5 Item6
0 0.0 a f 1 a a 6.0 a f
1 0.0 e s 1 c e 6.0 e s
2 0.0 d h 1 c d 6.0 d h
3 0.0 g k 1 g g 6.0 g k
4 0.0 f f 1 a f 6.0 f f
5 0.0 c d 3 c c 7.0 c d
6 2.0 c c 3 c c 7.0 c c
7 2.0 f g 3 g f 7.0 f g
8 2.0 b b 3 a b 7.0 b b
9 2.0 c k 3 c c 7.0 c k
10 2.0 j j 3 c j 7.0 j j
11 4.0 g n 5 g g 8.0 g n
12 4.0 m r 5 a m 8.0 m r
13 4.0 s d 5 c s 8.0 s d
14 4.0 c x 5 c c 8.0 c x
15 4.0 k d 5 g k 8.0 k d
16 NaN NaN NaN 5 k k NaN NaN NaN