合并或连接具有不均匀行的 df - python

Merge or concat df's with uneven rows - python

我有三个独立的数据框,希望将它们合并或连接在一起。每个数据框中都有一个参考列,分别标记为 ValueX、ValueY、ValueZ。但这些列中没有可用于合并的唯一值。它们几乎总是包含相同数量的对应值。

使用下面的 df,我希望按 ValueX、ValueY、ValueZ 对齐这些数据框。直接连接时,各个值在大多数情况下是对齐的。但有时某个值对应 4 行或 6 行而不是 5 行,因此相应的数据框无法对齐。

import pandas as pd

# Three independent frames. Each one carries a reference column
# (ValueX / ValueY / ValueZ) made of consecutive runs of a repeated value;
# the runs are mostly 5 rows long but some have 4 or 6 rows.
df1 = pd.DataFrame({
    'ValueX': [0] * 6 + [2] * 5 + [4] * 5,
    'Item3': list('aedgfccfbcjgmsck'),
    'Item4': list('fshkfdcgbkjnrdxd'),
})

df2 = pd.DataFrame({
    'ValueY': [1] * 5 + [3] * 6 + [5] * 6,
    'Item1': list('accgaccgaccgaccgk'),
    'Item2': list('aedgfccfbcjgmsckk'),
})

df3 = pd.DataFrame({
    'ValueZ': [6] * 5 + [7] * 6 + [8] * 5,
    'Item5': list('aedgfccfbcjgmsck'),
    'Item6': list('fshkfdcgbkjnrdxd'),
})

# Plain side-by-side concat: rows are paired by index label only, so the
# runs of unequal length end up misaligned across the three frames.
frames = [df1, df2, df3]
final_df = pd.concat(frames, axis=1)

预期输出:

    ValueX Item3 Item4  ValueY Item1 Item2  ValueZ Item5 Item6
0      0.0     a     f     1.0     a     a     6.0     a     f
1      0.0     e     s     1.0     c     e     6.0     e     s
2      0.0     d     h     1.0     c     d     6.0     d     h
3      0.0     g     k     1.0     g     g     6.0     g     k
4      0.0     f     f     1.0     a     f     6.0     f     f
5      0.0     c     d     NaN   NaN   NaN     NaN   NaN   NaN
6      2.0     c     c     3.0     c     c     7.0     c     d
7      2.0     f     g     3.0     c     c     7.0     c     c
8      2.0     b     b     3.0     g     f     7.0     f     g
9      2.0     c     k     3.0     a     b     7.0     b     b
10     2.0     j     j     3.0     c     c     7.0     c     k
11     NaN   NaN   NaN     3.0     c     j     7.0     j     j
12     4.0     g     n     5.0     g     g     8.0     g     n
13     4.0     m     r     5.0     a     m     8.0     m     r
14     4.0     s     d     5.0     c     s     8.0     s     d
15     4.0     c     x     5.0     c     c     8.0     c     x
16     4.0     k     d     5.0     g     k     8.0     k     d
17     NaN   NaN   NaN     5.0     k     k     NaN   NaN   NaN

您可以使用 itertools.zip_longest 来对齐组:

from itertools import zip_longest

# Split every frame into its runs, keyed by that frame's reference column.
# Iterating a groupby yields (key, sub-frame) pairs.
g1, g2, g3 = (
    df1.groupby('ValueX'),
    df2.groupby('ValueY'),
    df3.groupby('ValueZ'),
)

# Walk the three groupings in lockstep. If one frame has fewer groups than
# the others, zip_longest substitutes the fill pair, i.e. an empty frame.
# Each group's index is reset so the k-th rows of the three groups line up
# when concatenated side by side; shorter groups are padded with NaN.
dfs = [
    pd.concat([part.reset_index(drop=True) for _, part in triple], axis=1)
    for triple in zip_longest(g1, g2, g3, fillvalue=('', pd.DataFrame()))
]

# Stack the aligned chunks and renumber the rows 0..n-1.
final = pd.concat(dfs).reset_index(drop=True)
print(final)

打印:

    ValueX Item3 Item4  ValueY Item1 Item2  ValueZ Item5 Item6
0      0.0     a     f     1.0     a     a     6.0     a     f
1      0.0     e     s     1.0     c     e     6.0     e     s
2      0.0     d     h     1.0     c     d     6.0     d     h
3      0.0     g     k     1.0     g     g     6.0     g     k
4      0.0     f     f     1.0     a     f     6.0     f     f
5      0.0     c     d     NaN   NaN   NaN     NaN   NaN   NaN
6      2.0     c     c     3.0     c     c     7.0     c     d
7      2.0     f     g     3.0     c     c     7.0     c     c
8      2.0     b     b     3.0     g     f     7.0     f     g
9      2.0     c     k     3.0     a     b     7.0     b     b
10     2.0     j     j     3.0     c     c     7.0     c     k
11     NaN   NaN   NaN     3.0     c     j     7.0     j     j
12     4.0     g     n     5.0     g     g     8.0     g     n
13     4.0     m     r     5.0     a     m     8.0     m     r
14     4.0     s     d     5.0     c     s     8.0     s     d
15     4.0     c     x     5.0     c     c     8.0     c     x
16     4.0     k     d     5.0     g     k     8.0     k     d
17     NaN   NaN   NaN     5.0     k     k     NaN   NaN   NaN
# Collect the three source frames so they can be processed uniformly.
d = [df1, df2, df3]

# Reshape each frame to long format: its "Value*" column is kept as the
# identifier and every "Item*" column is unpivoted into variable/value rows.
# pd.melt expects lists of column labels; a comma-joined string such as
# "Item3,Item4" would be looked up as one (non-existent) column. melt also
# returns a new frame instead of modifying its input, so the results are
# stored here rather than thrown away.
melted = [
    pd.melt(
        x,
        id_vars=x.filter(regex="^Value", axis=1).columns.tolist(),
        value_vars=x.filter(regex="^Item", axis=1).columns.tolist(),
    )
    for x in d
]

# Side-by-side concat of the original wide frames (not the melted ones).
# Rows are paired by index label only, so shorter frames are padded with NaN
# at the bottom; runs of unequal length are NOT realigned by this step.
final_df = pd.concat(d, axis=1)
print(final_df)

    ValueX Item3 Item4  ValueY Item1 Item2  ValueZ Item5 Item6
0      0.0     a     f       1     a     a     6.0     a     f
1      0.0     e     s       1     c     e     6.0     e     s
2      0.0     d     h       1     c     d     6.0     d     h
3      0.0     g     k       1     g     g     6.0     g     k
4      0.0     f     f       1     a     f     6.0     f     f
5      0.0     c     d       3     c     c     7.0     c     d
6      2.0     c     c       3     c     c     7.0     c     c
7      2.0     f     g       3     g     f     7.0     f     g
8      2.0     b     b       3     a     b     7.0     b     b
9      2.0     c     k       3     c     c     7.0     c     k
10     2.0     j     j       3     c     j     7.0     j     j
11     4.0     g     n       5     g     g     8.0     g     n
12     4.0     m     r       5     a     m     8.0     m     r
13     4.0     s     d       5     c     s     8.0     s     d
14     4.0     c     x       5     c     c     8.0     c     x
15     4.0     k     d       5     g     k     8.0     k     d
16     NaN   NaN   NaN       5     k     k     NaN   NaN   NaN