How to use pandas melt for several columns without set pattern

I have a dataframe as shown below:

import pandas as pd

df = pd.DataFrame({
'subject_ID':[1,2,3,4,5,6,7,8],
'1st_date':['1/1/2020','3/3/2000','13/11/2020','24/05/1998','30/03/1971','30/03/1971','30/03/1971','30/03/1971'],
'1st_marks':[31,32,34,45,56,78,74,32],
'1st_1st_retest_marks':[31,32,34,45,56,78,74,32],
'1st_2nd_retest_marks':[31,32,34,45,56,78,74,32],
'2nd_date':['1/2/2020','3/4/2000','13/12/2020','24/06/1998','30/04/1971','21/04/1971','10/04/1971','20/04/1971'],
'2nd_marks':[31,32,34,45,56,78,74,32],
'3rd_date':['1/1/2010','3/3/2005','13/11/2021','24/05/1898','30/03/1981','30/03/1991','30/03/1901','30/03/1871'],
'3rd_marks':[31,32,34,45,56,78,74,32]})

I tried the following:

df = pd.melt(df, id_vars=['subject_ID'])              # incorrect output
df = pd.melt(df, id_vars=['subject_ID', '1st_date'])  # incorrect output

In my real data I have more than 100 date columns, each with corresponding marks values per subject.

How do I pass all 100 dates as input to the melt function?
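(For context, even building the list of date columns programmatically and keeping them as id_vars does not work, because melt stacks all remaining columns into a single variable/value pair, so each marks column never gets paired with its own date. A hypothetical attempt:)

date_cols = [c for c in df.columns if 'date' in c]
df = pd.melt(df, id_vars=['subject_ID'] + date_cols)  # still incorrect output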

I want my output to look like below (example for subject_ID = 1).

Please do not rely on any pattern in the column names: in my real data the column names do not follow a pattern like 1st, 2nd, 3rd, etc.

If there is a consistent separator _, use it to split the column names into a MultiIndex, so DataFrame.stack can be used. (Note: this first approach assumes each column name contains exactly one _, i.e. no retest columns; those are handled in the EDIT below.)

#convert ID column to index first
df = df.set_index('subject_ID')
#split the column names on _ into a MultiIndex
df.columns = df.columns.str.split('_', expand=True)
#stack the first level (1st, 2nd, 3rd) into rows
df = df.stack(0).rename_axis(['subject_ID','tmp']).reset_index()
#rebuild the original column names from the stacked level
df['mark_variable'] = df['tmp'] + '_marks'
df['date_variable'] = df.pop('tmp') + '_date'

print (df)
    subject_ID        date  marks mark_variable date_variable
0            1    1/1/2020     31     1st_marks      1st_date
1            1    1/2/2020     31     2nd_marks      2nd_date
2            1    1/1/2010     31     3rd_marks      3rd_date
3            2    3/3/2000     32     1st_marks      1st_date
4            2    3/4/2000     32     2nd_marks      2nd_date
5            2    3/3/2005     32     3rd_marks      3rd_date
6            3  13/11/2020     34     1st_marks      1st_date
7            3  13/12/2020     34     2nd_marks      2nd_date
8            3  13/11/2021     34     3rd_marks      3rd_date
9            4  24/05/1998     45     1st_marks      1st_date
10           4  24/06/1998     45     2nd_marks      2nd_date
11           4  24/05/1898     45     3rd_marks      3rd_date
12           5  30/03/1971     56     1st_marks      1st_date
13           5  30/04/1971     56     2nd_marks      2nd_date
14           5  30/03/1981     56     3rd_marks      3rd_date
15           6  30/03/1971     78     1st_marks      1st_date
16           6  21/04/1971     78     2nd_marks      2nd_date
17           6  30/03/1991     78     3rd_marks      3rd_date
18           7  30/03/1971     74     1st_marks      1st_date
19           7  10/04/1971     74     2nd_marks      2nd_date
20           7  30/03/1901     74     3rd_marks      3rd_date
21           8  30/03/1971     32     1st_marks      1st_date
22           8  20/04/1971     32     2nd_marks      2nd_date
23           8  30/03/1871     32     3rd_marks      3rd_date
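If the reshaped date column should become a real datetime afterwards, a minimal follow-up (assuming the day-first d/m/Y format of the sample data) is:

df['date'] = pd.to_datetime(df['date'], dayfirst=True)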

EDIT:

#convert ID column to index first
df = df.set_index('subject_ID')

#build column groups, starting a new group at each date column
g = df.columns.str.contains('date').cumsum()
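#for the sample columns this yields [1, 1, 1, 1, 2, 2, 3, 3]:
#each date column opens a new group and the marks columns that
#follow it share that group number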
#for each group, move its date column into the index, then stack the marks columns
d = {x.columns[0]: x.set_index(x.columns[0], append=True).stack()
     for i, x in df.groupby(g, axis=1)}

#mapping to rename the default index-level names
renamer = {'level_0':'date_variable','level_2':'date','level_3':'mark_variable'}
#concatenate the dictionary, sort by subject_ID and rename columns
df = (pd.concat(d)
        .sort_index(level=1, sort_remaining=False)
        .reset_index(name='mark')
        .rename(columns=renamer))

print (df)
   date_variable  subject_ID        date         mark_variable  mark
0       1st_date           1    1/1/2020             1st_marks    31
1       1st_date           1    1/1/2020  1st_1st_retest_marks    31
2       1st_date           1    1/1/2020  1st_2nd_retest_marks    31
3       2nd_date           1    1/2/2020             2nd_marks    31
4       3rd_date           1    1/1/2010             3rd_marks    31
5       1st_date           2    3/3/2000             1st_marks    32
6       1st_date           2    3/3/2000  1st_1st_retest_marks    32
7       1st_date           2    3/3/2000  1st_2nd_retest_marks    32
8       2nd_date           2    3/4/2000             2nd_marks    32
9       3rd_date           2    3/3/2005             3rd_marks    32
10      1st_date           3  13/11/2020             1st_marks    34
11      1st_date           3  13/11/2020  1st_1st_retest_marks    34
12      1st_date           3  13/11/2020  1st_2nd_retest_marks    34
13      2nd_date           3  13/12/2020             2nd_marks    34
14      3rd_date           3  13/11/2021             3rd_marks    34
15      1st_date           4  24/05/1998             1st_marks    45
16      1st_date           4  24/05/1998  1st_1st_retest_marks    45
17      1st_date           4  24/05/1998  1st_2nd_retest_marks    45
18      2nd_date           4  24/06/1998             2nd_marks    45
19      3rd_date           4  24/05/1898             3rd_marks    45
20      1st_date           5  30/03/1971             1st_marks    56
21      1st_date           5  30/03/1971  1st_1st_retest_marks    56
22      1st_date           5  30/03/1971  1st_2nd_retest_marks    56
23      2nd_date           5  30/04/1971             2nd_marks    56
24      3rd_date           5  30/03/1981             3rd_marks    56
25      1st_date           6  30/03/1971             1st_marks    78
26      1st_date           6  30/03/1971  1st_1st_retest_marks    78
27      1st_date           6  30/03/1971  1st_2nd_retest_marks    78
28      2nd_date           6  21/04/1971             2nd_marks    78
29      3rd_date           6  30/03/1991             3rd_marks    78
30      1st_date           7  30/03/1971             1st_marks    74
31      1st_date           7  30/03/1971  1st_1st_retest_marks    74
32      1st_date           7  30/03/1971  1st_2nd_retest_marks    74
33      2nd_date           7  10/04/1971             2nd_marks    74
34      3rd_date           7  30/03/1901             3rd_marks    74
35      1st_date           8  30/03/1971             1st_marks    32
36      1st_date           8  30/03/1971  1st_1st_retest_marks    32
37      1st_date           8  30/03/1971  1st_2nd_retest_marks    32
38      2nd_date           8  20/04/1971             2nd_marks    32
39      3rd_date           8  30/03/1871             3rd_marks    32
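Note: groupby with axis=1 is deprecated in recent pandas versions. The same column groups can be built without it, assuming the same g as above (a sketch):

#equivalent grouping without axis=1
d = {}
for k in pd.unique(g):
    x = df.loc[:, g == k]  # all columns belonging to group k
    d[x.columns[0]] = x.set_index(x.columns[0], append=True).stack()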

You can use pivot_longer from pyjanitor (a wrapper around pandas functions) to reshape the data:

 # pip install pyjanitor
 import pandas as pd
 import janitor
 (df.pivot_longer(index = ['subject_ID', '*date'],
                  # any part of the column label associated 
                  # with .value stays as a column name
                  # the rest are collated under mark_variable 
                  names_to = ('mark_variable', '.value'), 
                  # this determines how the column labels are split
                  # there are two groups, to pair with the names
                  # in `names_to`
                  names_pattern=r"(.+)_(marks)$")
     # a second `melt` to get date_variable
    .pivot_longer(['subject_ID', '*mark*'], 
                  names_to = 'date_variable', 
                  values_to='date')
     # if data is large, a more efficient option here
     # would be to convert mark_variable to a categorical column
     # and then rename the categories ... pretty efficient
    .assign(mark_variable = lambda df: df.mark_variable + "_marks")
    .sort_values(['subject_ID', 'date'], ignore_index=True)
    .head(10)
)

   subject_ID         mark_variable  marks date_variable      date
0           1             1st_marks     31      3rd_date  1/1/2010
1           1  1st_1st_retest_marks     31      3rd_date  1/1/2010
2           1  1st_2nd_retest_marks     31      3rd_date  1/1/2010
3           1             2nd_marks     31      3rd_date  1/1/2010
4           1             3rd_marks     31      3rd_date  1/1/2010
5           1             1st_marks     31      1st_date  1/1/2020
6           1  1st_1st_retest_marks     31      1st_date  1/1/2020
7           1  1st_2nd_retest_marks     31      1st_date  1/1/2020
8           1             2nd_marks     31      1st_date  1/1/2020
9           1             3rd_marks     31      1st_date  1/1/2020
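For reference, a plain-pandas sketch of the same two-step reshape, assuming only that the column names end in _date or _marks (like pivot_longer above, it pairs every marks column with every date column):

date_cols = [c for c in df.columns if c.endswith('_date')]
mark_cols = [c for c in df.columns if c.endswith('_marks')]

#first melt: stack the marks columns, keeping the date columns as id_vars
out = df.melt(id_vars=['subject_ID'] + date_cols, value_vars=mark_cols,
              var_name='mark_variable', value_name='marks')
#second melt: stack the date columns
out = out.melt(id_vars=['subject_ID', 'mark_variable', 'marks'],
               var_name='date_variable', value_name='date')
out = out.sort_values(['subject_ID', 'date'], ignore_index=True)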