R:使用列名中的存根字符串将数据从宽格式重塑为长格式

R Reshape Wide To Long Using Column Stub Strings

# Wide-format sample data: one row per School/Fund combination and one
# column per <Type>_<Class>_Grade<N> combination.
data1 <- data.frame(
  School = rep(c(1, 2, 3, 4), each = 2),
  Fund   = rep(c(0, 1), times = 4),
  # Grade 5
  Total_A_Grade5  = c(22, 20, 21, 24, 24, 26, 25, 22),
  Group1_A_Grade5 = c(10, 6, 6, 10, 9, 9, 9, 10),
  Group2_A_Grade5 = c(5, 9, 9, 8, 10, 8, 8, 6),
  Total_B_Grade5  = c(23, 33, 19, 21, 19, 23, 20, 21),
  Group1_B_Grade5 = c(8, 7, 7, 10, 9, 9, 5, 5),
  Group2_B_Grade5 = c(6, 10, 7, 6, 6, 5, 9, 9),
  # Grade 6
  Total_A_Grade6  = c(18, 24, 16, 24, 26, 25, 16, 19),
  Group1_A_Grade6 = c(7, 7, 5, 9, 10, 9, 5, 7),
  Group2_A_Grade6 = c(5, 8, 6, 7, 10, 8, 8, 9),
  Total_B_Grade6  = c(26, 23, 22, 24, 21, 22, 24, 19),
  Group1_B_Grade6 = c(10, 10, 6, 10, 7, 8, 8, 7),
  Group2_B_Grade6 = c(9, 6, 9, 6, 7, 6, 9, 9),
  # Grade 7
  Total_A_Grade7  = c(20, 19, 18, 25, 16, 21, 19, 26),
  Group1_A_Grade7 = c(9, 7, 7, 9, 7, 7, 5, 8),
  Group2_A_Grade7 = c(8, 5, 7, 9, 6, 5, 5, 9),
  Total_B_Grade7  = c(25, 21, 24, 25, 18, 18, 27, 18),
  Group1_B_Grade7 = c(10, 10, 10, 7, 5, 6, 8, 5),
  Group2_B_Grade7 = c(9, 6, 8, 10, 8, 6, 10, 6)
)


# Expected long-format result, shown only for School 1 in grades 5 and 6:
# one row per Fund/Type/Class/Grade combination with its Score.
data2 <- data.frame(
  School = rep(1, 24),
  Fund   = rep(c(0, 1), times = 12),
  # Each Type appears once per Fund value, cycling within every Class/Grade.
  Type   = rep(rep(c("Total", "Group1", "Group2"), each = 2), times = 4),
  Class  = rep(rep(c("A", "B"), each = 6), times = 2),
  Grade  = rep(c(5, 6), each = 12),
  Score  = c(
    22, 20, 10, 6, 5, 9,    # Class A, Grade 5
    23, 33, 8, 7, 6, 10,    # Class B, Grade 5
    18, 24, 7, 7, 5, 8,     # Class A, Grade 6
    26, 23, 10, 10, 9, 6    # Class B, Grade 6
  )
)

我有 'data1',想把它重塑成 'data2' 的形式。'data2' 只展示了 School 1 在 5 年级和 6 年级的示例,但我希望对整个 data1 进行重塑。

'data1' 的列名包含丰富的信息。例如,Group2_B_Grade6 表示 'Type' = Group2,'Class' = B,'Grade' = 6。我希望重塑 'data1',然后用这些以 "_" 分隔的存根作为 'data2' 中相应列的取值。


# Smaller sample data frame with one column per grade.
data3 <- data.frame(
  School  = rep(c(1, 2, 3, 4), each = 2),
  Fund    = rep(c(0, 1), times = 4),
  Grade_5 = c(22, 20, 21, 24, 24, 26, 25, 22),
  Grade_6 = c(10, 6, 6, 10, 9, 9, 9, 10),
  Grade_7 = c(5, 9, 9, 8, 10, 8, 8, 6)
)

使用dplyr(和tidyr):

library(dplyr)
library(tidyr)

# Long-format reshape in three steps:
#   1. stack every column except School and Fund into name/value pairs,
#   2. split each former column name into Type, Class and Grade,
#   3. keep only the digits of Grade (e.g. "Grade5" -> "5").
data2 <- data1 %>%
  pivot_longer(cols = -c(School, Fund)) %>%
  separate(col = name, into = c("Type", "Class", "Grade")) %>%
  extract(col = Grade, into = "Grade", regex = "([0-9]+)")

# Print the reshaped result.
data2
#> # A tibble: 144 x 6
#>    School  Fund Type   Class Grade value
#>     <dbl> <dbl> <chr>  <chr> <chr> <dbl>
#>  1      1     0 Total  A     5        22
#>  2      1     0 Group1 A     5        10
#>  3      1     0 Group2 A     5         5
#>  4      1     0 Total  B     5        23
#>  5      1     0 Group1 B     5         8
#>  6      1     0 Group2 B     5         6
#>  7      1     0 Total  A     6        18
#>  8      1     0 Group1 A     6         7
#>  9      1     0 Group2 A     6         5
#> 10      1     0 Total  B     6        26
#> # … with 134 more rows

由 reprex 包 (v0.3.0) 于 2020-04-06 创建

您可以直接在 pivot_longer 的 names_pattern 参数中使用正则表达式来完成此操作。

# Reshape data1 to long format in a single step. names_pattern splits each
# column name (e.g. "Group2_B_Grade6") into three capture groups that fill
# Type ("Group2"), Class ("B") and Grade ("6"); the cell values go to Score.
# The backslash is doubled because "\d" is not a valid escape sequence in an
# R string literal; the regex engine receives \d+.
tidyr::pivot_longer(
  data1,
  cols = -c(School, Fund),
  names_to = c("Type", "Class", "Grade"),
  names_pattern = "(.*?)_([A-Z])_Grade(\\d+)",
  values_to = "Score"
)


# A tibble: 144 x 6
#   School  Fund Type   Class Grade Score
#    <dbl> <dbl> <chr>  <chr> <chr> <dbl>
# 1      1     0 Total  A     5        22
# 2      1     0 Group1 A     5        10
# 3      1     0 Group2 A     5         5
# 4      1     0 Total  B     5        23
# 5      1     0 Group1 B     5         8
# 6      1     0 Group2 B     5         6
# 7      1     0 Total  A     6        18
# 8      1     0 Group1 A     6         7
# 9      1     0 Group2 A     6         5
#10      1     0 Total  B     6        26
# … with 134 more rows

我们也可以使用 data.table 包中的 melt:

library(data.table)
# Melt data1 to long format with data.table, then split the former column
# names (e.g. "Group2_B_Grade6") on "_" into Type, Class and Grade, strip the
# "Grade" prefix, and drop the now-redundant variable column.
# Note: setDT() converts data1 to a data.table by reference.
# id.vars is spelled out in full rather than relying on partial argument
# matching; both patterns are literal strings, so they are matched as fixed.
melt(setDT(data1), id.vars = c("School", "Fund"))[
  , c("Type", "Class", "Grade") := tstrsplit(variable, "_", fixed = TRUE)
][
  , Grade := sub("Grade", "", Grade, fixed = TRUE)
][
  , variable := NULL
][]  # trailing [] prints the result after the := updates
#     School Fund value   Type Class Grade
#  1:      1    0    22  Total     A     5
#  2:      1    1    20  Total     A     5
#  3:      2    0    21  Total     A     5
#  4:      2    1    24  Total     A     5
#  5:      3    0    24  Total     A     5
# ---                                     
#140:      2    1    10 Group2     B     7
#141:      3    0     8 Group2     B     7
#142:      3    1     6 Group2     B     7
#143:      4    0    10 Group2     B     7
#144:      4    1     6 Group2     B     7