如何重塑这个具有两个变量作为标识符的 data.frame?

How to reshape this data.frame having two variables as identifiers?

假设我有一个 data.frame 如下:

# Example data in wide format: one row per firm, with an industry id, a
# country, and two measures (var1, var2) whose column-name suffix
# (_10, _11, _12) is the value wanted in the `time` column.
# No seed is set here, so the random values differ from run to run.
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  # Written out explicitly instead of relying on recycling 3 values to 9 rows.
  country = rep(c("USA", "CAN", "DEU"), times = 3),
  var1_10 = rnorm(9),
  var1_11 = rnorm(9),
  var1_12 = rnorm(9),
  var2_10 = rnorm(9),
  var2_11 = rnorm(9),
  var2_12 = rnorm(9)
)

看起来像这样:

head(data)
  firm industry country      var1_10    var1_11      var1_12    var2_10     var2_11    var2_12
    A        1     USA  0.006080107  1.7089981  0.384306433 -0.2814963 -0.31852115  0.4879907
    B        2     CAN  0.447786736 -0.6414333  0.683906020 -0.7500779 -0.72770473 -0.1499627
    C        3     DEU  1.265955776 -1.6834242 -0.685028075  0.7192065 -0.02291059 -0.2322860
    D        4     USA  0.874346857  0.6339960 -0.005798694  1.0982600 -1.57901079 -0.0510445
    E        5     CAN  0.692382607 -0.4461135 -0.432249529  1.7461789 -0.49300818  1.1987289
    F        6     DEU -1.098814463  0.7868190  2.281716591 -1.0006592  0.95612690  1.0244039

我想把 var1 和 var2 转换成长格式,同时保留公司(firm)和国家(country)作为标识变量。我想要的结果类似这样:

   firm country time       var1        var2
1     A     USA   10  0.6157731  1.05564854
2     A     USA   11  0.2560421  0.42902183
3     D     CAN   10  0.7278390 -1.81995641
4     D     CAN   11  1.3241109 -0.69197609
5     B     DEU   10  0.1471585 -1.93182825
6     B     DEU   11 -0.5985394  1.20967201
7     E     USA   10  2.1925299 -0.27900005
8     E     USA   11  2.3271128 -1.09578323
9     C     CAN   10  1.1348696 -0.10218604
10    C     CAN   11 -0.1908846  0.35702296
11    F     DEU   10  0.4748446 -0.88230257
12    F     DEU   11 -0.5454749 -0.05664779

您可以使用 tidyr 1.0.0 新增的 pivot_longer() 和 pivot_wider() 函数。

@yutannihilation 对这些新功能进行了出色的介绍:A Graphical Introduction to tidyr's pivot_*()

library(tidyr)
# Fix the RNG seed so the printed values below are reproducible.
set.seed(2019)

# Wide-format example data: one row per firm, with two measures (var1, var2)
# whose column-name suffix (_10, _11, _12) becomes the `time` value later on.
# The rnorm() calls are kept in column order so the random stream is consumed
# exactly as in the printed output.
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  # Written out explicitly instead of relying on recycling 3 values to 9 rows.
  country = rep(c("USA", "CAN", "DEU"), times = 3),
  var1_10 = rnorm(9),
  var1_11 = rnorm(9),
  var1_12 = rnorm(9),
  var2_10 = rnorm(9),
  var2_11 = rnorm(9),
  var2_12 = rnorm(9)
)
data
#>   firm industry country    var1_10    var1_11    var1_12     var2_10
#> 1    A        1     USA  0.7385227 -0.3191793 -0.3271264  0.04062997
#> 2    B        2     CAN -0.5147605 -0.2379111 -2.2632252  2.63601650
#> 3    C        3     DEU -1.6401813  1.6186229  0.2855605 -1.61599923
#> 4    D        4     USA  0.9160368 -1.1176011  0.9684286 -0.93455930
#> 5    E        5     CAN -1.2674820  0.2340028  0.8673066  0.63038569
#> 6    F        6     DEU  0.7382478  0.3161516  1.3781350  0.76075998
#> 7    G        7     USA -0.7826228  0.3707686 -0.8082596 -0.51162277
#> 8    H        8     CAN  0.5092959  0.8775886 -0.5121532  1.00190750
#> 9    I        9     DEU -1.4899391 -1.7683235 -1.8039718 -0.38339219
#>       var2_11     var2_12
#> 1 -0.47713729  0.20612698
#> 2  0.25420771  0.86320623
#> 3 -1.16349174  0.13977752
#> 4 -0.43793937 -0.22809479
#> 5 -1.72413573 -0.31982812
#> 6  1.72514669 -0.05294738
#> 7  0.09215510 -0.23639840
#> 8  0.07311485 -0.33796351
#> 9  0.64014783 -0.75548467

首先将数据转换为长格式:

# Step 1: stack every var* column into rows.
# Each column name (e.g. "var1_10") is split at "_" into the measure name
# (`var`) and the time label (`time`); the cell contents go into `value`.
# `time` comes out as character, as the printed tibble below shows.
data_longer <- pivot_longer(
  data,
  cols = starts_with("var"),
  names_to = c("var", "time"),
  names_sep = "_",
  values_to = "value"
)
data_longer
#> # A tibble: 54 x 6
#>    firm  industry country var   time    value
#>    <fct>    <int> <fct>   <chr> <chr>   <dbl>
#>  1 A            1 USA     var1  10     0.739 
#>  2 A            1 USA     var1  11    -0.319 
#>  3 A            1 USA     var1  12    -0.327 
#>  4 A            1 USA     var2  10     0.0406
#>  5 A            1 USA     var2  11    -0.477 
#>  6 A            1 USA     var2  12     0.206 
#>  7 B            2 CAN     var1  10    -0.515 
#>  8 B            2 CAN     var1  11    -0.238 
#>  9 B            2 CAN     var1  12    -2.26  
#> 10 B            2 CAN     var2  10     2.64  
#> # ... with 44 more rows

然后重塑为所需的宽格式:

# Step 2: spread the measure names in `var` back into columns (var1, var2),
# filled from `value`. All remaining columns (firm, industry, country, time)
# identify a row, giving one row per firm and time.
data_wider <- pivot_wider(
  data_longer,
  names_from = var,
  values_from = value
)
data_wider
#> # A tibble: 27 x 6
#>    firm  industry country time    var1    var2
#>    <fct>    <int> <fct>   <chr>  <dbl>   <dbl>
#>  1 A            1 USA     10     0.739  0.0406
#>  2 A            1 USA     11    -0.319 -0.477 
#>  3 A            1 USA     12    -0.327  0.206 
#>  4 B            2 CAN     10    -0.515  2.64  
#>  5 B            2 CAN     11    -0.238  0.254 
#>  6 B            2 CAN     12    -2.26   0.863 
#>  7 C            3 DEU     10    -1.64  -1.62  
#>  8 C            3 DEU     11     1.62  -1.16  
#>  9 C            3 DEU     12     0.286  0.140 
#> 10 D            4 USA     10     0.916 -0.935 
#> # ... with 17 more rows

由 reprex package (v0.3.0) 于 2019-10-05 创建