如何重塑这个具有两个变量作为标识符的 data.frame?
How to reshape this data.frame having two variables as identifiers?
假设我有一个 data.frame 如下:
# Example data: nine firms with industry/country identifiers and six
# wide measurement columns named <variable>_<time>.
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  # The three country codes are repeated so each of the nine rows gets one.
  country = rep(c("USA", "CAN", "DEU"), times = 3),
  var1_10 = rnorm(9),
  var1_11 = rnorm(9),
  var1_12 = rnorm(9),
  var2_10 = rnorm(9),
  var2_11 = rnorm(9),
  var2_12 = rnorm(9)
)
看起来像这样:
# Preview the first rows of the example data frame.
head(data)
firm industry country var1_10 var1_11 var1_12 var2_10 var2_11 var2_12
A 1 USA 0.006080107 1.7089981 0.384306433 -0.2814963 -0.31852115 0.4879907
B 2 CAN 0.447786736 -0.6414333 0.683906020 -0.7500779 -0.72770473 -0.1499627
C 3 DEU 1.265955776 -1.6834242 -0.685028075 0.7192065 -0.02291059 -0.2322860
D 4 USA 0.874346857 0.6339960 -0.005798694 1.0982600 -1.57901079 -0.0510445
E 5 CAN 0.692382607 -0.4461135 -0.432249529 1.7461789 -0.49300818 1.1987289
F 6 DEU -1.098814463 0.7868190 2.281716591 -1.0006592 0.95612690 1.0244039
我想把 var1 和 var2 转换成长格式,同时保留 firm(公司)和 country(国家)作为标识列。期望的结果类似这样:
firm country time var1 var2
1 A USA 10 0.6157731 1.05564854
2 A USA 11 0.2560421 0.42902183
3 D CAN 10 0.7278390 -1.81995641
4 D CAN 11 1.3241109 -0.69197609
5 B DEU 10 0.1471585 -1.93182825
6 B DEU 11 -0.5985394 1.20967201
7 E USA 10 2.1925299 -0.27900005
8 E USA 11 2.3271128 -1.09578323
9 C CAN 10 1.1348696 -0.10218604
10 C CAN 11 -0.1908846 0.35702296
11 F DEU 10 0.4748446 -0.88230257
12 F DEU 11 -0.5454749 -0.05664779
您可以使用 tidyr 1.0.0 中新增的 pivot_longer() 和 pivot_wider() 函数。
@yutannihilation 对这些新功能进行了出色的介绍:A Graphical Introduction to tidyr's pivot_*()
library(tidyr)
# Fix the RNG seed so the random columns below are reproducible.
set.seed(2019)

# Rebuild the example frame; the rnorm() calls stay in the original column
# order so the seeded draws land in the same columns.
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  country = rep(c("USA", "CAN", "DEU"), times = 3),
  var1_10 = rnorm(9),
  var1_11 = rnorm(9),
  var1_12 = rnorm(9),
  var2_10 = rnorm(9),
  var2_11 = rnorm(9),
  var2_12 = rnorm(9)
)
data
#> firm industry country var1_10 var1_11 var1_12 var2_10
#> 1 A 1 USA 0.7385227 -0.3191793 -0.3271264 0.04062997
#> 2 B 2 CAN -0.5147605 -0.2379111 -2.2632252 2.63601650
#> 3 C 3 DEU -1.6401813 1.6186229 0.2855605 -1.61599923
#> 4 D 4 USA 0.9160368 -1.1176011 0.9684286 -0.93455930
#> 5 E 5 CAN -1.2674820 0.2340028 0.8673066 0.63038569
#> 6 F 6 DEU 0.7382478 0.3161516 1.3781350 0.76075998
#> 7 G 7 USA -0.7826228 0.3707686 -0.8082596 -0.51162277
#> 8 H 8 CAN 0.5092959 0.8775886 -0.5121532 1.00190750
#> 9 I 9 DEU -1.4899391 -1.7683235 -1.8039718 -0.38339219
#> var2_11 var2_12
#> 1 -0.47713729 0.20612698
#> 2 0.25420771 0.86320623
#> 3 -1.16349174 0.13977752
#> 4 -0.43793937 -0.22809479
#> 5 -1.72413573 -0.31982812
#> 6 1.72514669 -0.05294738
#> 7 0.09215510 -0.23639840
#> 8 0.07311485 -0.33796351
#> 9 0.64014783 -0.75548467
首先把数据转换为长格式:
# Stack every column whose name starts with "var" into long form. Each
# column name is split at "_" into the variable label ("var") and the
# time suffix ("time"); the cell values go into "value".
data_longer <- pivot_longer(
  data,
  cols = starts_with("var"),
  names_to = c("var", "time"),
  names_sep = "_",
  values_to = "value"
)
data_longer
#> # A tibble: 54 x 6
#> firm industry country var time value
#> <fct> <int> <fct> <chr> <chr> <dbl>
#> 1 A 1 USA var1 10 0.739
#> 2 A 1 USA var1 11 -0.319
#> 3 A 1 USA var1 12 -0.327
#> 4 A 1 USA var2 10 0.0406
#> 5 A 1 USA var2 11 -0.477
#> 6 A 1 USA var2 12 0.206
#> 7 B 2 CAN var1 10 -0.515
#> 8 B 2 CAN var1 11 -0.238
#> 9 B 2 CAN var1 12 -2.26
#> 10 B 2 CAN var2 10 2.64
#> # ... with 44 more rows
然后再用 pivot_wider() 把 var 列展开,得到所需的格式:
# Spread the labels in "var" back into separate var1 / var2 columns, taking
# their cells from "value"; the remaining columns identify each row.
data_wider <- pivot_wider(
  data_longer,
  names_from = var,
  values_from = value
)
data_wider
#> # A tibble: 27 x 6
#> firm industry country time var1 var2
#> <fct> <int> <fct> <chr> <dbl> <dbl>
#> 1 A 1 USA 10 0.739 0.0406
#> 2 A 1 USA 11 -0.319 -0.477
#> 3 A 1 USA 12 -0.327 0.206
#> 4 B 2 CAN 10 -0.515 2.64
#> 5 B 2 CAN 11 -0.238 0.254
#> 6 B 2 CAN 12 -2.26 0.863
#> 7 C 3 DEU 10 -1.64 -1.62
#> 8 C 3 DEU 11 1.62 -1.16
#> 9 C 3 DEU 12 0.286 0.140
#> 10 D 4 USA 10 0.916 -0.935
#> # ... with 17 more rows
由 reprex 包 (v0.3.0) 创建于 2019-10-05
假设我有一个 data.frame 如下:
# Example data: nine firms with industry/country identifiers and six
# wide measurement columns named <variable>_<time>.
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  # The three country codes are repeated so each of the nine rows gets one.
  country = rep(c("USA", "CAN", "DEU"), times = 3),
  var1_10 = rnorm(9),
  var1_11 = rnorm(9),
  var1_12 = rnorm(9),
  var2_10 = rnorm(9),
  var2_11 = rnorm(9),
  var2_12 = rnorm(9)
)
看起来像这样:
# Preview the first rows of the example data frame.
head(data)
firm industry country var1_10 var1_11 var1_12 var2_10 var2_11 var2_12
A 1 USA 0.006080107 1.7089981 0.384306433 -0.2814963 -0.31852115 0.4879907
B 2 CAN 0.447786736 -0.6414333 0.683906020 -0.7500779 -0.72770473 -0.1499627
C 3 DEU 1.265955776 -1.6834242 -0.685028075 0.7192065 -0.02291059 -0.2322860
D 4 USA 0.874346857 0.6339960 -0.005798694 1.0982600 -1.57901079 -0.0510445
E 5 CAN 0.692382607 -0.4461135 -0.432249529 1.7461789 -0.49300818 1.1987289
F 6 DEU -1.098814463 0.7868190 2.281716591 -1.0006592 0.95612690 1.0244039
我想把 var1 和 var2 转换成长格式,同时保留 firm(公司)和 country(国家)作为标识列。期望的结果类似这样:
firm country time var1 var2
1 A USA 10 0.6157731 1.05564854
2 A USA 11 0.2560421 0.42902183
3 D CAN 10 0.7278390 -1.81995641
4 D CAN 11 1.3241109 -0.69197609
5 B DEU 10 0.1471585 -1.93182825
6 B DEU 11 -0.5985394 1.20967201
7 E USA 10 2.1925299 -0.27900005
8 E USA 11 2.3271128 -1.09578323
9 C CAN 10 1.1348696 -0.10218604
10 C CAN 11 -0.1908846 0.35702296
11 F DEU 10 0.4748446 -0.88230257
12 F DEU 11 -0.5454749 -0.05664779
您可以使用 tidyr 1.0.0 中新增的 pivot_longer() 和 pivot_wider() 函数。
@yutannihilation 对这些新功能进行了出色的介绍:A Graphical Introduction to tidyr's pivot_*()
library(tidyr)
# Fix the RNG seed so the random columns below are reproducible.
set.seed(2019)

# Rebuild the example frame; the rnorm() calls stay in the original column
# order so the seeded draws land in the same columns.
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  country = rep(c("USA", "CAN", "DEU"), times = 3),
  var1_10 = rnorm(9),
  var1_11 = rnorm(9),
  var1_12 = rnorm(9),
  var2_10 = rnorm(9),
  var2_11 = rnorm(9),
  var2_12 = rnorm(9)
)
data
#> firm industry country var1_10 var1_11 var1_12 var2_10
#> 1 A 1 USA 0.7385227 -0.3191793 -0.3271264 0.04062997
#> 2 B 2 CAN -0.5147605 -0.2379111 -2.2632252 2.63601650
#> 3 C 3 DEU -1.6401813 1.6186229 0.2855605 -1.61599923
#> 4 D 4 USA 0.9160368 -1.1176011 0.9684286 -0.93455930
#> 5 E 5 CAN -1.2674820 0.2340028 0.8673066 0.63038569
#> 6 F 6 DEU 0.7382478 0.3161516 1.3781350 0.76075998
#> 7 G 7 USA -0.7826228 0.3707686 -0.8082596 -0.51162277
#> 8 H 8 CAN 0.5092959 0.8775886 -0.5121532 1.00190750
#> 9 I 9 DEU -1.4899391 -1.7683235 -1.8039718 -0.38339219
#> var2_11 var2_12
#> 1 -0.47713729 0.20612698
#> 2 0.25420771 0.86320623
#> 3 -1.16349174 0.13977752
#> 4 -0.43793937 -0.22809479
#> 5 -1.72413573 -0.31982812
#> 6 1.72514669 -0.05294738
#> 7 0.09215510 -0.23639840
#> 8 0.07311485 -0.33796351
#> 9 0.64014783 -0.75548467
首先把数据转换为长格式:
# Stack every column whose name starts with "var" into long form. Each
# column name is split at "_" into the variable label ("var") and the
# time suffix ("time"); the cell values go into "value".
data_longer <- pivot_longer(
  data,
  cols = starts_with("var"),
  names_to = c("var", "time"),
  names_sep = "_",
  values_to = "value"
)
data_longer
#> # A tibble: 54 x 6
#> firm industry country var time value
#> <fct> <int> <fct> <chr> <chr> <dbl>
#> 1 A 1 USA var1 10 0.739
#> 2 A 1 USA var1 11 -0.319
#> 3 A 1 USA var1 12 -0.327
#> 4 A 1 USA var2 10 0.0406
#> 5 A 1 USA var2 11 -0.477
#> 6 A 1 USA var2 12 0.206
#> 7 B 2 CAN var1 10 -0.515
#> 8 B 2 CAN var1 11 -0.238
#> 9 B 2 CAN var1 12 -2.26
#> 10 B 2 CAN var2 10 2.64
#> # ... with 44 more rows
然后再用 pivot_wider() 把 var 列展开,得到所需的格式:
# Spread the labels in "var" back into separate var1 / var2 columns, taking
# their cells from "value"; the remaining columns identify each row.
data_wider <- pivot_wider(
  data_longer,
  names_from = var,
  values_from = value
)
data_wider
#> # A tibble: 27 x 6
#> firm industry country time var1 var2
#> <fct> <int> <fct> <chr> <dbl> <dbl>
#> 1 A 1 USA 10 0.739 0.0406
#> 2 A 1 USA 11 -0.319 -0.477
#> 3 A 1 USA 12 -0.327 0.206
#> 4 B 2 CAN 10 -0.515 2.64
#> 5 B 2 CAN 11 -0.238 0.254
#> 6 B 2 CAN 12 -2.26 0.863
#> 7 C 3 DEU 10 -1.64 -1.62
#> 8 C 3 DEU 11 1.62 -1.16
#> 9 C 3 DEU 12 0.286 0.140
#> 10 D 4 USA 10 0.916 -0.935
#> # ... with 17 more rows
由 reprex 包 (v0.3.0) 创建于 2019-10-05