从长到宽,每个 group/ID 的行数不同
Long to wide with different number of rows for each group/ID
我有以下数据框:
df <- data.frame(stringsAsFactors=FALSE,
ID = c(2016070707, 2016070707, 2016070707, 2017080808, 2017080808,
2017080808, 2017080808),
team_name = c("Harlequins", "Harlequins", "Harlequins", "Bristol Rugby",
"Bristol Rugby", "Bristol Rugby", "Bristol Rugby"),
player_name = c("Karl Dickson", "Tim Swiel", "Alofa Alofa",
"Jason Woodward", "Jack Wallace", "Will Hurrell",
"Tusi Pisi"),
DGC = c(7, 5, 0, 3, 0, 8, 9),
MR = c(3, 8, 31, 25, 0, 85, 0),
CB = c(2, 9, 1, 13, 0, 0, 0)
)
我想转换成宽版:
new_df <- data.frame(stringsAsFactors=FALSE,
ID = c(2016070707, 2017080808),
team_name = c("Harlequins", "Bristol Rugby"),
player_1 = c("Karl Dickson", "Jason Woodward"),
player_2 = c("Tim Swiel", "Jack Wallace"),
player_3 = c("Alofa Alofa", "Will Hurrell"),
player_4 = c(NA, "Tusi Pisi"),
DGC_1 = c(7, 3),
DGC_2 = c(5, 0),
DGC_3 = c(0, 8),
DGC_4 = c(NA, 9),
MR_1 = c(3, 25),
MR_2 = c(8, 0),
MR_3 = c(31, 85),
MR_4 = c(NA, 0),
CB_1 = c(2, 13),
CB_2 = c(9, 0),
CB_3 = c(1, 0),
CB_4 = c(NA, 0)
)
但是,如您所见,第一队有 3 名球员,第二队有 4 名球员。如何将 df 转置为 new_df 并创建这些额外的列,如 player_4, DGC_4 以防队伍人数不同。并且差异将由 "NAs" 补偿(就像 new_df 中一样)。
这是使用包 data.table
的一种解决方案
df <- data.frame(stringsAsFactors=FALSE,
ID = c(2016070707, 2016070707, 2016070707, 2017080808, 2017080808,
2017080808, 2017080808),
team_name = c("Harlequins", "Harlequins", "Harlequins", "Bristol Rugby",
"Bristol Rugby", "Bristol Rugby", "Bristol Rugby"),
player_name = c("Karl Dickson", "Tim Swiel", "Alofa Alofa",
"Jason Woodward", "Jack Wallace", "Will Hurrell",
"Tusi Pisi"),
DGC = c(7, 5, 0, 3, 0, 8, 9),
MR = c(3, 8, 31, 25, 0, 85, 0),
CB = c(2, 9, 1, 13, 0, 0, 0)
)
library(data.table)
#convert to data.table
setDT(df)
#create a sequence from 1 to N for each ID
df[, idx := 1:.N, by = ID]
#cast wide
dcast(ID + team_name ~ idx, data = df, value.var = c("player_name", "DGC", "MR", "CB"), )
#> ID team_name player_name_1 player_name_2 player_name_3
#> 1: 2016070707 Harlequins Karl Dickson Tim Swiel Alofa Alofa
#> 2: 2017080808 Bristol Rugby Jason Woodward Jack Wallace Will Hurrell
#> player_name_4 DGC_1 DGC_2 DGC_3 DGC_4 MR_1 MR_2 MR_3 MR_4 CB_1 CB_2
#> 1: <NA> 7 5 0 NA 3 8 31 NA 2 9
#> 2: Tusi Pisi 3 0 8 9 25 0 85 0 13 0
#> CB_3 CB_4
#> 1: 1 NA
#> 2: 0 0
由 reprex package (v0.2.1)
于 2019-01-27 创建
还有一种tidyverse
可能:
df %>%
rowid_to_column() %>%
gather(var, val, -c(ID, team_name, rowid)) %>%
arrange(rowid, ID, team_name, var) %>%
group_by(ID, team_name) %>%
mutate(var = paste(var, gl(n_distinct(rowid), n_distinct(var)), sep = "_")) %>%
select(-rowid) %>%
spread(var, val, convert = TRUE)
ID team_name CB_1 CB_2 CB_3 CB_4 DGC_1 DGC_2 DGC_3 DGC_4 MR_1 MR_2
<dbl> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 2016070707. Harlequins 2 9 1 NA 7 5 0 NA 3 8
2 2017080808. Bristol Rugby 13 0 0 0 3 0 8 9 25 0
MR_3 MR_4 player_name_1 player_name_2 player_name_3 player_name_4
<int> <int> <chr> <chr> <chr> <chr>
1 31 NA Karl Dickson Tim Swiel Alofa Alofa <NA>
2 85 0 Jason Woodward Jack Wallace Will Hurrell Tusi Pisi
它首先通过将数据从宽格式转换为长格式来完成这项工作。然后,整理数据,最后,创建新的变量名称并将其转换为所需的宽格式。
我有以下数据框:
df <- data.frame(stringsAsFactors=FALSE,
ID = c(2016070707, 2016070707, 2016070707, 2017080808, 2017080808,
2017080808, 2017080808),
team_name = c("Harlequins", "Harlequins", "Harlequins", "Bristol Rugby",
"Bristol Rugby", "Bristol Rugby", "Bristol Rugby"),
player_name = c("Karl Dickson", "Tim Swiel", "Alofa Alofa",
"Jason Woodward", "Jack Wallace", "Will Hurrell",
"Tusi Pisi"),
DGC = c(7, 5, 0, 3, 0, 8, 9),
MR = c(3, 8, 31, 25, 0, 85, 0),
CB = c(2, 9, 1, 13, 0, 0, 0)
)
我想转换成宽版:
new_df <- data.frame(stringsAsFactors=FALSE,
ID = c(2016070707, 2017080808),
team_name = c("Harlequins", "Bristol Rugby"),
player_1 = c("Karl Dickson", "Jason Woodward"),
player_2 = c("Tim Swiel", "Jack Wallace"),
player_3 = c("Alofa Alofa", "Will Hurrell"),
player_4 = c(NA, "Tusi Pisi"),
DGC_1 = c(7, 3),
DGC_2 = c(5, 0),
DGC_3 = c(0, 8),
DGC_4 = c(NA, 9),
MR_1 = c(3, 25),
MR_2 = c(8, 0),
MR_3 = c(31, 85),
MR_4 = c(NA, 0),
CB_1 = c(2, 13),
CB_2 = c(9, 0),
CB_3 = c(1, 0),
CB_4 = c(NA, 0)
)
但是,如您所见,第一队有 3 名球员,第二队有 4 名球员。如何将 df 转置为 new_df 并创建这些额外的列,如 player_4, DGC_4 以防队伍人数不同。并且差异将由 "NAs" 补偿(就像 new_df 中一样)。
这是使用包 data.table
df <- data.frame(stringsAsFactors=FALSE,
ID = c(2016070707, 2016070707, 2016070707, 2017080808, 2017080808,
2017080808, 2017080808),
team_name = c("Harlequins", "Harlequins", "Harlequins", "Bristol Rugby",
"Bristol Rugby", "Bristol Rugby", "Bristol Rugby"),
player_name = c("Karl Dickson", "Tim Swiel", "Alofa Alofa",
"Jason Woodward", "Jack Wallace", "Will Hurrell",
"Tusi Pisi"),
DGC = c(7, 5, 0, 3, 0, 8, 9),
MR = c(3, 8, 31, 25, 0, 85, 0),
CB = c(2, 9, 1, 13, 0, 0, 0)
)
library(data.table)
#convert to data.table
setDT(df)
#create a sequence from 1 to N for each ID
df[, idx := 1:.N, by = ID]
#cast wide
dcast(ID + team_name ~ idx, data = df, value.var = c("player_name", "DGC", "MR", "CB"), )
#> ID team_name player_name_1 player_name_2 player_name_3
#> 1: 2016070707 Harlequins Karl Dickson Tim Swiel Alofa Alofa
#> 2: 2017080808 Bristol Rugby Jason Woodward Jack Wallace Will Hurrell
#> player_name_4 DGC_1 DGC_2 DGC_3 DGC_4 MR_1 MR_2 MR_3 MR_4 CB_1 CB_2
#> 1: <NA> 7 5 0 NA 3 8 31 NA 2 9
#> 2: Tusi Pisi 3 0 8 9 25 0 85 0 13 0
#> CB_3 CB_4
#> 1: 1 NA
#> 2: 0 0
由 reprex package (v0.2.1)
于 2019-01-27 创建还有一种tidyverse
可能:
df %>%
rowid_to_column() %>%
gather(var, val, -c(ID, team_name, rowid)) %>%
arrange(rowid, ID, team_name, var) %>%
group_by(ID, team_name) %>%
mutate(var = paste(var, gl(n_distinct(rowid), n_distinct(var)), sep = "_")) %>%
select(-rowid) %>%
spread(var, val, convert = TRUE)
ID team_name CB_1 CB_2 CB_3 CB_4 DGC_1 DGC_2 DGC_3 DGC_4 MR_1 MR_2
<dbl> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 2016070707. Harlequins 2 9 1 NA 7 5 0 NA 3 8
2 2017080808. Bristol Rugby 13 0 0 0 3 0 8 9 25 0
MR_3 MR_4 player_name_1 player_name_2 player_name_3 player_name_4
<int> <int> <chr> <chr> <chr> <chr>
1 31 NA Karl Dickson Tim Swiel Alofa Alofa <NA>
2 85 0 Jason Woodward Jack Wallace Will Hurrell Tusi Pisi
它首先通过将数据从宽格式转换为长格式来完成这项工作。然后,整理数据,最后,创建新的变量名称并将其转换为所需的宽格式。