使用 R 将列中的分类数据重塑为行中的二进制数据
Reshape categorical data in columns to binary data in rows using R
我有二项式事件(例如输赢数)的表格,我想将其重塑为长格式,以便每个事件在单独的行中。
数据看起来像这样:
# Fake data: per-team counts of wins and losses, split by venue (home/away).
team <- c("a", "b", "c", "d")
wins.home <- c(0, 1, 0, 2)
wins.away <- c(1, 1, 1, 1)
loss.home <- c(0, 1, 2, 1)
loss.away <- c(2, 1, 2, 2)
df <- data.frame(team, wins.home, wins.away, loss.home, loss.away)
# Total games per team: row-wise sum of every count column (all but `team`).
# rowSums() is the vectorised equivalent of apply(..., 1, sum).
df$games.tot <- rowSums(df[, -1])
#Dataframe in WIDE format
team wins.home wins.away loss.home loss.away games.tot
a 0 1 0 2 3
b 1 1 1 1 4
c 0 1 2 2 5
d 2 1 1 2 6
我想像这样将这个宽数据重塑为长格式:
team game where win
a 1 away 1
a 2 away 0
a 3 away 0
b 1 home 1
b 2 away 1
b 3 home 0
b 4 away 0
编辑:比赛编号(game)是任意的,仅用作索引。
使用 reshape2::melt 很容易将其转换为每行一个类别计数的形式,但我不知道如何进一步将计数拆分为二元(0/1)事件。
可能有一个更短的解决方案,但作为快速修复:
library(stringr)
library(dplyr)
library(reshape2)
# Reshape the wide count table `df` into one row per game.

# Melt to long format: one row per (team, category) with the count in `value`.
# Column 6 (games.tot) is a derived total, so it is dropped first.
df2 <- melt(df[-6], id.vars = "team")

# Split names such as "wins.home" on the literal dot. The backslash must be
# escaped inside an R string ("\\."); a bare "\." is a parse error.
# The two new columns produced by cbind() are referred to as `1` and `2` below.
df2 <- cbind(df2, str_split_fixed(df2$variable, "\\.", 2))

# Binary outcome: 1 for a win, 0 for a loss.
df2$win <- ifelse(df2$`1` == "wins", 1, 0)

# Repeat each row `value` times so that every game gets its own row;
# categories with a count of 0 disappear.
df2 <- df2[rep(seq_len(nrow(df2)), df2$value), ]

# Drop the helper columns that are no longer needed.
df2$variable <- df2$value <- df2$`1` <- NULL

# Number the games within each team (the index itself is arbitrary).
df2 %>% group_by(team) %>% mutate(game = seq_len(n())) %>% arrange(team)
输出:
Source: local data frame [18 x 4]
Groups: team [4]
team 2 win game
(fctr) (fctr) (dbl) (int)
1 a away 1 1
2 a away 0 2
3 a away 0 3
4 b home 1 1
5 b away 1 2
6 b home 0 3
7 b away 0 4
#...and so on
这里有一个可供考虑的替代方案(尽管以 melt 作为第一步更合理):
使用 apply 创建列名的列表,然后用 stack 将其转换为两列的 data.frame,其中包含 "ind"(团队)和 "values"(列名)。
# For every team, repeat each count-column name by its count (one element per
# game), then stack() the named list into a two-column data.frame with
# "values" (column name) and "ind" (team).
# lapply() is used instead of apply() because it always returns a list;
# apply() collapses the result into a matrix whenever every team has the same
# number of games, which breaks setNames()/stack().
count_cols <- names(df)[-c(1, length(df))]
temp <- stack(setNames(
  lapply(
    seq_len(nrow(df)),
    function(i) rep(count_cols, unlist(df[i, count_cols]))
  ),
  df[[1]]
))
然后,加载我的 "splitstackshape" 包并执行以下操作:
library(splitstackshape)
# Read this nested call from the inside out:
#   1. cSplit() splits temp$values on "." into separate columns
#      (the first one is referred to as values_1 below).
#   2. values_1 is recoded by reference: 1 where it equals "wins", else 0.
#   3. getanID() adds an id column that counts rows within each "ind" (team).
#   4. setnames() renames all columns to the final names.
# The trailing [] makes the result print (setnames() returns it invisibly).
setnames( ## To create nice names
getanID( ## "game" numbers
cSplit(temp, "values", ".")[ ## Splits values into columns
, values_1 := as.numeric(values_1 == "wins")], ## "wins" = 1, others = 0
"ind"), ## game numbers based on team
c("team", "win", "where", "game"))[] ## final names we want
# team win where game
# 1: a 1 away 1
# 2: a 0 away 2
# 3: a 0 away 3
# 4: b 1 home 1
# 5: b 1 away 2
# 6: b 0 home 3
# 7: b 0 away 4
# 8: c 1 away 1
# 9: c 0 home 2
# 10: c 0 home 3
# 11: c 0 away 4
# 12: c 0 away 5
# 13: d 1 home 1
# 14: d 1 home 2
# 15: d 1 away 3
# 16: d 0 home 4
# 17: d 0 away 5
# 18: d 0 away 6
这是一个主要使用 "tidyr" 和 "dplyr" 的替代方案,"splitstackshape" 提供了一些帮助:
library(dplyr)
library(tidyr)
library(splitstackshape)
# tidyr/dplyr pipeline: expand the wide count table into one row per game.
df[-6] %>%                                     # drop column 6 (games.tot)
  gather(var, val, -team) %>%                  # long: one row per team/category
  expandRows("val") %>%                        # repeat each row by its count
  separate(var, into = c("win", "where")) %>%  # "wins.home" -> "wins", "home"
  group_by(team) %>%                           # game index is per team
  mutate(
    win = as.numeric(win == "wins"),           # 1 = win, 0 = loss
    game = seq_len(n())                        # running game number
  ) %>%
  arrange(team)                                # sort by team
并且,为了完整起见,"data.table" 替代方案可能类似于:
library(data.table)
# data.table version: expand the count columns into one row per game.
DT <- as.data.table(df)
# Count columns: everything except the first and the last column.
sdcols <- head(names(DT)[-1], -1)
DT[
  , .(val = rep(sdcols, .SD)), by = team, .SDcols = sdcols  # names repeated by count
][
  , c("win", "where") := tstrsplit(val, ".", fixed = TRUE)   # split on literal dot
][
  , `:=`(val = NULL, win = as.numeric(win == "wins"))         # drop helper, recode win
][
  , game := seq_len(.N), by = team                             # game index per team
][]
我有二项式事件(例如输赢数)的表格,我想将其重塑为长格式,以便每个事件在单独的行中。
数据看起来像这样:
# Fake data: per-team counts of wins and losses, split by venue (home/away).
team <- c("a", "b", "c", "d")
wins.home <- c(0, 1, 0, 2)
wins.away <- c(1, 1, 1, 1)
loss.home <- c(0, 1, 2, 1)
loss.away <- c(2, 1, 2, 2)
df <- data.frame(team, wins.home, wins.away, loss.home, loss.away)
# Total games per team: row-wise sum of every count column (all but `team`).
# rowSums() is the vectorised equivalent of apply(..., 1, sum).
df$games.tot <- rowSums(df[, -1])
#Dataframe in WIDE format
team wins.home wins.away loss.home loss.away games.tot
a 0 1 0 2 3
b 1 1 1 1 4
c 0 1 2 2 5
d 2 1 1 2 6
我想像这样将这个宽数据重塑为长格式:
team game where win
a 1 away 1
a 2 away 0
a 3 away 0
b 1 home 1
b 2 away 1
b 3 home 0
b 4 away 0
编辑:比赛编号(game)是任意的,仅用作索引。
使用 reshape2::melt 很容易将其转换为每行一个类别计数的形式,但我不知道如何进一步将计数拆分为二元(0/1)事件。
可能有一个更短的解决方案,但作为快速修复:
library(stringr)
library(dplyr)
library(reshape2)
# Reshape the wide count table `df` into one row per game.

# Melt to long format: one row per (team, category) with the count in `value`.
# Column 6 (games.tot) is a derived total, so it is dropped first.
df2 <- melt(df[-6], id.vars = "team")

# Split names such as "wins.home" on the literal dot. The backslash must be
# escaped inside an R string ("\\."); a bare "\." is a parse error.
# The two new columns produced by cbind() are referred to as `1` and `2` below.
df2 <- cbind(df2, str_split_fixed(df2$variable, "\\.", 2))

# Binary outcome: 1 for a win, 0 for a loss.
df2$win <- ifelse(df2$`1` == "wins", 1, 0)

# Repeat each row `value` times so that every game gets its own row;
# categories with a count of 0 disappear.
df2 <- df2[rep(seq_len(nrow(df2)), df2$value), ]

# Drop the helper columns that are no longer needed.
df2$variable <- df2$value <- df2$`1` <- NULL

# Number the games within each team (the index itself is arbitrary).
df2 %>% group_by(team) %>% mutate(game = seq_len(n())) %>% arrange(team)
输出:
Source: local data frame [18 x 4]
Groups: team [4]
team 2 win game
(fctr) (fctr) (dbl) (int)
1 a away 1 1
2 a away 0 2
3 a away 0 3
4 b home 1 1
5 b away 1 2
6 b home 0 3
7 b away 0 4
#...and so on
这里有一个可供考虑的替代方案(尽管以 melt 作为第一步更合理):
使用 apply 创建列名的列表,然后用 stack 将其转换为两列的 data.frame,其中包含 "ind"(团队)和 "values"(列名)。
# For every team, repeat each count-column name by its count (one element per
# game), then stack() the named list into a two-column data.frame with
# "values" (column name) and "ind" (team).
# lapply() is used instead of apply() because it always returns a list;
# apply() collapses the result into a matrix whenever every team has the same
# number of games, which breaks setNames()/stack().
count_cols <- names(df)[-c(1, length(df))]
temp <- stack(setNames(
  lapply(
    seq_len(nrow(df)),
    function(i) rep(count_cols, unlist(df[i, count_cols]))
  ),
  df[[1]]
))
然后,加载我的 "splitstackshape" 包并执行以下操作:
library(splitstackshape)
# Read this nested call from the inside out:
#   1. cSplit() splits temp$values on "." into separate columns
#      (the first one is referred to as values_1 below).
#   2. values_1 is recoded by reference: 1 where it equals "wins", else 0.
#   3. getanID() adds an id column that counts rows within each "ind" (team).
#   4. setnames() renames all columns to the final names.
# The trailing [] makes the result print (setnames() returns it invisibly).
setnames( ## To create nice names
getanID( ## "game" numbers
cSplit(temp, "values", ".")[ ## Splits values into columns
, values_1 := as.numeric(values_1 == "wins")], ## "wins" = 1, others = 0
"ind"), ## game numbers based on team
c("team", "win", "where", "game"))[] ## final names we want
# team win where game
# 1: a 1 away 1
# 2: a 0 away 2
# 3: a 0 away 3
# 4: b 1 home 1
# 5: b 1 away 2
# 6: b 0 home 3
# 7: b 0 away 4
# 8: c 1 away 1
# 9: c 0 home 2
# 10: c 0 home 3
# 11: c 0 away 4
# 12: c 0 away 5
# 13: d 1 home 1
# 14: d 1 home 2
# 15: d 1 away 3
# 16: d 0 home 4
# 17: d 0 away 5
# 18: d 0 away 6
这是一个主要使用 "tidyr" 和 "dplyr" 的替代方案,"splitstackshape" 提供了一些帮助:
library(dplyr)
library(tidyr)
library(splitstackshape)
# tidyr/dplyr pipeline: expand the wide count table into one row per game.
df[-6] %>%                                     # drop column 6 (games.tot)
  gather(var, val, -team) %>%                  # long: one row per team/category
  expandRows("val") %>%                        # repeat each row by its count
  separate(var, into = c("win", "where")) %>%  # "wins.home" -> "wins", "home"
  group_by(team) %>%                           # game index is per team
  mutate(
    win = as.numeric(win == "wins"),           # 1 = win, 0 = loss
    game = seq_len(n())                        # running game number
  ) %>%
  arrange(team)                                # sort by team
并且,为了完整起见,"data.table" 替代方案可能类似于:
library(data.table)
# data.table version: expand the count columns into one row per game.
DT <- as.data.table(df)
# Count columns: everything except the first and the last column.
sdcols <- head(names(DT)[-1], -1)
DT[
  , .(val = rep(sdcols, .SD)), by = team, .SDcols = sdcols  # names repeated by count
][
  , c("win", "where") := tstrsplit(val, ".", fixed = TRUE)   # split on literal dot
][
  , `:=`(val = NULL, win = as.numeric(win == "wins"))         # drop helper, recode win
][
  , game := seq_len(.N), by = team                             # game index per team
][]