使用 R 将列中的分类数据重塑为行中的二进制数据

Reshape categorical data in columns to binary data in rows using R

我有一张记录二项事件(例如输赢场数)计数的表格,想把它重塑为长格式,使每个事件单独占一行。

数据看起来像这样:

 # Fake data: per-team counts of home/away wins and losses (wide format).
 # Column names follow the "<result>.<venue>" pattern; the reshaping code
 # splits them on ".", so the dots in these names are significant.
 team      <- c("a", "b", "c", "d")
 wins.home <- c(0, 1, 0, 2)
 wins.away <- c(1, 1, 1, 1)
 loss.home <- c(0, 1, 2, 1)
 loss.away <- c(2, 1, 2, 2)
 df <- data.frame(team, wins.home, wins.away, loss.home, loss.away)
 # Total games per team: sum of every count column (all columns but `team`).
 # rowSums() is the vectorised equivalent of apply(df[, -1], 1, sum).
 df$games.tot <- rowSums(df[-1])


#Dataframe in WIDE format
      team  wins.home  wins.away  loss.home  loss.away  games.tot
         a         0         1         0         2         3
         b         1         1         1         1         4
         c         0         1         2         2         5
         d         2         1         1         2         6

我想像这样将这个宽数据重塑为长格式:

team  game where  win
a     1    away   1
a     2    away   0
a     3    away   0
b     1    home   1
b     2    away   1
b     3    home   0
b     4    away   0

编辑:比赛编号(game)是任意的,仅仅作为一个索引。

使用 reshape2::melt 很容易将其转换为每行中每个类别的计数,但我不知道如何进一步将计数分解为二进制事件。

可能存在更简洁的解法,但作为一个快速的解决办法:

library(stringr)
library(dplyr)
library(reshape2)

# Initial melt: one row per team/category, dropping games.tot (column 6)
df2 <- melt(df[-6], id.vars = "team")
# Split e.g. "wins.home" on the literal dot. The dot must be escaped as "\\."
# ("\." is not a valid R string escape and fails to parse). The two unnamed
# matrix columns end up named `1` (wins/loss) and `2` (home/away) after cbind().
df2 <- cbind(df2, str_split_fixed(df2$variable, "\\.", 2))
# Create the win column: 1 for a win, 0 for a loss
df2$win <- ifelse(df2$`1` == "wins", 1, 0)
# Replicate each row `value` times so that every game gets its own row;
# seq_len() stays correct even if df2 has zero rows
df2 <- df2[rep(seq_len(nrow(df2)), df2$value), ]
# Remove the helper columns that are no longer needed
df2$variable <- df2$value <- df2$`1` <- NULL
# Final group by to add the game column (running index within each team)
df2 %>% group_by(team) %>% mutate(game = seq_len(n())) %>% arrange(team)

输出:

Source: local data frame [18 x 4]
Groups: team [4]

     team      2   win  game
   (fctr) (fctr) (dbl) (int)
1       a   away     1     1
2       a   away     0     2
3       a   away     0     3
4       b   home     1     1
5       b   away     1     2
6       b   home     0     3
7       b   away     0     4
#...and so on

这里有一个可供考虑的替代方案(尽管 melt 作为第一步更有意义):

先用 apply 为每个团队生成一个由列名组成的向量(每个列名按其计数重复),再用 stack 把这个列表堆叠成一个两列的 data.frame:"ind"(团队)和 "values"(列名)。

# Names of the count columns: everything except the first (team) and the
# last (total) column of df
count_cols <- names(df)[-c(1, length(df))]

# For every team, repeat each count-column name as many times as its count,
# then stack() the per-team vectors into a two-column data.frame:
# "values" (column name) and "ind" (team).
# lapply() always returns a list; apply() would simplify to a matrix whenever
# every team happens to have the same number of games, so the result would no
# longer be one list element per team.
temp <- stack(setNames(
  lapply(seq_len(nrow(df)), function(i) {
    rep(count_cols, unlist(df[i, count_cols]))
  }),
  df[[1]]))

然后,加载我的 "splitstackshape" 包并执行以下操作:

library(splitstackshape)

## Split "values" on "." into two columns, values_1 and values_2
games <- cSplit(temp, "values", ".")
## Recode the first piece in place: "wins" = 1, others = 0
games[, values_1 := as.numeric(values_1 == "wins")]
## Add "game" numbers, counted within each team ("ind")
games <- getanID(games, "ind")
## Apply the final column names we want; the trailing [] prints the result
setnames(games, c("team", "win", "where", "game"))[]
#     team win where game
#  1:    a   1  away    1
#  2:    a   0  away    2
#  3:    a   0  away    3
#  4:    b   1  home    1
#  5:    b   1  away    2
#  6:    b   0  home    3
#  7:    b   0  away    4
#  8:    c   1  away    1
#  9:    c   0  home    2
# 10:    c   0  home    3
# 11:    c   0  away    4
# 12:    c   0  away    5
# 13:    d   1  home    1
# 14:    d   1  home    2
# 15:    d   1  away    3
# 16:    d   0  home    4
# 17:    d   0  away    5
# 18:    d   0  away    6

这是一个主要使用 "tidyr" 和 "dplyr" 的替代方案,"splitstackshape" 提供了一些帮助:

library(dplyr)
library(tidyr)
library(splitstackshape)

gather(df[-6], var, val, -team) %>%            ## Long form: one row per team/category
  expandRows("val") %>%                        ## Replicates rows by values
  separate(var, into = c("win", "where"),
           sep = "\\.") %>%                    ## "wins.home" -> "wins" + "home"
  group_by(team) %>%                           ## Game index is counted per team
  mutate(win = as.numeric(win == "wins"),      ## Numeric version of win (1/0)
         game = row_number()) %>%              ## Sequence of games within team
  ungroup() %>%                                ## Don't leave the result grouped
  arrange(team)                                ## Final sorting

并且,为了完整起见,"data.table" 替代方案可能类似于:

library(data.table)

DT <- as.data.table(df)
## Count columns: all names except the first and the last one
sdcols <- names(DT)[-c(1, length(DT))]

## Per team, repeat each count-column name by the count stored in that column
long <- DT[, list(val = rep(sdcols, .SD)), by = team, .SDcols = sdcols]
## Split val on the literal "." into the two new columns
long[, c("win", "where") := tstrsplit(val, ".", fixed = TRUE)]
## Recode win to 1/0, then drop the helper column
long[, win := as.numeric(win == "wins")]
long[, val := NULL]
## Running game index within each team; the trailing [] prints the result
long[, game := sequence(.N), by = team][]