获取分组数据集中变量及其对应值的所有组合

Get all combinations of a variable and their corresponding values in a grouped data set

我的数据是这样的:

# Example input in long format: one row per (id, subid) pair with its time.
mydata <- data.frame(
  id    = c(1, 1, 1, 2, 2, 3, 3, 3, 3),
  subid = c(1, 2, 3, 1, 2, 1, 2, 3, 4),
  time  = c(16, 18, 20, 10, 11, 7, 9, 10, 11)
)

  id subid time
1  1     1   16
2  1     2   18
3  1     3   20
4  2     1   10
5  2     2   11
6  3     1    7
7  3     2    9
8  3     3   10
9  3     4   11

我的目标是将数据转换为:

# Desired result in wide format: one row per id and unordered pair of subids,
# with the time value belonging to each member of the pair.
newdata <- data.frame(
  id      = c(1, 1, 1, 2, 3, 3, 3, 3, 3, 3),
  subid.1 = c(1, 1, 2, 1, 1, 1, 1, 2, 2, 3),
  subid.2 = c(2, 3, 3, 2, 2, 3, 4, 3, 4, 4),
  time.1  = c(16, 16, 18, 10, 7, 7, 7, 9, 9, 10),
  time.2  = c(18, 20, 20, 11, 9, 10, 11, 10, 11, 11)
)

   id subid.1 subid.2 time.1 time.2
1   1       1       2     16     18
2   1       1       3     16     20
3   1       2       3     18     20
4   2       1       2     10     11
5   3       1       2      7      9
6   3       1       3      7     10
7   3       1       4      7     11
8   3       2       3      9     10
9   3       2       4      9     11
10  3       3       4     10     11

所以这不是简单的从长格式到宽格式的重塑:我的想法是,在由 id 定义的每个组内,取 subid 的所有可能的两两组合及其对应的 time 值,并将其转换为宽格式。

我知道可以用例如 gtools::combinations 来获得所有可能的组合。第一组包含 3 行,因此

# All 2-element combinations drawn from 1..3 (the three rows of group id == 1).
gtools::combinations(n = 3, r = 2)

给我组 id==1 的新 subid.1 和 subid.2 对的矩阵:

      [,1] [,2]
[1,]    1    2
[2,]    1    3
[3,]    2    3

但是我不知道如何继续(既不知道如何将 id==1 的组重塑为这种格式,也不知道如何对每个组分别执行这一操作)。谢谢!

使用 data.table 包:

# data.table approach: build all subid pairs per id, attach time, cast to wide.
#   1. Per id, combn(subid, 2) lists every pair as a 2-row matrix; t() followed
#      by c() flattens it so that all first members come before all second
#      members of the pairs.
#   2. grp labels the first half of each id's rows as 1 (first member of a
#      pair) and the second half as 2 (second member).
#   3. An update join on (id, subid) copies the matching time from mydata.
#   4. dcast() spreads subid and time into subid.1/subid.2/time.1/time.2, using
#      rowid(grp) as the pair index that keeps the two members on one row.
# Note: setDT() converts mydata to a data.table by reference.
# NOTE(review): combn() treats a single positive number as seq_len(n), so an id
# with only one row may error or yield spurious pairs -- confirm before using
# this on data that contains single-row groups.
library(data.table)
setDT(mydata)[, .(subid = c(t(combn(subid, 2)))), by = id
              ][, grp := rep(1:2, each = .N/2), by = id
                ][mydata, on = .(id, subid), time := time
                  ][, dcast(.SD, id + rowid(grp) ~ grp, value.var = list('subid','time'), sep = '.')]

这给你:

    id grp subid.1 subid.2 time.1 time.2
 1:  1   1       1       2     16     18
 2:  1   2       1       3     16     20
 3:  1   3       2       3     18     20
 4:  2   4       1       2     10     11
 5:  3   5       1       2      7      9
 6:  3   6       1       3      7     10
 7:  3   7       1       4      7     11
 8:  3   8       2       3      9     10
 9:  3   9       2       4      9     11
10:  3  10       3       4     10     11

忘了说我想出了这个相当蹩脚的 4 步解决方案:

# Step 1: for each id, enumerate every unordered pair of its subid values.
step1 <- lapply(unique(mydata$id), function(x) {
  subids <- unique(mydata$subid[which(mydata$id == x)])
  # A group with fewer than two subids has no pairs; gtools::combinations()
  # would stop with an "r > n" error, so return an empty, correctly typed frame.
  if (length(subids) < 2) {
    return(data.frame(id = x[0], subid.1 = subids[0], subid.2 = subids[0]))
  }
  # Pass the actual subid values through `v`; the default (1..n) is only
  # correct when subid happens to be the consecutive sequence 1..n.
  combos <- gtools::combinations(n = length(subids), r = 2, v = subids)
  data.frame(id = x, subid.1 = combos[, 1], subid.2 = combos[, 2])
})

# Step 2: stack the per-id pair tables into one data frame.
step2 <- dplyr::bind_rows(step1)

# Step 3: look up the time of the first, then of the second member of each
# pair. The second merge sees `time` on both sides, so the suffixes turn the
# two columns into time.1 (first member) and time.2 (second member).
step3a <- merge(step2, mydata,
                by.x = c("id", "subid.1"), by.y = c("id", "subid"))
step3b <- merge(step3a, mydata,
                by.x = c("id", "subid.2"), by.y = c("id", "subid"),
                suffixes = c(".1", ".2"))

# Step 4: put the columns into the target order, selecting by name rather than
# by position so the result does not depend on merge()'s column layout.
step4 <- step3b[, c("id", "subid.1", "subid.2", "time.1", "time.2")]

它很丑但很管用。

基础 R(base R):

# Self-merge on id pairs every row with every other row of the same id; the
# subid.1 < subid.2 filter then keeps each unordered pair exactly once.
# `suffixes` is spelled out in full instead of relying on partial argument
# matching of `suffix`.
subset(
  merge(mydata, mydata, by = "id", suffixes = c(".1", ".2")),
  subid.1 < subid.2
)
#    id subid.1 time.1 subid.2 time.2
# 1   1       1     16       2     18
# 2   1       1     16       3     20
# 3   1       2     18       3     20
# 4   2       1     10       2     11
# 5   3       1      7       2      9
# 6   3       1      7       3     10
# 7   3       1      7       4     11
# 8   3       2      9       3     10
# 9   3       2      9       4     11
# 10  3       3     10       4     11

dplyr版本:

# Self-join on id, then keep each unordered pair once (subid.1 < subid.2).
inner_join(mydata, mydata, by = "id", suffix = c(".1", ".2")) %>%
  filter(subid.1 < subid.2)

data.table版本:

# Convert mydata to a data.table by reference, self-join it on id (a cartesian
# join within each id), then keep only the rows where the left subid is
# smaller than the right one (the i.-prefixed columns come from the right side).
setDT(mydata)
mydata[mydata, on = "id", allow.cartesian = TRUE][
  subid < i.subid
]
#     id subid time i.subid i.time
#  1:  1     1   16       2     18
#  2:  1     1   16       3     20
#  3:  1     2   18       3     20
#  4:  2     1   10       2     11
#  5:  3     1    7       2      9
#  6:  3     1    7       3     10
#  7:  3     2    9       3     10
#  8:  3     1    7       4     11
#  9:  3     2    9       4     11
# 10:  3     3   10       4     11

或者用下面的写法得到正确的列名,不过这样就失去了简短解决方案的乐趣 :)。

# Same self-join through merge(), which produces the .1/.2 column names
# directly. Requires mydata to already be a data.table (e.g. after
# setDT(mydata)): both allow.cartesian and the trailing [subid.1 < subid.2]
# filter are data.table features. `suffixes` is spelled out in full instead of
# relying on partial argument matching of `suffix`.
merge(
  mydata, mydata,
  by = "id", suffixes = c(".1", ".2"), allow.cartesian = TRUE
)[subid.1 < subid.2]