reshape2:当一个单元格对应多个值时,如何用 dcast 重塑并保留所有这些值

reshape2: dcast when there are multiple values for one cell, but keep all of these values

我有一个这样的数据框:

    HLA_Status    variable      value
1     PP            CCL24       9.645
2     PP            CCL24       56.32
3     PP            CCL24       7.268
4     PC            CCL24       5.698
5     PC            CCL24       89.457
6     PC            CCL24       78.23
7     PP            SPP1        23.12
8     PP            SPP1        36.32
9     PP            SPP1        17.268
10    PC            SPP1        2.698
11    PC            SPP1        9.457
12    PC            SPP1        8.23

我想用 reshape2::dcast() 重塑我的数据框并得到这个:

   HLA_Status        CCL24        SPP1
1      PP            9.645       23.12
2      PP            56.32       36.32
3      PP            7.268       17.268
13     PC            5.698       2.698
14     PC            89.457      9.457
15     PC            78.230      8.23

但我没能做到这一点。

我试过这个:

# Attempt from the question (assuming mydt is the long data frame shown
# earlier): each HLA_Status/variable pair holds three values, so this formula
# alone does not identify a single value per output cell.
dcast(mydt, HLA_Status ~ variable, value.var = "value")

但是没有用。

而且我在 reshape2 的文档中看到,如果我们每个单元格有多个值,我们需要告诉 dcast 如何聚合数据。

我想我的问题是不知道给 fun.aggregate 什么。

如何使用 reshape2 或其他包获得想要的数据框?

我们可以使用 tidyr 中的 spread:

library(dplyr)
library(tidyr)

# Reshape long -> wide while keeping every measurement.
# Note: spread() is superseded in current tidyr; pivot_wider() is the modern
# replacement (its row order may differ from what spread() returns).
df %>%
  # Work within each HLA_Status/variable pair ...
  group_by(HLA_Status, variable) %>%
  # ... and number its repeated measurements, so that HLA_Status + obs_id
  # identifies exactly one value per variable.
  mutate(obs_id = row_number()) %>%
  # One column per level of `variable`, filled from `value`.
  spread(key = variable, value = value) %>%
  ungroup() %>%
  # The helper index is only needed for the reshape itself.
  select(-obs_id)

# A tibble: 6 x 3
#  HLA_Status CCL24  SPP1
#  <fct>     <dbl> <dbl>
#1   PC       5.70  2.70
#2   PC       89.5  9.46
#3   PC       78.2  8.23
#4   PP       9.64  23.1 
#5   PP       56.3  36.3 
#6   PP       7.27  17.3 

数据

# Example data in long format: two HLA_Status groups, two variables and three
# measurements per group/variable pair. Both identifier columns are factors
# with alphabetically ordered levels.
df <- data.frame(
  HLA_Status = factor(
    rep(rep(c("PP", "PC"), each = 3L), times = 2L),
    levels = c("PC", "PP")
  ),
  variable = factor(
    rep(c("CCL24", "SPP1"), each = 6L),
    levels = c("CCL24", "SPP1")
  ),
  value = c(
    9.645, 56.32, 7.268, 5.698, 89.457, 78.23,
    23.12, 36.32, 17.268, 2.698, 9.457, 8.23
  )
)

我强烈建议改用 tidyr,而不是继续使用 reshape2。不过,如果你确实想用 dcast,可以这样做:

library(dplyr)
library(reshape2)
# Example data in long format with plain character identifier columns:
# two HLA_Status groups x two variables x three measurements each.
df <- data.frame(
  HLA_Status = rep(rep(c("PP", "PC"), each = 3L), times = 2L),
  variable = rep(c("CCL24", "SPP1"), each = 6L),
  value = c(
    9.645, 56.32, 7.268, 5.698, 89.457, 78.23,
    23.12, 36.32, 17.268, 2.698, 9.457, 8.23
  ),
  stringsAsFactors = FALSE
)


# Reshape long -> wide with reshape2::dcast while keeping every measurement.
df %>%
  # Number the repeated measurements inside each variable/HLA_Status pair.
  group_by(variable, HLA_Status) %>%
  mutate(obs_id = row_number()) %>%
  # HLA_Status + obs_id now identifies one value per variable, so dcast has
  # nothing to aggregate.
  dcast(formula = HLA_Status + obs_id ~ variable, value.var = "value") %>%
  # Drop the helper index once the reshape is done.
  select(-obs_id)

  HLA_Status  CCL24   SPP1
1         PC  5.698  2.698
2         PC 89.457  9.457
3         PC 78.230  8.230
4         PP  9.645 23.120
5         PP 56.320 36.320
6         PP  7.268 17.268

这可以通过 dcast(此处来自 data.table)完成,但您需要一个行标识符。

library(data.table)
# rowid(HLA_Status, variable) numbers the repeated rows within each
# HLA_Status/variable pair, which gives dcast the row identifier it needs to
# place one value per output cell. value.var is named explicitly, as in the
# other dcast calls, instead of relying on dcast guessing the value column.
dcast(
  dt,
  HLA_Status + rowid(HLA_Status, variable) ~ variable,
  value.var = "value"
)
#   HLA_Status HLA_Status_1  CCL24   SPP1
#1:         PC            1  5.698  2.698
#2:         PC            2 89.457  9.457
#3:         PC            3 78.230  8.230
#4:         PP            1  9.645 23.120
#5:         PP            2 56.320 36.320
#6:         PP            3  7.268 17.268

数据

# Example data: the long-format table is parsed by fread (data.table) from an
# inline, whitespace-separated string whose first line holds the column names.
dt <- fread("    HLA_Status    variable      value
     PP            CCL24       9.645
     PP            CCL24       56.32
     PP            CCL24       7.268
     PC            CCL24       5.698
     PC            CCL24       89.457
     PC            CCL24       78.23
     PP            SPP1        23.12
     PP            SPP1        36.32
     PP            SPP1        17.268
     PC            SPP1        2.698
     PC            SPP1        9.457
     PC            SPP1        8.23")

如果确实需要 reshape2::dcast,则有 ave 的解决方案(要获取标识符,请参阅 @markus 的回答):

# ave() applies seq within each group defined by the first two columns of d
# (HLA_Status and variable), producing a running index per group. Adding that
# index to the left-hand side gives every value its own output row.
# value.var is named explicitly, as in the other dcast calls, instead of
# relying on dcast guessing the value column.
reshape2::dcast(
  d,
  HLA_Status + ave(rep(1, nrow(d)), d[1:2], FUN = seq) ~ variable,
  value.var = "value"
)
#   HLA_Status ave(rep(1, nrow(d)), d[1:2], FUN = seq)  CCL24   SPP1
# 1         PC                                       1  5.698  2.698
# 2         PC                                       2 89.457  9.457
# 3         PC                                       3 78.230  8.230
# 4         PP                                       1  9.645 23.120
# 5         PP                                       2 56.320 36.320
# 6         PP                                       3  7.268 17.268

数据

# Example data in long format (character identifier columns): two HLA_Status
# groups, two variables, three measurements per group/variable pair.
d <- data.frame(
  HLA_Status = rep(rep(c("PP", "PC"), each = 3L), times = 2L),
  variable = rep(c("CCL24", "SPP1"), each = 6L),
  value = c(
    9.645, 56.32, 7.268, 5.698, 89.457, 78.23,
    23.12, 36.32, 17.268, 2.698, 9.457, 8.23
  ),
  stringsAsFactors = FALSE
)