reshape2:当一个单元格对应多个值且需要保留所有这些值时如何使用 dcast
reshape2: dcast when there are multiple values for one cell, but keep all of these values
我有一个这样的数据框:
HLA_Status variable value
1 PP CCL24 9.645
2 PP CCL24 56.32
3 PP CCL24 7.268
4 PC CCL24 5.698
5 PC CCL24 89.457
6 PC CCL24 78.23
7 PP SPP1 23.12
8 PP SPP1 36.32
9 PP SPP1 17.268
10 PC SPP1 2.698
11 PC SPP1 9.457
12 PC SPP1 8.23
我想用 reshape2::dcast() 重塑我的数据框并得到这个:
HLA_Status CCL24 SPP1
1 PP 9.645 23.12
2 PP 56.32 36.32
3 PP 7.268 17.268
13 PC 5.698 2.698
14 PC 89.457 9.457
15 PC 78.230 8.23
但我没能做到这一点。
我试过这个:
dcast(mydt, HLA_Status ~ variable, value.var = "value")
但是没有用。
而且我在 reshape2 的文档中看到,如果我们每个单元格有多个值,我们需要告诉 dcast 如何聚合数据。
我想我的问题是不知道给 fun.aggregate 什么。
如何使用 reshape2 或其他包获得想要的数据框?
我们可以使用 tidyr 中的 spread:
library(dplyr)
library(tidyr)
# Number the repeated measurements inside each HLA_Status/variable pair so
# that every (HLA_Status, row) combination maps to exactly one value, then
# widen. pivot_wider() (tidyr >= 1.0.0) is the maintained replacement for the
# superseded spread().
df %>%
  group_by(HLA_Status, variable) %>%
  mutate(row = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  # spread() returned rows sorted by the remaining key columns; sort the same
  # way so the result matches the output shown below.
  arrange(HLA_Status, row) %>%
  select(-row)
# A tibble: 6 x 3
# HLA_Status CCL24 SPP1
# <fct> <dbl> <dbl>
#1 PC 5.70 2.70
#2 PC 89.5 9.46
#3 PC 78.2 8.23
#4 PP 9.64 23.1
#5 PP 56.3 36.3
#6 PP 7.27 17.3
数据
df <- structure(list(HLA_Status = structure(c(2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L), .Label = c("PC", "PP"), class = "factor"),
variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("CCL24", "SPP1"), class = "factor"),
value = c(9.645, 56.32, 7.268, 5.698, 89.457, 78.23, 23.12,
36.32, 17.268, 2.698, 9.457, 8.23)), class = "data.frame", row.names =
c(NA, -12L))
我强烈建议改用 tidyr,而不是 reshape2。但是,如果你确实想使用 dcast,可以这样做:
library(dplyr)
library(reshape2)

# Long-format example data: three measurements for every
# HLA_Status/variable combination.
df <- data.frame(
  HLA_Status = rep(c("PP", "PC", "PP", "PC"), each = 3L),
  variable = rep(c("CCL24", "SPP1"), each = 6L),
  value = c(
    9.645, 56.32, 7.268, 5.698, 89.457, 78.23,
    23.12, 36.32, 17.268, 2.698, 9.457, 8.23
  ),
  stringsAsFactors = FALSE
)

# dcast() needs a unique key per output cell, so number the repeats within
# each group, cast on HLA_Status + id, and drop the helper column afterwards.
df %>%
  group_by(variable, HLA_Status) %>%
  mutate(id = seq_len(n())) %>%
  dcast(HLA_Status + id ~ variable, value.var = "value") %>%
  select(-id)
HLA_Status CCL24 SPP1
1 PC 5.698 2.698
2 PC 89.457 9.457
3 PC 78.230 8.230
4 PP 9.645 23.120
5 PP 56.320 36.320
6 PP 7.268 17.268
这可以通过 dcast(此处来自 data.table)完成,但您需要一个行标识符。
library(data.table)
# rowid() numbers the repeats within each HLA_Status/variable pair, which
# gives dcast() the unique row identifier it needs to keep every value.
dcast(
  dt,
  HLA_Status + rowid(HLA_Status, variable) ~ variable,
  value.var = "value"
)
# HLA_Status HLA_Status_1 CCL24 SPP1
#1: PC 1 5.698 2.698
#2: PC 2 89.457 9.457
#3: PC 3 78.230 8.230
#4: PP 1 9.645 23.120
#5: PP 2 56.320 36.320
#6: PP 3 7.268 17.268
数据
dt <- fread(" HLA_Status variable value
PP CCL24 9.645
PP CCL24 56.32
PP CCL24 7.268
PC CCL24 5.698
PC CCL24 89.457
PC CCL24 78.23
PP SPP1 23.12
PP SPP1 36.32
PP SPP1 17.268
PC SPP1 2.698
PC SPP1 9.457
PC SPP1 8.23")
如果确实需要 reshape2::dcast,则可以使用基于 ave 的解决方案(关于如何获取行标识符,请参阅 @markus 的回答):
# ave() numbers the rows within each group defined by the first two columns
# of d (HLA_Status and variable); that running index makes every output cell
# unique, so dcast() has nothing to aggregate.
# The formula term is kept exactly as written because its text is used as the
# column header in the printed result.
reshape2::dcast(d, HLA_Status + ave(rep(1, nrow(d)), d[1:2], FUN=seq) ~ variable)
# HLA_Status ave(rep(1, nrow(d)), d[1:2], FUN = seq) CCL24 SPP1
# 1 PC 1 5.698 2.698
# 2 PC 2 89.457 9.457
# 3 PC 3 78.230 8.230
# 4 PP 1 9.645 23.120
# 5 PP 2 56.320 36.320
# 6 PP 3 7.268 17.268
数据
d <- structure(list(HLA_Status = c("PP", "PP", "PP", "PC", "PC", "PC",
"PP", "PP", "PP", "PC", "PC", "PC"), variable = c("CCL24", "CCL24",
"CCL24", "CCL24", "CCL24", "CCL24", "SPP1", "SPP1", "SPP1", "SPP1",
"SPP1", "SPP1"), value = c(9.645, 56.32, 7.268, 5.698, 89.457,
78.23, 23.12, 36.32, 17.268, 2.698, 9.457, 8.23)), row.names = c(NA,
-12L), class = "data.frame")
我有一个这样的数据框:
HLA_Status variable value
1 PP CCL24 9.645
2 PP CCL24 56.32
3 PP CCL24 7.268
4 PC CCL24 5.698
5 PC CCL24 89.457
6 PC CCL24 78.23
7 PP SPP1 23.12
8 PP SPP1 36.32
9 PP SPP1 17.268
10 PC SPP1 2.698
11 PC SPP1 9.457
12 PC SPP1 8.23
我想用 reshape2::dcast() 重塑我的数据框并得到这个:
HLA_Status CCL24 SPP1
1 PP 9.645 23.12
2 PP 56.32 36.32
3 PP 7.268 17.268
13 PC 5.698 2.698
14 PC 89.457 9.457
15 PC 78.230 8.23
但我没能做到这一点。
我试过这个:
dcast(mydt, HLA_Status ~ variable, value.var = "value")
但是没有用。
而且我在 reshape2 的文档中看到,如果我们每个单元格有多个值,我们需要告诉 dcast 如何聚合数据。
我想我的问题是不知道给 fun.aggregate 什么。
如何使用 reshape2 或其他包获得想要的数据框?
我们可以使用 tidyr 中的 spread:
library(dplyr)
library(tidyr)
# Number the repeated measurements inside each HLA_Status/variable pair so
# that every (HLA_Status, row) combination maps to exactly one value, then
# widen. pivot_wider() (tidyr >= 1.0.0) is the maintained replacement for the
# superseded spread().
df %>%
  group_by(HLA_Status, variable) %>%
  mutate(row = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  # spread() returned rows sorted by the remaining key columns; sort the same
  # way so the result matches the output shown below.
  arrange(HLA_Status, row) %>%
  select(-row)
# A tibble: 6 x 3
# HLA_Status CCL24 SPP1
# <fct> <dbl> <dbl>
#1 PC 5.70 2.70
#2 PC 89.5 9.46
#3 PC 78.2 8.23
#4 PP 9.64 23.1
#5 PP 56.3 36.3
#6 PP 7.27 17.3
数据
df <- structure(list(HLA_Status = structure(c(2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L), .Label = c("PC", "PP"), class = "factor"),
variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("CCL24", "SPP1"), class = "factor"),
value = c(9.645, 56.32, 7.268, 5.698, 89.457, 78.23, 23.12,
36.32, 17.268, 2.698, 9.457, 8.23)), class = "data.frame", row.names =
c(NA, -12L))
我强烈建议改用 tidyr,而不是 reshape2。但是,如果你确实想使用 dcast,可以这样做:
library(dplyr)
library(reshape2)

# Long-format example data: three measurements for every
# HLA_Status/variable combination.
df <- data.frame(
  HLA_Status = rep(c("PP", "PC", "PP", "PC"), each = 3L),
  variable = rep(c("CCL24", "SPP1"), each = 6L),
  value = c(
    9.645, 56.32, 7.268, 5.698, 89.457, 78.23,
    23.12, 36.32, 17.268, 2.698, 9.457, 8.23
  ),
  stringsAsFactors = FALSE
)

# dcast() needs a unique key per output cell, so number the repeats within
# each group, cast on HLA_Status + id, and drop the helper column afterwards.
df %>%
  group_by(variable, HLA_Status) %>%
  mutate(id = seq_len(n())) %>%
  dcast(HLA_Status + id ~ variable, value.var = "value") %>%
  select(-id)
HLA_Status CCL24 SPP1
1 PC 5.698 2.698
2 PC 89.457 9.457
3 PC 78.230 8.230
4 PP 9.645 23.120
5 PP 56.320 36.320
6 PP 7.268 17.268
这可以通过 dcast(此处来自 data.table)完成,但您需要一个行标识符。
library(data.table)
# rowid() numbers the repeats within each HLA_Status/variable pair, which
# gives dcast() the unique row identifier it needs to keep every value.
dcast(
  dt,
  HLA_Status + rowid(HLA_Status, variable) ~ variable,
  value.var = "value"
)
# HLA_Status HLA_Status_1 CCL24 SPP1
#1: PC 1 5.698 2.698
#2: PC 2 89.457 9.457
#3: PC 3 78.230 8.230
#4: PP 1 9.645 23.120
#5: PP 2 56.320 36.320
#6: PP 3 7.268 17.268
数据
dt <- fread(" HLA_Status variable value
PP CCL24 9.645
PP CCL24 56.32
PP CCL24 7.268
PC CCL24 5.698
PC CCL24 89.457
PC CCL24 78.23
PP SPP1 23.12
PP SPP1 36.32
PP SPP1 17.268
PC SPP1 2.698
PC SPP1 9.457
PC SPP1 8.23")
如果确实需要 reshape2::dcast,则可以使用基于 ave 的解决方案(关于如何获取行标识符,请参阅 @markus 的回答):
# ave() numbers the rows within each group defined by the first two columns
# of d (HLA_Status and variable); that running index makes every output cell
# unique, so dcast() has nothing to aggregate.
# The formula term is kept exactly as written because its text is used as the
# column header in the printed result.
reshape2::dcast(d, HLA_Status + ave(rep(1, nrow(d)), d[1:2], FUN=seq) ~ variable)
# HLA_Status ave(rep(1, nrow(d)), d[1:2], FUN = seq) CCL24 SPP1
# 1 PC 1 5.698 2.698
# 2 PC 2 89.457 9.457
# 3 PC 3 78.230 8.230
# 4 PP 1 9.645 23.120
# 5 PP 2 56.320 36.320
# 6 PP 3 7.268 17.268
数据
d <- structure(list(HLA_Status = c("PP", "PP", "PP", "PC", "PC", "PC",
"PP", "PP", "PP", "PC", "PC", "PC"), variable = c("CCL24", "CCL24",
"CCL24", "CCL24", "CCL24", "CCL24", "SPP1", "SPP1", "SPP1", "SPP1",
"SPP1", "SPP1"), value = c(9.645, 56.32, 7.268, 5.698, 89.457,
78.23, 23.12, 36.32, 17.268, 2.698, 9.457, 8.23)), row.names = c(NA,
-12L), class = "data.frame")