根据 R 中其他列中的重复值将值粘贴到数据框中
Paste values in a data frame based on duplicated values in other columns in R
我有一个数据框如下
dt <- data.frame(genotype = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X1", "X2", "X3", "X4",
"X5", "X6", "X7", "X8", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"),
variable = c("A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B",
"B", "B", "B", "B", "C", "C", "C", "C", "C", "C", "C", "C"),
value = c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 3L, 3L, 4L, 5L, 5L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), stringsAsFactors = F)
dt
genotype variable value
1 X1 A 1
2 X2 A 1
3 X3 A 2
4 X4 A 3
5 X5 A 4
6 X6 A 5
7 X7 A 6
8 X8 A 7
9 X1 B 1
10 X2 B 2
11 X3 B 3
12 X4 B 3
13 X5 B 3
14 X6 B 4
15 X7 B 5
16 X8 B 5
17 X1 C 1
18 X2 C 2
19 X3 C 3
20 X4 C 4
21 X5 C 5
22 X6 C 6
23 X7 C 7
24 X8 C 8
我想根据每个变量值列中的重复将基因型数据粘贴在一起来创建一个新列。
想要的输出如下。
out <- data.frame(genotype = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X1", "X2", "X3", "X4",
"X5", "X6", "X7", "X8", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"),
variable = c("A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B",
"B", "B", "B", "B", "C", "C", "C", "C", "C", "C", "C", "C"),
value = c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 3L, 3L, 4L, 5L,
5L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L),
lab = c("X1, X2", "X1, X2", "X3", "X4", "X5", "X6", "X7", "X8", "X1",
"X2", "X3, X4, X5", "X3, X4, X5", "X3, X4, X5", "X6", "X7, X7",
"X8, X7", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"), stringsAsFactors = F)
out
genotype variable value lab
1 X1 A 1 X1, X2
2 X2 A 1 X1, X2
3 X3 A 2 X3
4 X4 A 3 X4
5 X5 A 4 X5
6 X6 A 5 X6
7 X7 A 6 X7
8 X8 A 7 X8
9 X1 B 1 X1
10 X2 B 2 X2
11 X3 B 3 X3, X4, X5
12 X4 B 3 X3, X4, X5
13 X5 B 3 X3, X4, X5
14 X6 B 4 X6
15 X7 B 5 X7, X7
16 X8 B 5 X8, X7
17 X1 C 1 X1
18 X2 C 2 X2
19 X3 C 3 X3
20 X4 C 4 X4
21 X5 C 5 X5
22 X6 C 6 X6
23 X7 C 7 X7
24 X8 C 8 X8
我尝试按如下方式使用 aggregate
,但由于重复值丢失而未能获得所需结果。
cons <- aggregate(. ~value+variable, data=dt,
function(x) paste(unique(x), collapse = ","))
cons
value variable genotype
1 1 A X1,X2
2 2 A X3
3 3 A X4
4 4 A X5
5 5 A X6
6 6 A X7
7 7 A X8
8 1 B X1
9 2 B X2
10 3 B X3,X4,X5
11 4 B X6
12 5 B X7,X8
13 1 C X1
14 2 C X2
15 3 C X3
16 4 C X4
17 5 C X5
18 6 C X6
19 7 C X7
20 8 C X8
如何最好地在 base R 中获得所需的输出?
您可以使用 dplyr
轻松完成。
library(dplyr)
dt %>% group_by(variable, value) %>%
mutate(lab = toString(genotype)) %>%
as.data.frame()
# genotype variable value lab
# 1 X1 A 1 X1, X2
# 2 X2 A 1 X1, X2
# 3 X3 A 2 X3
# 4 X4 A 3 X4
# 5 X5 A 4 X5
# 6 X6 A 5 X6
# 7 X7 A 6 X7
# 8 X8 A 7 X8
# 9 X1 B 1 X1
# 10 X2 B 2 X2
# 11 X3 B 3 X3, X4, X5
# 12 X4 B 3 X3, X4, X5
# 13 X5 B 3 X3, X4, X5
# 14 X6 B 4 X6
# 15 X7 B 5 X7, X8
# 16 X8 B 5 X7, X8
# 17 X1 C 1 X1
# 18 X2 C 2 X2
# 19 X3 C 3 X3
# 20 X4 C 4 X4
# 21 X5 C 5 X5
# 22 X6 C 6 X6
# 23 X7 C 7 X7
# 24 X8 C 8 X8
已编辑: 根据 @markus
的建议,可以在 base-R
中使用 transform
函数
transform(dt, lab = ave(genotype, variable, value, FUN = toString))
aggregate
没有任何问题,只要您在后面加上 merge
以恢复重复的行。
res <- aggregate(genotype ~ variable + value, dt, paste, collapse = ", ")
res <- merge(dt, res, by = c("value", "variable"))[-3]
names(res)[3] <- "genotype"
head(res, 15)
# value variable genotype
#1 1 A X1, X2
#2 1 A X1, X2
#3 1 B X1
#4 1 C X1
#5 2 A X3
#6 2 B X2
#7 2 C X2
#8 3 A X4
#9 3 B X3, X4, X5
#10 3 B X3, X4, X5
#11 3 B X3, X4, X5
#12 3 C X3
#13 4 A X5
#14 4 B X6
#15 4 C X4
我有一个数据框如下
dt <- data.frame(genotype = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X1", "X2", "X3", "X4",
"X5", "X6", "X7", "X8", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"),
variable = c("A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B",
"B", "B", "B", "B", "C", "C", "C", "C", "C", "C", "C", "C"),
value = c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 3L, 3L, 4L, 5L, 5L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), stringsAsFactors = F)
dt
genotype variable value
1 X1 A 1
2 X2 A 1
3 X3 A 2
4 X4 A 3
5 X5 A 4
6 X6 A 5
7 X7 A 6
8 X8 A 7
9 X1 B 1
10 X2 B 2
11 X3 B 3
12 X4 B 3
13 X5 B 3
14 X6 B 4
15 X7 B 5
16 X8 B 5
17 X1 C 1
18 X2 C 2
19 X3 C 3
20 X4 C 4
21 X5 C 5
22 X6 C 6
23 X7 C 7
24 X8 C 8
我想根据每个变量值列中的重复将基因型数据粘贴在一起来创建一个新列。
想要的输出如下。
out <- data.frame(genotype = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X1", "X2", "X3", "X4",
"X5", "X6", "X7", "X8", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"),
variable = c("A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B",
"B", "B", "B", "B", "C", "C", "C", "C", "C", "C", "C", "C"),
value = c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 3L, 3L, 4L, 5L,
5L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L),
lab = c("X1, X2", "X1, X2", "X3", "X4", "X5", "X6", "X7", "X8", "X1",
"X2", "X3, X4, X5", "X3, X4, X5", "X3, X4, X5", "X6", "X7, X7",
"X8, X7", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"), stringsAsFactors = F)
out
genotype variable value lab
1 X1 A 1 X1, X2
2 X2 A 1 X1, X2
3 X3 A 2 X3
4 X4 A 3 X4
5 X5 A 4 X5
6 X6 A 5 X6
7 X7 A 6 X7
8 X8 A 7 X8
9 X1 B 1 X1
10 X2 B 2 X2
11 X3 B 3 X3, X4, X5
12 X4 B 3 X3, X4, X5
13 X5 B 3 X3, X4, X5
14 X6 B 4 X6
15 X7 B 5 X7, X7
16 X8 B 5 X8, X7
17 X1 C 1 X1
18 X2 C 2 X2
19 X3 C 3 X3
20 X4 C 4 X4
21 X5 C 5 X5
22 X6 C 6 X6
23 X7 C 7 X7
24 X8 C 8 X8
我尝试按如下方式使用 aggregate
,但由于重复值丢失而未能获得所需结果。
cons <- aggregate(. ~value+variable, data=dt,
function(x) paste(unique(x), collapse = ","))
cons
value variable genotype
1 1 A X1,X2
2 2 A X3
3 3 A X4
4 4 A X5
5 5 A X6
6 6 A X7
7 7 A X8
8 1 B X1
9 2 B X2
10 3 B X3,X4,X5
11 4 B X6
12 5 B X7,X8
13 1 C X1
14 2 C X2
15 3 C X3
16 4 C X4
17 5 C X5
18 6 C X6
19 7 C X7
20 8 C X8
如何最好地在 base R 中获得所需的输出?
您可以使用 dplyr
轻松完成。
library(dplyr)
dt %>% group_by(variable, value) %>%
mutate(lab = toString(genotype)) %>%
as.data.frame()
# genotype variable value lab
# 1 X1 A 1 X1, X2
# 2 X2 A 1 X1, X2
# 3 X3 A 2 X3
# 4 X4 A 3 X4
# 5 X5 A 4 X5
# 6 X6 A 5 X6
# 7 X7 A 6 X7
# 8 X8 A 7 X8
# 9 X1 B 1 X1
# 10 X2 B 2 X2
# 11 X3 B 3 X3, X4, X5
# 12 X4 B 3 X3, X4, X5
# 13 X5 B 3 X3, X4, X5
# 14 X6 B 4 X6
# 15 X7 B 5 X7, X8
# 16 X8 B 5 X7, X8
# 17 X1 C 1 X1
# 18 X2 C 2 X2
# 19 X3 C 3 X3
# 20 X4 C 4 X4
# 21 X5 C 5 X5
# 22 X6 C 6 X6
# 23 X7 C 7 X7
# 24 X8 C 8 X8
已编辑: 根据 @markus
的建议,可以在 base-R
transform
函数
transform(dt, lab = ave(genotype, variable, value, FUN = toString))
aggregate
没有任何问题,只要您在后面加上 merge
以恢复重复的行。
res <- aggregate(genotype ~ variable + value, dt, paste, collapse = ", ")
res <- merge(dt, res, by = c("value", "variable"))[-3]
names(res)[3] <- "genotype"
head(res, 15)
# value variable genotype
#1 1 A X1, X2
#2 1 A X1, X2
#3 1 B X1
#4 1 C X1
#5 2 A X3
#6 2 B X2
#7 2 C X2
#8 3 A X4
#9 3 B X3, X4, X5
#10 3 B X3, X4, X5
#11 3 B X3, X4, X5
#12 3 C X3
#13 4 A X5
#14 4 B X6
#15 4 C X4