使用 dplyr 将多个列合并在一起
Merging multiple columns together using dplyr
我有一个大约 100 列、100000 行的数据框,想把名称相似的列合并成一列,例如把 C1、C2、C3 合并为 C。我一直在尝试使用 dplyr 的 gather 函数,但始终得不到想要的输出;如果用其他包更容易,我也很乐意。下面附上一个简化的示例。我肯定是漏掉了某个简单的东西,非常感谢任何帮助。
# Build a small wide-format example: three key columns (id, timepoint,
# position) followed by three repeated groups of (C, P, X) columns.
id <- rep(c(222, 333, 444), each = 3)
timepoint <- rep(c("aa", "aa", "bb"), times = 3)
position <- rep(c(1, 2, 1), times = 3)

# First (C, P, X) group.
C1 <- c("aat", "aaf", "bbg", rep(c("aag", "aag", "bbg"), times = 2))
P1 <- c("A", "B", "C", rep("J", 4), rep("H", 2))
X1 <- c(21, 22, 23, 33, 35, 33, 41, 43, 45)

# In this example the second and third groups hold the same values as the
# first, so they are defined by reuse.
C2 <- C1
P2 <- P1
X2 <- X1
C3 <- C1
P3 <- P1
X3 <- X1

# Column names are taken from the variable names passed in.
df <- data.frame(id, timepoint, position, C1, P1, X1, C2, P2, X2, C3, P3, X3)
我想从这个格式开始
id timepoint position C1 P1 X1 C2 P2 X2 C3 P3 X3
222 aa 1 aat A 21 aat A 21 aat A 21
222 aa 2 aaf B 22 aaf B 22 aaf B 22
222 bb 1 bbg C 23 bbg C 23 bbg C 23
333 aa 1 aag J 33 aag J 33 aag J 33
333 aa 2 aag J 35 aag J 35 aag J 35
333 bb 1 bbg J 33 bbg J 33 bbg J 33
444 aa 1 aag J 41 aag J 41 aag J 41
444 aa 2 aag H 43 aag H 43 aag H 43
444 bb 1 bbg H 45 bbg H 45 bbg H 45
到这个格式。
id timepoint position C P X
222 aa 1 aat A 21
222 aa 2 aaf B 22
222 bb 1 bbg C 23
333 aa 1 aag J 33
333 aa 2 aag J 35
333 bb 1 bbg J 33
444 aa 1 aag J 41
444 aa 2 aag H 43
444 bb 1 bbg H 45
222 aa 1 aat A 21
222 aa 2 aaf B 22
222 bb 1 bbg C 23
333 aa 1 aag J 33
333 aa 2 aag J 35
333 bb 1 bbg J 33
444 aa 1 aag J 41
444 aa 2 aag H 43
444 bb 1 bbg H 45
222 aa 1 aat A 21
222 aa 2 aaf B 22
222 bb 1 bbg C 23
333 aa 1 aag J 33
333 aa 2 aag J 35
333 bb 1 bbg J 33
444 aa 1 aag J 41
444 aa 2 aag H 43
444 bb 1 bbg H 45
我们可以使用 data.table 中的 melt 轻松做到这一点,它的 measure 参数可以通过 patterns 同时接受多个匹配模式。
library(data.table)
# Reshape df from wide to long. Each regular expression passed to patterns()
# selects one family of columns (C1..C3, P1..P3, X1..X3), and each family is
# stacked into the single column named by the matching entry of value.name.
# The backslash must be doubled inside an R string literal: "\d" is an
# unrecognized escape and fails at parse time, while "\\d" yields the regex \d.
# setDT() converts df to a data.table by reference (no copy is made).
melt(setDT(df), measure.vars = patterns("^C\\d+", "^P\\d+", "^X\\d+"),
     value.name = c("C", "P", "X"))[, variable := NULL][]
# `variable` (the index of the source column group) is dropped by reference;
# the trailing [] prints the result, which `:=` would otherwise return
# invisibly.
# id timepoint position C P X
# 1: 222 aa 1 aat A 21
# 2: 222 aa 2 aaf B 22
# 3: 222 bb 1 bbg C 23
# 4: 333 aa 1 aag J 33
# 5: 333 aa 2 aag J 35
# 6: 333 bb 1 bbg J 33
# 7: 444 aa 1 aag J 41
# 8: 444 aa 2 aag H 43
# 9: 444 bb 1 bbg H 45
#10: 222 aa 1 aat A 21
#11: 222 aa 2 aaf B 22
#12: 222 bb 1 bbg C 23
#13: 333 aa 1 aag J 33
#14: 333 aa 2 aag J 35
#15: 333 bb 1 bbg J 33
#16: 444 aa 1 aag J 41
#17: 444 aa 2 aag H 43
#18: 444 bb 1 bbg H 45
#19: 222 aa 1 aat A 21
#20: 222 aa 2 aaf B 22
#21: 222 bb 1 bbg C 23
#22: 333 aa 1 aag J 33
#23: 333 aa 2 aag J 35
#24: 333 bb 1 bbg J 33
#25: 444 aa 1 aag J 41
#26: 444 aa 2 aag H 43
#27: 444 bb 1 bbg H 45
我有一个大约 100 列、100000 行的数据框,想把名称相似的列合并成一列,例如把 C1、C2、C3 合并为 C。我一直在尝试使用 dplyr 的 gather 函数,但始终得不到想要的输出;如果用其他包更容易,我也很乐意。下面附上一个简化的示例。我肯定是漏掉了某个简单的东西,非常感谢任何帮助。
# Build a small wide-format example: three key columns (id, timepoint,
# position) followed by three repeated groups of (C, P, X) columns.
id <- rep(c(222, 333, 444), each = 3)
timepoint <- rep(c("aa", "aa", "bb"), times = 3)
position <- rep(c(1, 2, 1), times = 3)

# First (C, P, X) group.
C1 <- c("aat", "aaf", "bbg", rep(c("aag", "aag", "bbg"), times = 2))
P1 <- c("A", "B", "C", rep("J", 4), rep("H", 2))
X1 <- c(21, 22, 23, 33, 35, 33, 41, 43, 45)

# In this example the second and third groups hold the same values as the
# first, so they are defined by reuse.
C2 <- C1
P2 <- P1
X2 <- X1
C3 <- C1
P3 <- P1
X3 <- X1

# Column names are taken from the variable names passed in.
df <- data.frame(id, timepoint, position, C1, P1, X1, C2, P2, X2, C3, P3, X3)
我想从这个格式开始
id timepoint position C1 P1 X1 C2 P2 X2 C3 P3 X3
222 aa 1 aat A 21 aat A 21 aat A 21
222 aa 2 aaf B 22 aaf B 22 aaf B 22
222 bb 1 bbg C 23 bbg C 23 bbg C 23
333 aa 1 aag J 33 aag J 33 aag J 33
333 aa 2 aag J 35 aag J 35 aag J 35
333 bb 1 bbg J 33 bbg J 33 bbg J 33
444 aa 1 aag J 41 aag J 41 aag J 41
444 aa 2 aag H 43 aag H 43 aag H 43
444 bb 1 bbg H 45 bbg H 45 bbg H 45
到这个格式。
id timepoint position C P X
222 aa 1 aat A 21
222 aa 2 aaf B 22
222 bb 1 bbg C 23
333 aa 1 aag J 33
333 aa 2 aag J 35
333 bb 1 bbg J 33
444 aa 1 aag J 41
444 aa 2 aag H 43
444 bb 1 bbg H 45
222 aa 1 aat A 21
222 aa 2 aaf B 22
222 bb 1 bbg C 23
333 aa 1 aag J 33
333 aa 2 aag J 35
333 bb 1 bbg J 33
444 aa 1 aag J 41
444 aa 2 aag H 43
444 bb 1 bbg H 45
222 aa 1 aat A 21
222 aa 2 aaf B 22
222 bb 1 bbg C 23
333 aa 1 aag J 33
333 aa 2 aag J 35
333 bb 1 bbg J 33
444 aa 1 aag J 41
444 aa 2 aag H 43
444 bb 1 bbg H 45
我们可以使用 data.table 中的 melt 轻松做到这一点,它的 measure 参数可以通过 patterns 同时接受多个匹配模式。
library(data.table)
# Reshape df from wide to long. Each regular expression passed to patterns()
# selects one family of columns (C1..C3, P1..P3, X1..X3), and each family is
# stacked into the single column named by the matching entry of value.name.
# The backslash must be doubled inside an R string literal: "\d" is an
# unrecognized escape and fails at parse time, while "\\d" yields the regex \d.
# setDT() converts df to a data.table by reference (no copy is made).
melt(setDT(df), measure.vars = patterns("^C\\d+", "^P\\d+", "^X\\d+"),
     value.name = c("C", "P", "X"))[, variable := NULL][]
# `variable` (the index of the source column group) is dropped by reference;
# the trailing [] prints the result, which `:=` would otherwise return
# invisibly.
# id timepoint position C P X
# 1: 222 aa 1 aat A 21
# 2: 222 aa 2 aaf B 22
# 3: 222 bb 1 bbg C 23
# 4: 333 aa 1 aag J 33
# 5: 333 aa 2 aag J 35
# 6: 333 bb 1 bbg J 33
# 7: 444 aa 1 aag J 41
# 8: 444 aa 2 aag H 43
# 9: 444 bb 1 bbg H 45
#10: 222 aa 1 aat A 21
#11: 222 aa 2 aaf B 22
#12: 222 bb 1 bbg C 23
#13: 333 aa 1 aag J 33
#14: 333 aa 2 aag J 35
#15: 333 bb 1 bbg J 33
#16: 444 aa 1 aag J 41
#17: 444 aa 2 aag H 43
#18: 444 bb 1 bbg H 45
#19: 222 aa 1 aat A 21
#20: 222 aa 2 aaf B 22
#21: 222 bb 1 bbg C 23
#22: 333 aa 1 aag J 33
#23: 333 aa 2 aag J 35
#24: 333 bb 1 bbg J 33
#25: 444 aa 1 aag J 41
#26: 444 aa 2 aag H 43
#27: 444 bb 1 bbg H 45