根据部分列名组合列
Combining columns based on partial column name
我希望有一种更简单/更快/更简洁的方式来完成我想做的事,因为我目前的做法太复杂了:
我的列名是:
"OTS_SM0_1","OTS_SM0_2","OTS_SM0_3","OTS_SM0_4","OTS_SM0_5","OTS_SM0_6",
"OTS_SM0_7","OTS_SM0_8","OTS_SM0_9",
"OTS_SM1_x1_4","OTS_SM1_x1_6","OTS_SM1_x1_7","OTS_SM1_x1_8",
"OTS_SM1_x2_4","OTS_SM1_x2_6","OTS_SM1_x2_7","OTS_SM1_x2_8",
"OTS_SM1_x3_4","OTS_SM1_x3_6","OTS_SM1_x3_7","OTS_SM1_x3_8",
"OTS_SM1_x4_4","OTS_SM1_x4_6","OTS_SM1_x4_7","OTS_SM1_x4_8",
"OTS_SM1_x5_4","OTS_SM1_x5_6","OTS_SM1_x5_7","OTS_SM1_x5_8",
"OTS_SM1_x6_4","OTS_SM1_x6_6","OTS_SM1_x6_7","OTS_SM1_x6_8",
"OTS_SM1_x7_4","OTS_SM1_x7_6","OTS_SM1_x7_7","OTS_SM1_x7_8",
"OTS_SM1_x8_4","OTS_SM1_x8_6","OTS_SM1_x8_7","OTS_SM1_x8_8",
"OTS_SM1_x9_4","OTS_SM1_x9_6","OTS_SM1_x9_7","OTS_SM1_x9_8",
"OTS_SM2_x1","OTS_SM2_x2","OTS_SM2_x3","OTS_SM2_x4","OTS_SM2_x5",
"OTS_SM2_x6","OTS_SM2_x7","OTS_SM2_x8","OTS_SM2_x9"
我需要根据名称将他们的条目连接成一列。这些是需要合并的名称:
OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6
OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4
OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
但是要注意的是,这些名称并不总是相同的,只是 OTS_SM 部分将保持不变,要组合的列数以及它们在数据框中的索引将发生变化。
我目前的解决方案是:
# Column names that have to be grouped by their shared prefix.
columnnames <- c(
  "OTS_SM0_1", "OTS_SM0_2", "OTS_SM0_3", "OTS_SM0_4", "OTS_SM0_5",
  "OTS_SM0_6", "OTS_SM0_7", "OTS_SM0_8", "OTS_SM0_9",
  "OTS_SM1_x1_4", "OTS_SM1_x1_6", "OTS_SM1_x1_7", "OTS_SM1_x1_8",
  "OTS_SM1_x2_4", "OTS_SM1_x2_6", "OTS_SM1_x2_7", "OTS_SM1_x2_8",
  "OTS_SM1_x3_4", "OTS_SM1_x3_6", "OTS_SM1_x3_7", "OTS_SM1_x3_8",
  "OTS_SM1_x4_4", "OTS_SM1_x4_6", "OTS_SM1_x4_7", "OTS_SM1_x4_8",
  "OTS_SM1_x5_4", "OTS_SM1_x5_6", "OTS_SM1_x5_7", "OTS_SM1_x5_8",
  "OTS_SM1_x6_4", "OTS_SM1_x6_6", "OTS_SM1_x6_7", "OTS_SM1_x6_8",
  "OTS_SM1_x7_4", "OTS_SM1_x7_6", "OTS_SM1_x7_7", "OTS_SM1_x7_8",
  "OTS_SM1_x8_4", "OTS_SM1_x8_6", "OTS_SM1_x8_7", "OTS_SM1_x8_8",
  "OTS_SM1_x9_4", "OTS_SM1_x9_6", "OTS_SM1_x9_7", "OTS_SM1_x9_8",
  "OTS_SM2_x1", "OTS_SM2_x2", "OTS_SM2_x3", "OTS_SM2_x4", "OTS_SM2_x5",
  "OTS_SM2_x6", "OTS_SM2_x7", "OTS_SM2_x8", "OTS_SM2_x9"
)

# Names that contain "OTS_SM" but no "x" anywhere (negative lookahead needs
# perl = TRUE); their group label is the first 7 characters.
names1_index <- which(grepl("^(?!.*x).*OTS_SM", columnnames, perl = TRUE))
names1 <- substr(columnnames[names1_index], 1, 7)

# Names with an "_x" part somewhere after "OTS_SM"; their group label is the
# first 10 characters.
names2_index <- which(grepl("OTS_SM.*_x", columnnames))
names2 <- substr(columnnames[names2_index], 1, 10)
输出如下:
> names1
"OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0"
"OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0"
> names2
"OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x2"
"OTS_SM1_x2" "OTS_SM1_x2" "OTS_SM1_x2" "OTS_SM1_x3" "OTS_SM1_x3"
"OTS_SM1_x3" "OTS_SM1_x3" "OTS_SM1_x4" "OTS_SM1_x4" "OTS_SM1_x4"
"OTS_SM1_x4" "OTS_SM1_x5" "OTS_SM1_x5" "OTS_SM1_x5" "OTS_SM1_x5"
"OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x7"
"OTS_SM1_x7" "OTS_SM1_x7" "OTS_SM1_x7" "OTS_SM1_x8" "OTS_SM1_x8"
"OTS_SM1_x8" "OTS_SM1_x8" "OTS_SM1_x9" "OTS_SM1_x9" "OTS_SM1_x9"
"OTS_SM1_x9" "OTS_SM2_x1" "OTS_SM2_x2" "OTS_SM2_x3" "OTS_SM2_x4"
"OTS_SM2_x5" "OTS_SM2_x6" "OTS_SM2_x7" "OTS_SM2_x8" "OTS_SM2_x9"
例如,对于数据帧 DF 中的 name1
变量:
OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 0 0 0 0 0 0 0 0 None of the above
2 Facebook 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 None of the above
4 Facebook Instagram Twitter Snapchat Pinterest 0 Tik Tok 0 0
5 0 0 0 0 0 LinkedIn 0 0 0
6 Facebook 0 0 0 Pinterest 0 0 0 0
7 Facebook Instagram 0 0 0 0 0 0 0
8 Facebook Instagram 0 0 Pinterest 0 Tik Tok 0 0
9 NA NA NA NA NA NA NA NA NA
10 NA NA NA NA NA NA NA NA NA
然后我将合并同名列索引:
# For every distinct group label, paste the matching columns of DF together
# row by row (";"-separated), strip the "NA" and "0" placeholders, and append
# the result to DF as a new column.
unique_names1 <- unique(names1)
# seq_along() yields an empty sequence when there are no groups, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(unique_names1)) {
  # drop = FALSE keeps a data frame even if only one column matches; without
  # it a plain data.frame collapses to a vector and paste() would receive each
  # element as a separate argument.
  combine1 <- DF[, grep(unique_names1[i], colnames(DF)), drop = FALSE]
  NewCol1 <- do.call(paste, c(combine1, sep = ";"))
  # Remove placeholder entries together with their adjoining separator. A row
  # consisting only of NA (or only of 0) keeps a single "NA" (or "0").
  NewCol1 <- str_remove_all(NewCol1, ";NA")
  NewCol1 <- str_remove_all(NewCol1, "NA;")
  NewCol1 <- str_remove_all(NewCol1, ";0")
  NewCol1 <- str_remove_all(NewCol1, "0;")
  DF <- cbind(DF, NewCol1)
}
NewCol1
[1] "None of the above" "Facebook"
[3] "None of the above" "Facebook;Instagram;Twitter;Snapchat;Pinterest;Tik Tok"
[5] "LinkedIn" "Facebook;Pinterest"
[7] "Facebook;Instagram" "Facebook;Instagram;Pinterest;Tik Tok"
[9] "NA" "NA"
然后使用一些更有趣的索引将其重命名为“OTS_SM0”。以及删除原始列。
iris
数据集非常接近您的问题:
head(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
我们可以在 tidyverse 中很容易地做到这一点。首先,我们转换为长格式,以便名称易于使用。然后,我们将名称截断为要用于组合列的核心。然后我们用新的、数量更有限的列将它展开回宽格式。
还需要一个步骤:pivot_wider
会保证每一行都是唯一的,因此新的“Sepal”(萼片)和“Petal”(花瓣)列中,每个物种对应的是一个由多个值组成的列表。由于您需要的是逐行的结果(只是把各列的值拼接起来),我们使用 unnest
将这些列表列展开成多行:
library(tidyverse)
# Reshape to long form, cut each column name at its first "." so that e.g.
# Sepal.Length and Sepal.Width share the key "Sepal", spread back to wide form
# (one list column per key) and expand the list columns into rows.
iris %>%
  pivot_longer(-Species) %>%
  # The literal dot needs a doubled backslash: "\." is not a valid escape in
  # an R string, so the regex \. has to be written as '\\.'.
  mutate(name = gsub('\\..*', '', name)) %>%
  pivot_wider(names_from = 'name', values_from = 'value', values_fn = list) %>%
  unnest(cols = c('Sepal', 'Petal'))
# A tibble: 300 × 3
Species Sepal Petal
<fct> <dbl> <dbl>
1 setosa 5.1 1.4
2 setosa 3.5 0.2
3 setosa 4.9 1.4
4 setosa 3 0.2
5 setosa 4.7 1.3
6 setosa 3.2 0.2
7 setosa 4.6 1.5
8 setosa 3.1 0.2
9 setosa 5 1.4
10 setosa 3.6 0.2
# … with 290 more rows
如果您真的不想在 unnest
中指定列名,您可以使用 unnest(cols = colnames(.))
将其应用于所有列名,或者只是省略 cols
参数(尽管这会发出警告,并且将来可能无法使用)。
假设您的数据框 DF
看起来像这样
OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9 OTS_SM1_x1_4 OTS_SM1_x1_6 OTS_SM1_x1_7 OTS_SM1_x1_8 OTS_SM1_x2_4 OTS_SM1_x2_6 OTS_SM1_x2_7 OTS_SM1_x2_8 OTS_SM1_x3_4 OTS_SM1_x3_6 OTS_SM1_x3_7 OTS_SM1_x3_8 OTS_SM1_x4_4 OTS_SM1_x4_6 OTS_SM1_x4_7 OTS_SM1_x4_8 OTS_SM1_x5_4 OTS_SM1_x5_6 OTS_SM1_x5_7 OTS_SM1_x5_8 OTS_SM1_x6_4 OTS_SM1_x6_6 OTS_SM1_x6_7 OTS_SM1_x6_8 OTS_SM1_x7_4 OTS_SM1_x7_6 OTS_SM1_x7_7 OTS_SM1_x7_8 OTS_SM1_x8_4 OTS_SM1_x8_6 OTS_SM1_x8_7 OTS_SM1_x8_8 OTS_SM1_x9_4 OTS_SM1_x9_6 OTS_SM1_x9_7 OTS_SM1_x9_8 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
2 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108
这是一个dplyr
方法。我们使用 id
来保留行关系。在第一次 pivot(转换为长格式)之后,我们只删除最后一个 "_"
及其后面的字符,前提是这些字符全部是数字。最后,我们将每组 id 和变量(在本例中为 name
)的行汇总到一个单元格中,并将数据帧从长转换为宽。
library(dplyr)
library(tidyr)
# Concatenate, per original row, the values of all columns whose names are
# equal once an optional trailing "_<digits>" suffix has been removed.
DF %>%
  # id records which input row each value came from
  mutate(id = row_number()) %>%
  pivot_longer(-id) %>%
  # "\\d" is the regex digit class; the backslash must be doubled inside an R
  # string ("\d" alone is an unrecognized escape and does not parse).
  group_by(id, name = sub("(_\\d+)?$", "", name)) %>%
  summarize(value = paste0(value, collapse = ";"), .groups = "drop") %>%
  pivot_wider() %>%
  select(-id)
输出(我使用不同的打印方法向您显示所有列。默认打印方法可能会在您的屏幕上呈现不同的内容,但底层对象是相同的。)
OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6 OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1;2;3;4;5;6;7;8;9 10;11;12;13 14;15;16;17 18;19;20;21 22;23;24;25 26;27;28;29 30;31;32;33 34;35;36;37 38;39;40;41 42;43;44;45 46 47 48 49 50 51 52 53 54
2 2;4;6;8;10;12;14;16;18 20;22;24;26 28;30;32;34 36;38;40;42 44;46;48;50 52;54;56;58 60;62;64;66 68;70;72;74 76;78;80;82 84;86;88;90 92 94 96 98 100 102 104 106 108
但是,如果您想对变量进行一些分析,将值嵌套在单个单元格中通常没有帮助。也许您还想考虑稍微不同的数据表示。
library(dplyr)
library(tidyr)
# Stack the columns of each group on top of each other instead of pasting
# them into one cell: one output column per group, several rows per id.
DF %>%
  # Insert a backtick in front of the optional "_<digits>" suffix so the name
  # can be split unambiguously below. Backslashes are doubled because "\d" is
  # not a valid R string escape and "\1" would be read as a control character
  # rather than the backreference to the first capture group.
  rename_with(~sub("(_\\d+)?$", "`\\1", .)) %>%
  mutate(id = row_number()) %>%
  # ".value" turns the part before the backtick into the output column name;
  # NA discards the numeric suffix captured by the second group.
  pivot_longer(-id, names_to = c(".value", NA), names_pattern = "(.+)`(_\\d+)?")
结果如下:
id OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6 OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1 1 10 14 18 22 26 30 34 38 42 46 47 48 49 50 51 52 53 54
2 1 2 11 15 19 23 27 31 35 39 43 NA NA NA NA NA NA NA NA NA
3 1 3 12 16 20 24 28 32 36 40 44 NA NA NA NA NA NA NA NA NA
4 1 4 13 17 21 25 29 33 37 41 45 NA NA NA NA NA NA NA NA NA
5 1 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
6 1 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
7 1 7 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
8 1 8 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
9 1 9 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
10 2 2 20 28 36 44 52 60 68 76 84 92 94 96 98 100 102 104 106 108
11 2 4 22 30 38 46 54 62 70 78 86 NA NA NA NA NA NA NA NA NA
12 2 6 24 32 40 48 56 64 72 80 88 NA NA NA NA NA NA NA NA NA
13 2 8 26 34 42 50 58 66 74 82 90 NA NA NA NA NA NA NA NA NA
14 2 10 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
15 2 12 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
16 2 14 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
17 2 16 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
18 2 18 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
数据
structure(list(OTS_SM0_1 = c(1, 2), OTS_SM0_2 = c(2, 4), OTS_SM0_3 = c(3,
6), OTS_SM0_4 = c(4, 8), OTS_SM0_5 = c(5, 10), OTS_SM0_6 = c(6,
12), OTS_SM0_7 = c(7, 14), OTS_SM0_8 = c(8, 16), OTS_SM0_9 = c(9,
18), OTS_SM1_x1_4 = c(10, 20), OTS_SM1_x1_6 = c(11, 22), OTS_SM1_x1_7 = c(12,
24), OTS_SM1_x1_8 = c(13, 26), OTS_SM1_x2_4 = c(14, 28), OTS_SM1_x2_6 = c(15,
30), OTS_SM1_x2_7 = c(16, 32), OTS_SM1_x2_8 = c(17, 34), OTS_SM1_x3_4 = c(18,
36), OTS_SM1_x3_6 = c(19, 38), OTS_SM1_x3_7 = c(20, 40), OTS_SM1_x3_8 = c(21,
42), OTS_SM1_x4_4 = c(22, 44), OTS_SM1_x4_6 = c(23, 46), OTS_SM1_x4_7 = c(24,
48), OTS_SM1_x4_8 = c(25, 50), OTS_SM1_x5_4 = c(26, 52), OTS_SM1_x5_6 = c(27,
54), OTS_SM1_x5_7 = c(28, 56), OTS_SM1_x5_8 = c(29, 58), OTS_SM1_x6_4 = c(30,
60), OTS_SM1_x6_6 = c(31, 62), OTS_SM1_x6_7 = c(32, 64), OTS_SM1_x6_8 = c(33,
66), OTS_SM1_x7_4 = c(34, 68), OTS_SM1_x7_6 = c(35, 70), OTS_SM1_x7_7 = c(36,
72), OTS_SM1_x7_8 = c(37, 74), OTS_SM1_x8_4 = c(38, 76), OTS_SM1_x8_6 = c(39,
78), OTS_SM1_x8_7 = c(40, 80), OTS_SM1_x8_8 = c(41, 82), OTS_SM1_x9_4 = c(42,
84), OTS_SM1_x9_6 = c(43, 86), OTS_SM1_x9_7 = c(44, 88), OTS_SM1_x9_8 = c(45,
90), OTS_SM2_x1 = c(46, 92), OTS_SM2_x2 = c(47, 94), OTS_SM2_x3 = c(48,
96), OTS_SM2_x4 = c(49, 98), OTS_SM2_x5 = c(50, 100), OTS_SM2_x6 = c(51,
102), OTS_SM2_x7 = c(52, 104), OTS_SM2_x8 = c(53, 106), OTS_SM2_x9 = c(54,
108)), row.names = c(NA, -2L), class = "data.frame")
我希望有一种更简单/更快/更简洁的方式来完成我想做的事,因为我目前的做法太复杂了:
我的列名是:
"OTS_SM0_1","OTS_SM0_2","OTS_SM0_3","OTS_SM0_4","OTS_SM0_5","OTS_SM0_6",
"OTS_SM0_7","OTS_SM0_8","OTS_SM0_9",
"OTS_SM1_x1_4","OTS_SM1_x1_6","OTS_SM1_x1_7","OTS_SM1_x1_8",
"OTS_SM1_x2_4","OTS_SM1_x2_6","OTS_SM1_x2_7","OTS_SM1_x2_8",
"OTS_SM1_x3_4","OTS_SM1_x3_6","OTS_SM1_x3_7","OTS_SM1_x3_8",
"OTS_SM1_x4_4","OTS_SM1_x4_6","OTS_SM1_x4_7","OTS_SM1_x4_8",
"OTS_SM1_x5_4","OTS_SM1_x5_6","OTS_SM1_x5_7","OTS_SM1_x5_8",
"OTS_SM1_x6_4","OTS_SM1_x6_6","OTS_SM1_x6_7","OTS_SM1_x6_8",
"OTS_SM1_x7_4","OTS_SM1_x7_6","OTS_SM1_x7_7","OTS_SM1_x7_8",
"OTS_SM1_x8_4","OTS_SM1_x8_6","OTS_SM1_x8_7","OTS_SM1_x8_8",
"OTS_SM1_x9_4","OTS_SM1_x9_6","OTS_SM1_x9_7","OTS_SM1_x9_8",
"OTS_SM2_x1","OTS_SM2_x2","OTS_SM2_x3","OTS_SM2_x4","OTS_SM2_x5",
"OTS_SM2_x6","OTS_SM2_x7","OTS_SM2_x8","OTS_SM2_x9"
我需要根据名称将他们的条目连接成一列。这些是需要合并的名称:
OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6
OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4
OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
但是要注意的是,这些名称并不总是相同的,只是 OTS_SM 部分将保持不变,要组合的列数以及它们在数据框中的索引将发生变化。
我目前的解决方案是:
# Column names that have to be grouped by their shared prefix.
columnnames <- c(
  "OTS_SM0_1", "OTS_SM0_2", "OTS_SM0_3", "OTS_SM0_4", "OTS_SM0_5",
  "OTS_SM0_6", "OTS_SM0_7", "OTS_SM0_8", "OTS_SM0_9",
  "OTS_SM1_x1_4", "OTS_SM1_x1_6", "OTS_SM1_x1_7", "OTS_SM1_x1_8",
  "OTS_SM1_x2_4", "OTS_SM1_x2_6", "OTS_SM1_x2_7", "OTS_SM1_x2_8",
  "OTS_SM1_x3_4", "OTS_SM1_x3_6", "OTS_SM1_x3_7", "OTS_SM1_x3_8",
  "OTS_SM1_x4_4", "OTS_SM1_x4_6", "OTS_SM1_x4_7", "OTS_SM1_x4_8",
  "OTS_SM1_x5_4", "OTS_SM1_x5_6", "OTS_SM1_x5_7", "OTS_SM1_x5_8",
  "OTS_SM1_x6_4", "OTS_SM1_x6_6", "OTS_SM1_x6_7", "OTS_SM1_x6_8",
  "OTS_SM1_x7_4", "OTS_SM1_x7_6", "OTS_SM1_x7_7", "OTS_SM1_x7_8",
  "OTS_SM1_x8_4", "OTS_SM1_x8_6", "OTS_SM1_x8_7", "OTS_SM1_x8_8",
  "OTS_SM1_x9_4", "OTS_SM1_x9_6", "OTS_SM1_x9_7", "OTS_SM1_x9_8",
  "OTS_SM2_x1", "OTS_SM2_x2", "OTS_SM2_x3", "OTS_SM2_x4", "OTS_SM2_x5",
  "OTS_SM2_x6", "OTS_SM2_x7", "OTS_SM2_x8", "OTS_SM2_x9"
)

# Names that contain "OTS_SM" but no "x" anywhere (negative lookahead needs
# perl = TRUE); their group label is the first 7 characters.
names1_index <- which(grepl("^(?!.*x).*OTS_SM", columnnames, perl = TRUE))
names1 <- substr(columnnames[names1_index], 1, 7)

# Names with an "_x" part somewhere after "OTS_SM"; their group label is the
# first 10 characters.
names2_index <- which(grepl("OTS_SM.*_x", columnnames))
names2 <- substr(columnnames[names2_index], 1, 10)
输出如下:
> names1
"OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0"
"OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0"
> names2
"OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x2"
"OTS_SM1_x2" "OTS_SM1_x2" "OTS_SM1_x2" "OTS_SM1_x3" "OTS_SM1_x3"
"OTS_SM1_x3" "OTS_SM1_x3" "OTS_SM1_x4" "OTS_SM1_x4" "OTS_SM1_x4"
"OTS_SM1_x4" "OTS_SM1_x5" "OTS_SM1_x5" "OTS_SM1_x5" "OTS_SM1_x5"
"OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x7"
"OTS_SM1_x7" "OTS_SM1_x7" "OTS_SM1_x7" "OTS_SM1_x8" "OTS_SM1_x8"
"OTS_SM1_x8" "OTS_SM1_x8" "OTS_SM1_x9" "OTS_SM1_x9" "OTS_SM1_x9"
"OTS_SM1_x9" "OTS_SM2_x1" "OTS_SM2_x2" "OTS_SM2_x3" "OTS_SM2_x4"
"OTS_SM2_x5" "OTS_SM2_x6" "OTS_SM2_x7" "OTS_SM2_x8" "OTS_SM2_x9"
例如,对于数据帧 DF 中的 name1
变量:
OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 0 0 0 0 0 0 0 0 None of the above
2 Facebook 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 None of the above
4 Facebook Instagram Twitter Snapchat Pinterest 0 Tik Tok 0 0
5 0 0 0 0 0 LinkedIn 0 0 0
6 Facebook 0 0 0 Pinterest 0 0 0 0
7 Facebook Instagram 0 0 0 0 0 0 0
8 Facebook Instagram 0 0 Pinterest 0 Tik Tok 0 0
9 NA NA NA NA NA NA NA NA NA
10 NA NA NA NA NA NA NA NA NA
然后我将合并同名列索引:
# For every distinct group label, paste the matching columns of DF together
# row by row (";"-separated), strip the "NA" and "0" placeholders, and append
# the result to DF as a new column.
unique_names1 <- unique(names1)
# seq_along() yields an empty sequence when there are no groups, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(unique_names1)) {
  # drop = FALSE keeps a data frame even if only one column matches; without
  # it a plain data.frame collapses to a vector and paste() would receive each
  # element as a separate argument.
  combine1 <- DF[, grep(unique_names1[i], colnames(DF)), drop = FALSE]
  NewCol1 <- do.call(paste, c(combine1, sep = ";"))
  # Remove placeholder entries together with their adjoining separator. A row
  # consisting only of NA (or only of 0) keeps a single "NA" (or "0").
  NewCol1 <- str_remove_all(NewCol1, ";NA")
  NewCol1 <- str_remove_all(NewCol1, "NA;")
  NewCol1 <- str_remove_all(NewCol1, ";0")
  NewCol1 <- str_remove_all(NewCol1, "0;")
  DF <- cbind(DF, NewCol1)
}
NewCol1
[1] "None of the above" "Facebook"
[3] "None of the above" "Facebook;Instagram;Twitter;Snapchat;Pinterest;Tik Tok"
[5] "LinkedIn" "Facebook;Pinterest"
[7] "Facebook;Instagram" "Facebook;Instagram;Pinterest;Tik Tok"
[9] "NA" "NA"
然后使用一些更有趣的索引将其重命名为“OTS_SM0”。以及删除原始列。
iris
数据集非常接近您的问题:
head(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
我们可以在 tidyverse 中很容易地做到这一点。首先,我们转换为长格式,以便名称易于使用。然后,我们将名称截断为要用于组合列的核心。然后我们用新的、数量更有限的列将它展开回宽格式。
还需要一个步骤:pivot_wider
会保证每一行都是唯一的,因此新的“Sepal”(萼片)和“Petal”(花瓣)列中,每个物种对应的是一个由多个值组成的列表。由于您需要的是逐行的结果(只是把各列的值拼接起来),我们使用 unnest
将这些列表列展开成多行:
library(tidyverse)
# Reshape to long form, cut each column name at its first "." so that e.g.
# Sepal.Length and Sepal.Width share the key "Sepal", spread back to wide form
# (one list column per key) and expand the list columns into rows.
iris %>%
  pivot_longer(-Species) %>%
  # The literal dot needs a doubled backslash: "\." is not a valid escape in
  # an R string, so the regex \. has to be written as '\\.'.
  mutate(name = gsub('\\..*', '', name)) %>%
  pivot_wider(names_from = 'name', values_from = 'value', values_fn = list) %>%
  unnest(cols = c('Sepal', 'Petal'))
# A tibble: 300 × 3
Species Sepal Petal
<fct> <dbl> <dbl>
1 setosa 5.1 1.4
2 setosa 3.5 0.2
3 setosa 4.9 1.4
4 setosa 3 0.2
5 setosa 4.7 1.3
6 setosa 3.2 0.2
7 setosa 4.6 1.5
8 setosa 3.1 0.2
9 setosa 5 1.4
10 setosa 3.6 0.2
# … with 290 more rows
如果您真的不想在 unnest
中指定列名,您可以使用 unnest(cols = colnames(.))
将其应用于所有列名,或者只是省略 cols
参数(尽管这会发出警告,并且将来可能无法使用)。
假设您的数据框 DF
看起来像这样
OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9 OTS_SM1_x1_4 OTS_SM1_x1_6 OTS_SM1_x1_7 OTS_SM1_x1_8 OTS_SM1_x2_4 OTS_SM1_x2_6 OTS_SM1_x2_7 OTS_SM1_x2_8 OTS_SM1_x3_4 OTS_SM1_x3_6 OTS_SM1_x3_7 OTS_SM1_x3_8 OTS_SM1_x4_4 OTS_SM1_x4_6 OTS_SM1_x4_7 OTS_SM1_x4_8 OTS_SM1_x5_4 OTS_SM1_x5_6 OTS_SM1_x5_7 OTS_SM1_x5_8 OTS_SM1_x6_4 OTS_SM1_x6_6 OTS_SM1_x6_7 OTS_SM1_x6_8 OTS_SM1_x7_4 OTS_SM1_x7_6 OTS_SM1_x7_7 OTS_SM1_x7_8 OTS_SM1_x8_4 OTS_SM1_x8_6 OTS_SM1_x8_7 OTS_SM1_x8_8 OTS_SM1_x9_4 OTS_SM1_x9_6 OTS_SM1_x9_7 OTS_SM1_x9_8 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
2 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108
这是一个dplyr
方法。我们使用 id
来保留行关系。在第一次 pivot(转换为长格式)之后,我们只删除最后一个 "_"
及其后面的字符,前提是这些字符全部是数字。最后,我们将每组 id 和变量(在本例中为 name
)的行汇总到一个单元格中,并将数据帧从长转换为宽。
library(dplyr)
library(tidyr)
# Concatenate, per original row, the values of all columns whose names are
# equal once an optional trailing "_<digits>" suffix has been removed.
DF %>%
  # id records which input row each value came from
  mutate(id = row_number()) %>%
  pivot_longer(-id) %>%
  # "\\d" is the regex digit class; the backslash must be doubled inside an R
  # string ("\d" alone is an unrecognized escape and does not parse).
  group_by(id, name = sub("(_\\d+)?$", "", name)) %>%
  summarize(value = paste0(value, collapse = ";"), .groups = "drop") %>%
  pivot_wider() %>%
  select(-id)
输出(我使用不同的打印方法向您显示所有列。默认打印方法可能会在您的屏幕上呈现不同的内容,但底层对象是相同的。)
OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6 OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1;2;3;4;5;6;7;8;9 10;11;12;13 14;15;16;17 18;19;20;21 22;23;24;25 26;27;28;29 30;31;32;33 34;35;36;37 38;39;40;41 42;43;44;45 46 47 48 49 50 51 52 53 54
2 2;4;6;8;10;12;14;16;18 20;22;24;26 28;30;32;34 36;38;40;42 44;46;48;50 52;54;56;58 60;62;64;66 68;70;72;74 76;78;80;82 84;86;88;90 92 94 96 98 100 102 104 106 108
但是,如果您想对变量进行一些分析,将值嵌套在单个单元格中通常没有帮助。也许您还想考虑稍微不同的数据表示。
library(dplyr)
library(tidyr)
# Stack the columns of each group on top of each other instead of pasting
# them into one cell: one output column per group, several rows per id.
DF %>%
  # Insert a backtick in front of the optional "_<digits>" suffix so the name
  # can be split unambiguously below. Backslashes are doubled because "\d" is
  # not a valid R string escape and "\1" would be read as a control character
  # rather than the backreference to the first capture group.
  rename_with(~sub("(_\\d+)?$", "`\\1", .)) %>%
  mutate(id = row_number()) %>%
  # ".value" turns the part before the backtick into the output column name;
  # NA discards the numeric suffix captured by the second group.
  pivot_longer(-id, names_to = c(".value", NA), names_pattern = "(.+)`(_\\d+)?")
结果如下:
id OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6 OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1 1 10 14 18 22 26 30 34 38 42 46 47 48 49 50 51 52 53 54
2 1 2 11 15 19 23 27 31 35 39 43 NA NA NA NA NA NA NA NA NA
3 1 3 12 16 20 24 28 32 36 40 44 NA NA NA NA NA NA NA NA NA
4 1 4 13 17 21 25 29 33 37 41 45 NA NA NA NA NA NA NA NA NA
5 1 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
6 1 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
7 1 7 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
8 1 8 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
9 1 9 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
10 2 2 20 28 36 44 52 60 68 76 84 92 94 96 98 100 102 104 106 108
11 2 4 22 30 38 46 54 62 70 78 86 NA NA NA NA NA NA NA NA NA
12 2 6 24 32 40 48 56 64 72 80 88 NA NA NA NA NA NA NA NA NA
13 2 8 26 34 42 50 58 66 74 82 90 NA NA NA NA NA NA NA NA NA
14 2 10 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
15 2 12 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
16 2 14 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
17 2 16 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
18 2 18 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
数据
structure(list(OTS_SM0_1 = c(1, 2), OTS_SM0_2 = c(2, 4), OTS_SM0_3 = c(3,
6), OTS_SM0_4 = c(4, 8), OTS_SM0_5 = c(5, 10), OTS_SM0_6 = c(6,
12), OTS_SM0_7 = c(7, 14), OTS_SM0_8 = c(8, 16), OTS_SM0_9 = c(9,
18), OTS_SM1_x1_4 = c(10, 20), OTS_SM1_x1_6 = c(11, 22), OTS_SM1_x1_7 = c(12,
24), OTS_SM1_x1_8 = c(13, 26), OTS_SM1_x2_4 = c(14, 28), OTS_SM1_x2_6 = c(15,
30), OTS_SM1_x2_7 = c(16, 32), OTS_SM1_x2_8 = c(17, 34), OTS_SM1_x3_4 = c(18,
36), OTS_SM1_x3_6 = c(19, 38), OTS_SM1_x3_7 = c(20, 40), OTS_SM1_x3_8 = c(21,
42), OTS_SM1_x4_4 = c(22, 44), OTS_SM1_x4_6 = c(23, 46), OTS_SM1_x4_7 = c(24,
48), OTS_SM1_x4_8 = c(25, 50), OTS_SM1_x5_4 = c(26, 52), OTS_SM1_x5_6 = c(27,
54), OTS_SM1_x5_7 = c(28, 56), OTS_SM1_x5_8 = c(29, 58), OTS_SM1_x6_4 = c(30,
60), OTS_SM1_x6_6 = c(31, 62), OTS_SM1_x6_7 = c(32, 64), OTS_SM1_x6_8 = c(33,
66), OTS_SM1_x7_4 = c(34, 68), OTS_SM1_x7_6 = c(35, 70), OTS_SM1_x7_7 = c(36,
72), OTS_SM1_x7_8 = c(37, 74), OTS_SM1_x8_4 = c(38, 76), OTS_SM1_x8_6 = c(39,
78), OTS_SM1_x8_7 = c(40, 80), OTS_SM1_x8_8 = c(41, 82), OTS_SM1_x9_4 = c(42,
84), OTS_SM1_x9_6 = c(43, 86), OTS_SM1_x9_7 = c(44, 88), OTS_SM1_x9_8 = c(45,
90), OTS_SM2_x1 = c(46, 92), OTS_SM2_x2 = c(47, 94), OTS_SM2_x3 = c(48,
96), OTS_SM2_x4 = c(49, 98), OTS_SM2_x5 = c(50, 100), OTS_SM2_x6 = c(51,
102), OTS_SM2_x7 = c(52, 104), OTS_SM2_x8 = c(53, 106), OTS_SM2_x9 = c(54,
108)), row.names = c(NA, -2L), class = "data.frame")