在 R 中根据关键字表对数据进行分组的最佳方法是什么
What would be the best approach for grouping data according to a table of keywords in R
我有以下用于分组数据的字典
1. [aa11, aa21, aa31, aa34], "group A"
2. [x23z, x22z, x32z, x35z, x34z],"group B"
3. [lg32z, lg22z, lg84x, lg94y], "group C"
4. ...
数据中的列本身也可能有多个代码。我想要的是使用上面的字典并将组分配给数据:
1. [aa31, aa34], "group A"
2. [lg94z], "group C"
3. [lg84x], "group C"
4. [x22z, x23z] "group B"
将记录分配给第一个 table 中描述的特定组的最佳方法是什么。
到目前为止我的想法:
是否应该首先对数据进行重塑(标记化),使“字典”table 和处理后的 table 中的每条记录都有一个代码?
有两种情况:
- 您的代码变量可能包含多个代码,但都来自同一组。
- 您的代码变量可能包含多个代码,但它们可能来自不同的组。
两种解决方案的查找输入
library(tidyverse)
library(tidytext)
# Dictionary: one row per group; `code` holds that group's codes as a single
# bracketed, comma-separated string.
lookup <- data.frame(
  code = c(
    "[aa11, aa21, aa31, aa34]",
    "[x23z, x22z, x32z, x35z, x34z]",
    "[lg32z, lg22z, lg84x, lg94y]"
  ),
  group = c("group A", "group B", "group C")
)

# Long form of the dictionary: `code` is tokenised into single words, giving
# one row per individual code (column `words`) together with its `group`.
lookup_long <- unnest_tokens(
  lookup,
  output = words,
  input = code,
  token = "words"
)
情况 1 的解决方案
您有一个好主意,首先将查找和输入数据标记化,然后根据单词标记进行匹配。如果您的代码变量中有多个代码,我假设您只想保留一次信息,即只有一列包含组信息。
# Example records for case 1: a record may list several codes, but they all
# belong to the same group.
df_1 <- data.frame(
  code = c("[aa31, aa34]", "[lg94y]", "[lg84x]", "[x22z, x23z]"),
  id = 1:4
)

# Split each record into single codes (keeping the original `code` string),
# look every code up in `lookup_long`, keep one row per record and group, and
# return only the original code string with its group.
df_1 %>%
  unnest_tokens(
    output = code_new,
    input = code,
    token = "words",
    drop = FALSE
  ) %>%
  left_join(lookup_long, by = c(code_new = "words")) %>%
  group_by(id) %>%
  distinct(group, .keep_all = TRUE) %>%
  ungroup() %>%
  select(code, group)
给出:
# A tibble: 4 x 2
code group
<chr> <chr>
1 [aa31, aa34] group A
2 [lg94y] group C
3 [lg84x] group C
4 [x22z, x23z] group B
情况 2 的解决方案
大体思路是一样的,只是为了存储所有代码的组信息,需要稍微改造一下。
# Example records for case 2: the last record mixes codes from two groups.
df_2 <- data.frame(
  code = c(
    "[aa31, aa34]", "[lg94y]", "[lg84x]", "[x22z, x23z]", "[x22z, aa11]"
  ),
  id = 1:5
)

# Split each record into single codes, look every code up in `lookup_long`,
# keep each group once per record, and spread the groups of a record over the
# columns group_1, group_2, ...
df_2 %>%
  unnest_tokens(code_new, code, token = "words", drop = FALSE) %>%
  left_join(lookup_long, by = c("code_new" = "words")) %>%
  group_by(id) %>%
  distinct(group, .keep_all = TRUE) %>%
  # Position of each distinct group within its record; becomes the column
  # suffix after pivoting.
  mutate(id_wide = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    # Identify rows by record id as well as by code string: two records with
    # the same code string would otherwise be merged into one row and produce
    # list-columns.
    id_cols = c(id, code),
    names_from = id_wide,
    names_prefix = "group_",
    values_from = group
  ) %>%
  # `id` was only needed to keep records apart while pivoting.
  select(-id)
给出:
# A tibble: 5 x 3
code group_1 group_2
<chr> <chr> <chr>
1 [aa31, aa34] group A <NA>
2 [lg94y] group C <NA>
3 [lg84x] group C <NA>
4 [x22z, x23z] group B <NA>
5 [x22z, aa11] group B group A
标记化可能是最好的方法。您可以像这样手动构建查找表:
# Build a long lookup table `d` (one row per code, with its group) from the
# bracketed strings in `lookup$pattern`, using base R only.

# Remove "]", "[" and spaces in one pass; "[][ ]" is a bracket expression
# matching exactly those three characters.
codes_flat <- gsub("[][ ]", "", lookup$pattern)

# One character vector of codes per dictionary row.
codes_by_group <- strsplit(codes_flat, ",", fixed = TRUE)

# Repeat each group label once per code of that group. Building the data frame
# directly avoids an intermediate named `c` (which would mask base::c) and
# also copes with a dictionary row that has no codes.
d <- data.frame(
  value = unlist(codes_by_group),
  group = rep(lookup$group, lengths(codes_by_group))
)
d
#> value group
#> 1 aa11 group A
#> 2 aa21 group A
#> 3 aa31 group A
#> 4 aa34 group A
#> 5 x23z group B
#> 6 x22z group B
#> 7 x32z group B
#> 8 x35z group B
#> 9 x34z group B
#> 10 lg32z group C
#> 11 lg22z group C
#> 12 lg84x group C
#> 13 lg94y group C
或者您可以跳过标记化,直接进行“模糊连接”:
# Turn each bracketed code list into a regex alternation, e.g.
# "[aa11, aa21]" becomes "aa11|aa21", and store it next to the dictionary
# as column `a`.
a <- gsub("[][]", "", lookup$pattern)  # drop both square brackets
a <- gsub(", ", "|", a, fixed = TRUE)  # separators become regex "or"
lookup2 <- data.frame(lookup, a = a, check.names = FALSE)
lookup2
#> pattern group a
#> 1 [aa11, aa21, aa31, aa34] group A aa11|aa21|aa31|aa34
#> 2 [x23z, x22z, x32z, x35z, x34z] group B x23z|x22z|x32z|x35z|x34z
#> 3 [lg32z, lg22z, lg84x, lg94y] group C lg32z|lg22z|lg84x|lg94y
# Assign a group to every record in `df` by matching its code string (`V1`)
# against each group's alternation pattern (`lookup2$a`); the result goes
# into the new column `V3`.

# Start from NA so that unmatched records are explicit and re-running the
# loop never keeps stale values.
df$V3 <- NA_character_

# seq_len() yields an empty sequence for an empty lookup, unlike 1:nrow().
for (i in seq_len(nrow(lookup2))) {
  # \b anchors stop a code from matching inside a longer code
  # (e.g. "aa11" inside "aa111").
  hit <- grepl(paste0("\\b(", lookup2$a[i], ")\\b"), df$V1)
  # A record matching several groups ends up with the last matching group.
  df[hit, "V3"] <- as.character(lookup2$group[i])
}
df
#> V1 V2 V3
#> 1 [aa31, aa34] group A group A
#> 2 [lg94z] group C <NA>
#> 3 [lg84x] group C group C
#> 4 [x22z, x23z] group B group B
Created on 2021-09-22 by the reprex package (v2.0.1)
数据:
# Dictionary: each row pairs a bracketed, comma-separated code list
# (`pattern`) with the label of its group.
lookup <- data.frame(
  pattern = c(
    "[aa11, aa21, aa31, aa34]",
    "[x23z, x22z, x32z, x35z, x34z]",
    "[lg32z, lg22z, lg84x, lg94y]"
  ),
  group = c("group A", "group B", "group C")
)

# Records to classify (`V1`) with the group given for them in the question
# (`V2`). "[lg94z]" is kept exactly as given; the dictionary lists "lg94y".
df <- data.frame(
  V1 = c("[aa31, aa34]", "[lg94z]", "[lg84x]", "[x22z, x23z]"),
  V2 = c("group A", "group C", "group C", "group B")
)
请注意,您输入的数据中有一处拼写错误(数据中是 lg94z,而字典中是 lg94y),因此 df[2, "V3"] 返回 NA。我很确定这个循环可以向量化,只是暂时还没想出怎么做。如有进展我会更新。
我有以下用于分组数据的字典
1. [aa11, aa21, aa31, aa34], "group A"
2. [x23z, x22z, x32z, x35z, x34z],"group B"
3. [lg32z, lg22z, lg84x, lg94y], "group C"
4. ...
数据中的列本身也可能有多个代码。我想要的是使用上面的字典并将组分配给数据:
1. [aa31, aa34], "group A"
2. [lg94z], "group C"
3. [lg84x], "group C"
4. [x22z, x23z] "group B"
将记录分配给第一个 table 中描述的特定组的最佳方法是什么。
到目前为止我的想法: 是否应该首先对数据进行重塑(标记化),使“字典”table 和处理后的 table 中的每条记录都有一个代码?
有两种情况:
- 您的代码变量可能包含多个代码,但都来自同一组。
- 您的代码变量可能包含多个代码,但它们可能来自不同的组。
两种解决方案的查找输入
library(tidyverse)
library(tidytext)
# Dictionary: one row per group; `code` holds that group's codes as a single
# bracketed, comma-separated string.
lookup <- data.frame(
  code = c(
    "[aa11, aa21, aa31, aa34]",
    "[x23z, x22z, x32z, x35z, x34z]",
    "[lg32z, lg22z, lg84x, lg94y]"
  ),
  group = c("group A", "group B", "group C")
)

# Long form of the dictionary: `code` is tokenised into single words, giving
# one row per individual code (column `words`) together with its `group`.
lookup_long <- unnest_tokens(
  lookup,
  output = words,
  input = code,
  token = "words"
)
情况 1 的解决方案
您有一个好主意,首先将查找和输入数据标记化,然后根据单词标记进行匹配。如果您的代码变量中有多个代码,我假设您只想保留一次信息,即只有一列包含组信息。
# Example records for case 1: a record may list several codes, but they all
# belong to the same group.
df_1 <- data.frame(
  code = c("[aa31, aa34]", "[lg94y]", "[lg84x]", "[x22z, x23z]"),
  id = 1:4
)

# Split each record into single codes (keeping the original `code` string),
# look every code up in `lookup_long`, keep one row per record and group, and
# return only the original code string with its group.
df_1 %>%
  unnest_tokens(
    output = code_new,
    input = code,
    token = "words",
    drop = FALSE
  ) %>%
  left_join(lookup_long, by = c(code_new = "words")) %>%
  group_by(id) %>%
  distinct(group, .keep_all = TRUE) %>%
  ungroup() %>%
  select(code, group)
给出:
# A tibble: 4 x 2
code group
<chr> <chr>
1 [aa31, aa34] group A
2 [lg94y] group C
3 [lg84x] group C
4 [x22z, x23z] group B
情况 2 的解决方案
大体思路是一样的,只是为了存储所有代码的组信息,需要稍微改造一下。
# Example records for case 2: the last record mixes codes from two groups.
df_2 <- data.frame(
  code = c(
    "[aa31, aa34]", "[lg94y]", "[lg84x]", "[x22z, x23z]", "[x22z, aa11]"
  ),
  id = 1:5
)

# Split each record into single codes, look every code up in `lookup_long`,
# keep each group once per record, and spread the groups of a record over the
# columns group_1, group_2, ...
df_2 %>%
  unnest_tokens(code_new, code, token = "words", drop = FALSE) %>%
  left_join(lookup_long, by = c("code_new" = "words")) %>%
  group_by(id) %>%
  distinct(group, .keep_all = TRUE) %>%
  # Position of each distinct group within its record; becomes the column
  # suffix after pivoting.
  mutate(id_wide = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    # Identify rows by record id as well as by code string: two records with
    # the same code string would otherwise be merged into one row and produce
    # list-columns.
    id_cols = c(id, code),
    names_from = id_wide,
    names_prefix = "group_",
    values_from = group
  ) %>%
  # `id` was only needed to keep records apart while pivoting.
  select(-id)
给出:
# A tibble: 5 x 3
code group_1 group_2
<chr> <chr> <chr>
1 [aa31, aa34] group A <NA>
2 [lg94y] group C <NA>
3 [lg84x] group C <NA>
4 [x22z, x23z] group B <NA>
5 [x22z, aa11] group B group A
标记化可能是最好的方法。您可以像这样手动构建查找表:
# Build a long lookup table `d` (one row per code, with its group) from the
# bracketed strings in `lookup$pattern`, using base R only.

# Remove "]", "[" and spaces in one pass; "[][ ]" is a bracket expression
# matching exactly those three characters.
codes_flat <- gsub("[][ ]", "", lookup$pattern)

# One character vector of codes per dictionary row.
codes_by_group <- strsplit(codes_flat, ",", fixed = TRUE)

# Repeat each group label once per code of that group. Building the data frame
# directly avoids an intermediate named `c` (which would mask base::c) and
# also copes with a dictionary row that has no codes.
d <- data.frame(
  value = unlist(codes_by_group),
  group = rep(lookup$group, lengths(codes_by_group))
)
d
#> value group
#> 1 aa11 group A
#> 2 aa21 group A
#> 3 aa31 group A
#> 4 aa34 group A
#> 5 x23z group B
#> 6 x22z group B
#> 7 x32z group B
#> 8 x35z group B
#> 9 x34z group B
#> 10 lg32z group C
#> 11 lg22z group C
#> 12 lg84x group C
#> 13 lg94y group C
或者您可以跳过标记化,直接进行“模糊连接”:
# Turn each bracketed code list into a regex alternation, e.g.
# "[aa11, aa21]" becomes "aa11|aa21", and store it next to the dictionary
# as column `a`.
a <- gsub("[][]", "", lookup$pattern)  # drop both square brackets
a <- gsub(", ", "|", a, fixed = TRUE)  # separators become regex "or"
lookup2 <- data.frame(lookup, a = a, check.names = FALSE)
lookup2
#> pattern group a
#> 1 [aa11, aa21, aa31, aa34] group A aa11|aa21|aa31|aa34
#> 2 [x23z, x22z, x32z, x35z, x34z] group B x23z|x22z|x32z|x35z|x34z
#> 3 [lg32z, lg22z, lg84x, lg94y] group C lg32z|lg22z|lg84x|lg94y
# Assign a group to every record in `df` by matching its code string (`V1`)
# against each group's alternation pattern (`lookup2$a`); the result goes
# into the new column `V3`.

# Start from NA so that unmatched records are explicit and re-running the
# loop never keeps stale values.
df$V3 <- NA_character_

# seq_len() yields an empty sequence for an empty lookup, unlike 1:nrow().
for (i in seq_len(nrow(lookup2))) {
  # \b anchors stop a code from matching inside a longer code
  # (e.g. "aa11" inside "aa111").
  hit <- grepl(paste0("\\b(", lookup2$a[i], ")\\b"), df$V1)
  # A record matching several groups ends up with the last matching group.
  df[hit, "V3"] <- as.character(lookup2$group[i])
}
df
#> V1 V2 V3
#> 1 [aa31, aa34] group A group A
#> 2 [lg94z] group C <NA>
#> 3 [lg84x] group C group C
#> 4 [x22z, x23z] group B group B
Created on 2021-09-22 by the reprex package (v2.0.1)
数据:
# Dictionary: each row pairs a bracketed, comma-separated code list
# (`pattern`) with the label of its group.
lookup <- data.frame(
  pattern = c(
    "[aa11, aa21, aa31, aa34]",
    "[x23z, x22z, x32z, x35z, x34z]",
    "[lg32z, lg22z, lg84x, lg94y]"
  ),
  group = c("group A", "group B", "group C")
)

# Records to classify (`V1`) with the group given for them in the question
# (`V2`). "[lg94z]" is kept exactly as given; the dictionary lists "lg94y".
df <- data.frame(
  V1 = c("[aa31, aa34]", "[lg94z]", "[lg84x]", "[x22z, x23z]"),
  V2 = c("group A", "group C", "group C", "group B")
)
请注意,您输入的数据中有一处拼写错误(数据中是 lg94z,而字典中是 lg94y),因此 df[2, "V3"] 返回 NA。我很确定这个循环可以向量化,只是暂时还没想出怎么做。如有进展我会更新。