模糊连接一个数据集中多列上的字符串
Fuzzy join strings on multiple columns in one dataset
我想将一列 (df2$brands) 与许多其他列 (df1$F6_1:F6_12) 进行模糊匹配,这些列包含相同的字符串,但有一些小的拼写错误。
我有两个数据集:
df1:
# Example input: a 10-row tibble with twelve character columns, F6_1 to F6_12.
# Each cell holds a brand name as entered, including misspellings such as
# "Braand1", "Brannd4" and "Bramd3"; blank cells are empty strings ("").
df1 <- structure(list(F6_1 = c("Braand1", "Brand2", "Brand3", "Brand4", "Brand4",
"Brand5", "Brand6", "Brand7", "Brand6", "Brand8"), F6_2 = c("Brand9",
"", "Brand4", "Brando6", "Brand6", "Brand8", "Brannd4", "Brandd8",
"Brand6", "Brand6"), F6_3 = c("Brand6", "", "Brand6",
"Brand10", "Brand10", "", "Brand8", "Brand10", "Brand8", "Brand3"
), F6_4 = c("", "", "Brand10", "", "Brand3", "", "Brand6", "Brand6",
"Bramd3", "BPand3"), F6_5 = c("", "", "", "", "Brand6",
"", "Brand1", "Brand1", "", "Brand1"), F6_6 = c("",
"", "", "", "Brand6", "", "Brand3", "", "", "Brand1"), F6_7 = c("",
"", "", "", "Brand1", "", "Brand1", "", "", "Brand1"), F6_8 = c("",
"", "", "", "Brand1", "", "", "", "", "Brand6"
), F6_9 = c("", "", "", "", "Brrandu3", "", "", "", "", ""), F6_10 = c("",
"", "", "", "Brand6", "", "", "", "", ""), F6_11 = c("",
"", "", "", "Brand6", "", "", "", "", ""), F6_12 = c("", "",
"", "", "Brand6", "", "", "", "", "")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
df2:
# Reference table: a 6-row tibble with a single character column `brands`
# holding the correctly spelled names "Brand1" to "Brand6".
df2 <- structure(list(brands = c("Brand1", "Brand2", "Brand3", "Brand4", "Brand5",
"Brand6")), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
我尝试使用 fuzzyjoin 库中的 stringdist_left_join() 函数,效果很好。
library(tidyverse)
library(fuzzyjoin)
# Single-column attempt: fuzzy-match df1$F6_1 against the reference brands.
# `brands` is renamed to F6_1 in the reference table so that `by = "F6_1"`
# names the key column on both sides of the join.
df1_F6_1 <- df1 %>% select(F6_1)
df2_F6_1 <- df2 %>% select(F6_1 = brands)
# Fixed: the left table was previously referenced as `df_F6_1`, an object that
# is never created; the table built above is `df1_F6_1`.
df_joined_F6_1 <- stringdist_left_join(
  df1_F6_1, df2_F6_1,
  by = "F6_1", method = "soundex"
)
这仅适用于一列。但是,我想对完整的 df1 数据集执行此操作。这可以通过对每一列分别进行模糊连接、最后再把结果全部合并在一起来解决。但是应该有更简单、更方便的方法来做到这一点。
我的输出应该是这样的:
# Desired output: a 10-row tibble that keeps every F6_n column and adds an
# F6_n_a column next to it holding the corrected brand name. Blank cells are
# written as NA here rather than as empty strings.
# NOTE(review): F6_3 / F6_3_a row 6 is "Brand8" here, but df1$F6_3 row 6 is "";
# confirm which value is intended.
# NOTE(review): F6_8_a row 10 is NA although F6_8 row 10 is "Brand6", which is
# present in df2$brands; confirm whether this should be "Brand6".
df3 <- structure(list(F6_1 = c("Braand1", "Brand2", "Brand3", "Brand4",
"Brand4", "Brand5", "Brand6", "Brand7", "Brand6", "Brand8"),
F6_1_a = c("Brand1", "Brand2", "Brand3", "Brand4", "Brand4",
"Brand5", "Brand6", "Brand7", "Brand6", "Brand8"), F6_2 = c("Brand9",
NA, "Brand4", "Brando6", "Brand6", "Brand8", "Brannd4", "Brandd8",
"Brand6", "Brand6"), F6_2_a = c("Brand9", NA, "Brand4", "Brand6",
"Brand6", "Brand8", "Brand4", "Brand8", "Brand6", "Brand6"
), F6_3 = c("Brand6", NA, "Brand6", "Brand10", "Brand10",
"Brand8", "Brand8", "Brand10", "Brand8", "Brand3"), F6_3_a = c("Brand6",
NA, "Brand6", "Brand10", "Brand10", "Brand8", "Brand8", "Brand10",
"Brand8", "Brand3"), F6_4 = c(NA, NA, "Brand10", NA, "Brand3",
NA, "Brand6", "Brand6", "Bramd3", "BPand3"), F6_4_a = c(NA,
NA, "Brand10", NA, "Brand3", NA, "Brand6", "Brand6", "Brand3",
"Brand3"), F6_5 = c(NA, NA, NA, NA, "Brand6", NA, "Brand1",
"Brand1", NA, "Brand1"), F6_5_a = c(NA, NA, NA, NA, "Brand6",
NA, "Brand1", "Brand1", NA, "Brand1"), F6_6 = c(NA, NA, NA,
NA, "Brand6", NA, "Brand3", NA, NA, "Brand1"), F6_6_a = c(NA,
NA, NA, NA, "Brand6", NA, "Brand3", NA, NA, "Brand1"), F6_7 = c(NA,
NA, NA, NA, "Brand1", NA, "Brand1", NA, NA, "Brand1"), F6_7_a = c(NA,
NA, NA, NA, "Brand1", NA, "Brand1", NA, NA, "Brand1"), F6_8 = c(NA,
NA, NA, NA, "Brand1", NA, NA, NA, NA, "Brand6"), F6_8_a = c(NA,
NA, NA, NA, "Brand1", NA, NA, NA, NA, NA), F6_9 = c(NA, NA,
NA, NA, "Brrandu3", NA, NA, NA, NA, NA), F6_9_a = c(NA, NA,
NA, NA, "Brand3", NA, NA, NA, NA, NA), F6_10 = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_10_a = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_11 = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_11_a = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_12 = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_12_a = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
这是一种使用 tidyr 使数据变长,然后进行连接,然后再次变宽的方法。
# Reshape df1 to long form, fuzzy-join every value against df2$brands in a
# single pass, then spread the original/matched pairs back out to wide form.
df1 %>%
  rowid_to_column() %>%
  # names_to is passed by name: current tidyr places `...` directly after
  # `cols`, so a positional "col" is no longer picked up as names_to
  pivot_longer(-rowid, names_to = "col", values_to = "brands") %>%
  # explicit key; the join returns both sides as brands.x / brands.y
  stringdist_left_join(df2, by = "brands", method = "soundex") %>%
  # just keep first match, since real data won't have multiples
  group_by(rowid, col) %>%
  slice(1) %>%
  # drop the grouping so the reshaping steps below run on a plain tibble
  ungroup() %>%
  # tidying steps to make clean column titles
  rename("orig" = brands.x,
         "a" = brands.y) %>%
  # pivot_longer() replaces the superseded gather()
  pivot_longer(c(orig, a), names_to = "col2", values_to = "val") %>%
  unite(col, c(col, col2)) %>%
  # make data wide again
  pivot_wider(names_from = col, values_from = val)
我想将一列 (df2$brands) 与许多其他列 (df1$F6_1:F6_12) 进行模糊匹配,这些列包含相同的字符串,但有一些小的拼写错误。
我有两个数据集:
df1:
# Example input: a 10-row tibble with twelve character columns, F6_1 to F6_12.
# Each cell holds a brand name as entered, including misspellings such as
# "Braand1", "Brannd4" and "Bramd3"; blank cells are empty strings ("").
df1 <- structure(list(F6_1 = c("Braand1", "Brand2", "Brand3", "Brand4", "Brand4",
"Brand5", "Brand6", "Brand7", "Brand6", "Brand8"), F6_2 = c("Brand9",
"", "Brand4", "Brando6", "Brand6", "Brand8", "Brannd4", "Brandd8",
"Brand6", "Brand6"), F6_3 = c("Brand6", "", "Brand6",
"Brand10", "Brand10", "", "Brand8", "Brand10", "Brand8", "Brand3"
), F6_4 = c("", "", "Brand10", "", "Brand3", "", "Brand6", "Brand6",
"Bramd3", "BPand3"), F6_5 = c("", "", "", "", "Brand6",
"", "Brand1", "Brand1", "", "Brand1"), F6_6 = c("",
"", "", "", "Brand6", "", "Brand3", "", "", "Brand1"), F6_7 = c("",
"", "", "", "Brand1", "", "Brand1", "", "", "Brand1"), F6_8 = c("",
"", "", "", "Brand1", "", "", "", "", "Brand6"
), F6_9 = c("", "", "", "", "Brrandu3", "", "", "", "", ""), F6_10 = c("",
"", "", "", "Brand6", "", "", "", "", ""), F6_11 = c("",
"", "", "", "Brand6", "", "", "", "", ""), F6_12 = c("", "",
"", "", "Brand6", "", "", "", "", "")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
df2:
# Reference table: a 6-row tibble with a single character column `brands`
# holding the correctly spelled names "Brand1" to "Brand6".
df2 <- structure(list(brands = c("Brand1", "Brand2", "Brand3", "Brand4", "Brand5",
"Brand6")), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
我尝试使用 fuzzyjoin 库中的 stringdist_left_join() 函数,效果很好。
library(tidyverse)
library(fuzzyjoin)
# Single-column attempt: fuzzy-match df1$F6_1 against the reference brands.
# `brands` is renamed to F6_1 in the reference table so that `by = "F6_1"`
# names the key column on both sides of the join.
df1_F6_1 <- df1 %>% select(F6_1)
df2_F6_1 <- df2 %>% select(F6_1 = brands)
# Fixed: the left table was previously referenced as `df_F6_1`, an object that
# is never created; the table built above is `df1_F6_1`.
df_joined_F6_1 <- stringdist_left_join(
  df1_F6_1, df2_F6_1,
  by = "F6_1", method = "soundex"
)
这仅适用于一列。但是,我想对完整的 df1 数据集执行此操作。这可以通过对每一列分别进行模糊连接、最后再把结果全部合并在一起来解决。但是应该有更简单、更方便的方法来做到这一点。
我的输出应该是这样的:
# Desired output: a 10-row tibble that keeps every F6_n column and adds an
# F6_n_a column next to it holding the corrected brand name. Blank cells are
# written as NA here rather than as empty strings.
# NOTE(review): F6_3 / F6_3_a row 6 is "Brand8" here, but df1$F6_3 row 6 is "";
# confirm which value is intended.
# NOTE(review): F6_8_a row 10 is NA although F6_8 row 10 is "Brand6", which is
# present in df2$brands; confirm whether this should be "Brand6".
df3 <- structure(list(F6_1 = c("Braand1", "Brand2", "Brand3", "Brand4",
"Brand4", "Brand5", "Brand6", "Brand7", "Brand6", "Brand8"),
F6_1_a = c("Brand1", "Brand2", "Brand3", "Brand4", "Brand4",
"Brand5", "Brand6", "Brand7", "Brand6", "Brand8"), F6_2 = c("Brand9",
NA, "Brand4", "Brando6", "Brand6", "Brand8", "Brannd4", "Brandd8",
"Brand6", "Brand6"), F6_2_a = c("Brand9", NA, "Brand4", "Brand6",
"Brand6", "Brand8", "Brand4", "Brand8", "Brand6", "Brand6"
), F6_3 = c("Brand6", NA, "Brand6", "Brand10", "Brand10",
"Brand8", "Brand8", "Brand10", "Brand8", "Brand3"), F6_3_a = c("Brand6",
NA, "Brand6", "Brand10", "Brand10", "Brand8", "Brand8", "Brand10",
"Brand8", "Brand3"), F6_4 = c(NA, NA, "Brand10", NA, "Brand3",
NA, "Brand6", "Brand6", "Bramd3", "BPand3"), F6_4_a = c(NA,
NA, "Brand10", NA, "Brand3", NA, "Brand6", "Brand6", "Brand3",
"Brand3"), F6_5 = c(NA, NA, NA, NA, "Brand6", NA, "Brand1",
"Brand1", NA, "Brand1"), F6_5_a = c(NA, NA, NA, NA, "Brand6",
NA, "Brand1", "Brand1", NA, "Brand1"), F6_6 = c(NA, NA, NA,
NA, "Brand6", NA, "Brand3", NA, NA, "Brand1"), F6_6_a = c(NA,
NA, NA, NA, "Brand6", NA, "Brand3", NA, NA, "Brand1"), F6_7 = c(NA,
NA, NA, NA, "Brand1", NA, "Brand1", NA, NA, "Brand1"), F6_7_a = c(NA,
NA, NA, NA, "Brand1", NA, "Brand1", NA, NA, "Brand1"), F6_8 = c(NA,
NA, NA, NA, "Brand1", NA, NA, NA, NA, "Brand6"), F6_8_a = c(NA,
NA, NA, NA, "Brand1", NA, NA, NA, NA, NA), F6_9 = c(NA, NA,
NA, NA, "Brrandu3", NA, NA, NA, NA, NA), F6_9_a = c(NA, NA,
NA, NA, "Brand3", NA, NA, NA, NA, NA), F6_10 = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_10_a = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_11 = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_11_a = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_12 = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA), F6_12_a = c(NA, NA,
NA, NA, "Brand6", NA, NA, NA, NA, NA)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
这是一种使用 tidyr 使数据变长,然后进行连接,然后再次变宽的方法。
# Reshape df1 to long form, fuzzy-join every value against df2$brands in a
# single pass, then spread the original/matched pairs back out to wide form.
df1 %>%
  rowid_to_column() %>%
  # names_to is passed by name: current tidyr places `...` directly after
  # `cols`, so a positional "col" is no longer picked up as names_to
  pivot_longer(-rowid, names_to = "col", values_to = "brands") %>%
  # explicit key; the join returns both sides as brands.x / brands.y
  stringdist_left_join(df2, by = "brands", method = "soundex") %>%
  # just keep first match, since real data won't have multiples
  group_by(rowid, col) %>%
  slice(1) %>%
  # drop the grouping so the reshaping steps below run on a plain tibble
  ungroup() %>%
  # tidying steps to make clean column titles
  rename("orig" = brands.x,
         "a" = brands.y) %>%
  # pivot_longer() replaces the superseded gather()
  pivot_longer(c(orig, a), names_to = "col2", values_to = "val") %>%
  unite(col, c(col, col2)) %>%
  # make data wide again
  pivot_wider(names_from = col, values_from = val)