模糊匹配(和覆盖)向量条目
Fuzzy matching (and overwriting) vector entries
我有 5 个带有列名的向量,它们相似但不相同。
我正在尝试找到一种方法,根据 vector1 中的名称来更正 vector2、vector3、vector4、vector5 中的条目。
我已经找到了一些思路,并据此写出了下面的代码。但最后,我甚至无法比较前两个向量,更不用说覆盖它们了。
library(dplyr)
library(fuzzyjoin)
# vector1 holds the reference names; vector2..vector5 hold similar but not
# identical names that should be corrected against vector1.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("some thing", "no thing", "addition", "anything", "number4")
vector3 <- c("some thing wrong", "nothing", "anything_")
vector4 <- c("something", "nothingg", "anything", "number_4")
vector5 <- c("something", "nothing", "anything happening", "number4")
我是这样开始的:
# For every entry of vector1, find the position of the entry of vector2 with
# the smallest edit distance (adist() returns the pairwise distance matrix,
# one row per entry of vector1).
closest_idx <- apply(adist(x = vector1, y = vector2), 1, which.min)
closest_idx
# Pair each reference name with its nearest entry of vector2.
data.frame(
  string_to_match = vector1,
  closest_match = vector2[closest_idx]
)
string_to_match closest_match
1 something some thing
2 nothing no thing
3 anything anything
4 number4 number4
是否可以在此方案中加入距离,并根据距离覆盖向量?
想要的结果:
string_to_match closest_match distance
1 something some thing 1
2 nothing no thing 1
3 anything anything 0
4 number4 number4 0
# Desired content of the vectors once vector2..vector5 have been corrected
# against vector1.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("something", "nothing", "addition", "anything", "number4")
vector3 <- c("something", "nothing", "anything")
vector4 <- c("something", "nothing", "anything", "number4")
vector5 <- c("something", "nothing", "anything", "number4")
有没有人可以让我走上正轨?
fuzzyjoin 包中的函数可以添加距离度量。如果您只需选取 closest_match 列/向量,则无需覆盖。
library(fuzzyjoin); library(dplyr)
# Input data: vector1 is the reference; the remaining vectors contain
# similar but not identical names.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("some thing", "no thing", "addition", "anything", "number4")
vector3 <- c("some thing wrong", "nothing", "anything_")
vector4 <- c("something", "nothingg", "anything", "number_4")
vector5 <- c("something", "nothing", "anything happening", "number4")
# Solution giving the desired output for vector2: left-join vector1 to
# vector2, accepting a string distance of at most 1 and storing that
# distance in a "distance" column.
tibble(things = vector1) %>%
  stringdist_left_join(
    tibble(things = vector2),
    max_dist = 1,
    distance_col = "distance"
  )
#> Joining by: "things"
#> # A tibble: 4 x 3
#> things.x things.y distance
#> <chr> <chr> <dbl>
#> 1 something some thing 1
#> 2 nothing no thing 1
#> 3 anything anything 0
#> 4 number4 number4 0
# Fuller solution for vector3 (or any other vector): join every entry of
# vector3 to the entries of vector1 with a large distance limit (99), so the
# table lists the candidate matches together with their distances.
full_table_of_possible_matches_for_vector3 <- tibble(things = vector3) %>%
  stringdist_left_join(
    tibble(things = vector1),
    max_dist = 99,
    distance_col = "distance"
  )
full_table_of_possible_matches_for_vector3
#> Joining by: "things"
#> # A tibble: 12 x 3
#> things.x things.y distance
#> <chr> <chr> <dbl>
#> 1 some thing wrong something 7
#> 2 some thing wrong nothing 10
#> 3 some thing wrong anything 11
#> 4 some thing wrong number4 14
#> 5 nothing something 3
#> 6 nothing nothing 0
#> 7 nothing anything 2
#> 8 nothing number4 6
#> 9 anything_ something 5
#> 10 anything_ nothing 3
#> 11 anything_ anything 1
#> 12 anything_ number4 8
# For each entry of vector3 (things.x), rank its candidate matches by
# distance and keep only the closest one. filter() leaves the surviving rows
# in their original order; the result is still grouped by things.x.
# NOTE(review): grouping by the text itself means duplicated entries in
# vector3 would share one group and yield a single row - confirm the input
# has no duplicates.
(table_of_closest_matches <- full_table_of_possible_matches_for_vector3 %>%
group_by(things.x) %>%
mutate(rank = row_number(distance)) %>%
filter(rank == 1))
#> # A tibble: 3 x 4
#> # Groups: things.x [3]
#> things.x things.y distance rank
#> <chr> <chr> <dbl> <int>
#> 1 some thing wrong something 7 1
#> 2 nothing nothing 0 1
#> 3 anything_ anything 1 1
# slice_min(distance, with_ties = FALSE) is deliberately not used in the
# pipeline above: it would change the order of the rows.
# The corrected vector3 is the column of closest reference names.
new_vector3 <- table_of_closest_matches[["things.y"]]
new_vector3
#> [1] "something" "nothing" "anything"
# Correct vector2 against vector1: for every entry of vector2 keep the
# closest entry of vector1.
# A row id is carried through the join and used as the grouping key, so
# entries are grouped by position rather than by text. Grouping by things.x
# would merge duplicated entries of vector2 into a single group and return
# fewer values than vector2 has entries.
# `by` is left unspecified on purpose: "things" is the only column shared by
# both tables, so the join key is unchanged.
new_vector2 <- tibble(id = seq_along(vector2), things = vector2) %>%
  stringdist_left_join(
    tibble(things = vector1),
    max_dist = 99,
    distance_col = "distance"
  ) %>%
  group_by(id) %>%
  # row_number() breaks ties by row order, so exactly one row per id has rank 1.
  mutate(rank = row_number(distance)) %>%
  filter(rank == 1) %>%
  ungroup() %>%
  pull(things.y)
new_vector2
#> Joining by: "things"
#> [1] "something" "nothing" "anything" "anything" "number4"
由 reprex 包 (v0.3.0) 创建于 2021-01-06
我有 5 个带有列名的向量,它们相似但不相同。
我正在尝试找到一种方法,根据 vector1 中的名称来更正 vector2、vector3、vector4、vector5 中的条目。
我已经有了一些想法,目前的代码如下:
library(dplyr)
library(fuzzyjoin)
# vector1 holds the reference names; vector2..vector5 hold similar but not
# identical names that should be corrected against vector1.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("some thing", "no thing", "addition", "anything", "number4")
vector3 <- c("some thing wrong", "nothing", "anything_")
vector4 <- c("something", "nothingg", "anything", "number_4")
vector5 <- c("something", "nothing", "anything happening", "number4")
我是这样开始的:
# For every entry of vector1, find the position of the entry of vector2 with
# the smallest edit distance (adist() returns the pairwise distance matrix,
# one row per entry of vector1).
closest_idx <- apply(adist(x = vector1, y = vector2), 1, which.min)
closest_idx
# Pair each reference name with its nearest entry of vector2.
data.frame(
  string_to_match = vector1,
  closest_match = vector2[closest_idx]
)
string_to_match closest_match
1 something some thing
2 nothing no thing
3 anything anything
4 number4 number4
是否可以在此方案中加入距离,并根据距离覆盖向量?
想要的结果:
string_to_match closest_match distance
1 something some thing 1
2 nothing no thing 1
3 anything anything 0
4 number4 number4 0
# Desired content of the vectors once vector2..vector5 have been corrected
# against vector1.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("something", "nothing", "addition", "anything", "number4")
vector3 <- c("something", "nothing", "anything")
vector4 <- c("something", "nothing", "anything", "number4")
vector5 <- c("something", "nothing", "anything", "number4")
有没有人可以让我走上正轨?
fuzzyjoin 包中的函数可以添加距离度量。如果您只需选取 closest_match 列/向量,则无需覆盖。
library(fuzzyjoin); library(dplyr)
# Input data: vector1 is the reference; the remaining vectors contain
# similar but not identical names.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("some thing", "no thing", "addition", "anything", "number4")
vector3 <- c("some thing wrong", "nothing", "anything_")
vector4 <- c("something", "nothingg", "anything", "number_4")
vector5 <- c("something", "nothing", "anything happening", "number4")
# Solution giving the desired output for vector2: left-join vector1 to
# vector2, accepting a string distance of at most 1 and storing that
# distance in a "distance" column.
tibble(things = vector1) %>%
  stringdist_left_join(
    tibble(things = vector2),
    max_dist = 1,
    distance_col = "distance"
  )
#> Joining by: "things"
#> # A tibble: 4 x 3
#> things.x things.y distance
#> <chr> <chr> <dbl>
#> 1 something some thing 1
#> 2 nothing no thing 1
#> 3 anything anything 0
#> 4 number4 number4 0
# Fuller solution for vector3 (or any other vector): join every entry of
# vector3 to the entries of vector1 with a large distance limit (99), so the
# table lists the candidate matches together with their distances.
full_table_of_possible_matches_for_vector3 <- tibble(things = vector3) %>%
  stringdist_left_join(
    tibble(things = vector1),
    max_dist = 99,
    distance_col = "distance"
  )
full_table_of_possible_matches_for_vector3
#> Joining by: "things"
#> # A tibble: 12 x 3
#> things.x things.y distance
#> <chr> <chr> <dbl>
#> 1 some thing wrong something 7
#> 2 some thing wrong nothing 10
#> 3 some thing wrong anything 11
#> 4 some thing wrong number4 14
#> 5 nothing something 3
#> 6 nothing nothing 0
#> 7 nothing anything 2
#> 8 nothing number4 6
#> 9 anything_ something 5
#> 10 anything_ nothing 3
#> 11 anything_ anything 1
#> 12 anything_ number4 8
# For each entry of vector3 (things.x), rank its candidate matches by
# distance and keep only the closest one. filter() leaves the surviving rows
# in their original order; the result is still grouped by things.x.
# NOTE(review): grouping by the text itself means duplicated entries in
# vector3 would share one group and yield a single row - confirm the input
# has no duplicates.
(table_of_closest_matches <- full_table_of_possible_matches_for_vector3 %>%
group_by(things.x) %>%
mutate(rank = row_number(distance)) %>%
filter(rank == 1))
#> # A tibble: 3 x 4
#> # Groups: things.x [3]
#> things.x things.y distance rank
#> <chr> <chr> <dbl> <int>
#> 1 some thing wrong something 7 1
#> 2 nothing nothing 0 1
#> 3 anything_ anything 1 1
# slice_min(distance, with_ties = FALSE) is deliberately not used in the
# pipeline above: it would change the order of the rows.
# The corrected vector3 is the column of closest reference names.
new_vector3 <- table_of_closest_matches[["things.y"]]
new_vector3
#> [1] "something" "nothing" "anything"
# Correct vector2 against vector1: for every entry of vector2 keep the
# closest entry of vector1.
# A row id is carried through the join and used as the grouping key, so
# entries are grouped by position rather than by text. Grouping by things.x
# would merge duplicated entries of vector2 into a single group and return
# fewer values than vector2 has entries.
# `by` is left unspecified on purpose: "things" is the only column shared by
# both tables, so the join key is unchanged.
new_vector2 <- tibble(id = seq_along(vector2), things = vector2) %>%
  stringdist_left_join(
    tibble(things = vector1),
    max_dist = 99,
    distance_col = "distance"
  ) %>%
  group_by(id) %>%
  # row_number() breaks ties by row order, so exactly one row per id has rank 1.
  mutate(rank = row_number(distance)) %>%
  filter(rank == 1) %>%
  ungroup() %>%
  pull(things.y)
new_vector2
#> Joining by: "things"
#> [1] "something" "nothing" "anything" "anything" "number4"
由 reprex 包 (v0.3.0) 创建于 2021-01-06