模糊匹配(和覆盖)向量条目

Fuzzy matching (and overwriting) vector entries

我有 5 个带有列名的向量,它们相似但不相同。

我正在尝试找到一种方法,根据 vector1 中的名称来更正 vector2、vector3、vector4 和 vector5 中的条目。

我从这里和这里得到了一些想法,并由此写出了下面的代码。但最后,我甚至无法比较前两个向量,更不用说覆盖它们了。

library(dplyr)
library(fuzzyjoin)


# Reference spellings; the remaining vectors are to be corrected against these.
vector1 <- c("something", "nothing", "anything", "number4")

# Variants whose entries are similar, but not identical, to the reference names.
vector2 <- c("some thing", "no thing", "addition", "anything", "number4")
vector3 <- c("some thing wrong", "nothing", "anything_")
vector4 <- c("something", "nothingg", "anything", "number_4")
vector5 <- c("something", "nothing", "anything happening", "number4")

我是这样开始的:

# Edit-distance matrix: one row per vector1 entry, one column per vector2 entry.
dist_matrix <- adist(x = vector1, y = vector2)

# For every vector1 entry, the position of its nearest vector2 entry.
nearest_idx <- apply(dist_matrix, 1, which.min)
nearest_idx

# Pair each reference name with the vector2 entry closest to it.
data.frame(
  string_to_match = vector1,
  closest_match = vector2[nearest_idx]
)

           
  string_to_match closest_match
1       something    some thing
2         nothing      no thing
3        anything      anything
4         number4       number4

是否可以将距离添加到此解决方案中,并根据距离覆盖向量?

想要的结果:

  string_to_match closest_match  distance
1       something    some thing   1
2         nothing      no thing   1
3        anything      anything   0
4         number4       number4   0

# Desired contents of the vectors once their entries have been corrected
# against vector1 (vector2 still contains "addition").
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("something", "nothing", "addition", "anything", "number4")
vector3 <- c("something", "nothing", "anything")
vector4 <- c("something", "nothing", "anything", "number4")
vector5 <- c("something", "nothing", "anything", "number4")

有没有人可以让我走上正轨?

fuzzyjoin 包中的函数可以添加距离度量。如果您只选取 closest_match 列/向量,则无需覆盖。

library(fuzzyjoin); library(dplyr)
# Example data: vector1 holds the reference names, vector2..vector5 hold the
# misspelled variants that should be mapped back onto them.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("some thing", "no thing", "addition", "anything", "number4")
vector3 <- c("some thing wrong", "nothing", "anything_")
vector4 <- c("something", "nothingg", "anything", "number_4")
vector5 <- c("something", "nothing", "anything happening", "number4")

# Solution producing the desired output for vector 2: fuzzy-join the reference
# names to vector2, keeping only pairs with a string distance of at most 1 and
# reporting that distance in a "distance" column.
tibble(things = vector1) %>%
  stringdist_left_join(
    y = tibble(things = vector2),
    max_dist = 1,
    distance_col = "distance"
  )
#> Joining by: "things"
#> # A tibble: 4 x 3
#>   things.x  things.y   distance
#>   <chr>     <chr>         <dbl>
#> 1 something some thing        1
#> 2 nothing   no thing          1
#> 3 anything  anything          0
#> 4 number4   number4           0

# Fuller solution for vector 3 (or any other vector): pair every vector3 entry
# with every reference name. With max_dist = 99 none of these short strings is
# filtered out, so all 3 x 4 = 12 combinations appear with their distance.
(full_table_of_possible_matches_for_vector3 <-
  tibble(things = vector3) %>%
  stringdist_left_join(
    y = tibble(things = vector1),
    max_dist = 99,
    distance_col = "distance"
  ))
#> Joining by: "things"
#> # A tibble: 12 x 3
#>    things.x         things.y  distance
#>    <chr>            <chr>        <dbl>
#>  1 some thing wrong something        7
#>  2 some thing wrong nothing         10
#>  3 some thing wrong anything        11
#>  4 some thing wrong number4         14
#>  5 nothing          something        3
#>  6 nothing          nothing          0
#>  7 nothing          anything         2
#>  8 nothing          number4          6
#>  9 anything_        something        5
#> 10 anything_        nothing          3
#> 11 anything_        anything         1
#> 12 anything_        number4          8
# Keep, for each distinct vector3 entry, only its nearest reference name.
# row_number() + filter() is used rather than
# slice_min(distance, with_ties = FALSE) because slice_min() would change the
# row order. ungroup() drops the grouping so later steps see a plain tibble.
(table_of_closest_matches <- full_table_of_possible_matches_for_vector3 %>%
  group_by(things.x) %>%
  mutate(rank = row_number(distance)) %>%
  filter(rank == 1) %>%
  ungroup())
#> # A tibble: 3 x 4
#>   things.x         things.y  distance  rank
#>   <chr>            <chr>        <dbl> <int>
#> 1 some thing wrong something        7     1
#> 2 nothing          nothing          0     1
#> 3 anything_        anything         1     1

# Look the corrected names up by the original entries instead of taking the
# things.y column as-is. group_by(things.x) merges duplicate entries into one
# group, and entries without any match are dropped by the rank filter, so the
# column alone can be shorter than vector3. match() keeps the result the same
# length and order as vector3, with NA where no match was found.
(new_vector3 <- table_of_closest_matches$things.y[
  match(vector3, table_of_closest_matches$things.x)
])
#> [1] "something" "nothing"   "anything"

# Same procedure for vector2: find the nearest reference name for each distinct
# entry, then drop the grouping so the lookup table is a plain tibble.
closest_matches_for_vector2 <- stringdist_left_join(
  x = tibble(things = vector2),
  y = tibble(things = vector1),
  max_dist = 99, distance_col = "distance"
) %>%
  group_by(things.x) %>%
  mutate(rank = row_number(distance)) %>%
  filter(rank == 1) %>%
  ungroup()
#> Joining by: "things"

# Map every original vector2 entry onto its corrected name. Using match()
# rather than the things.y column directly keeps the result aligned with
# vector2 (same length and order) even if vector2 has duplicate entries or an
# entry with no match, which would otherwise be collapsed or dropped.
(new_vector2 <- closest_matches_for_vector2$things.y[
  match(vector2, closest_matches_for_vector2$things.x)
])
#> [1] "something" "nothing"   "anything"  "anything"  "number4"

由 reprex 包 (v0.3.0) 创建于 2021-01-06