检查一个向量的元素是否包含在另一个长度不同的向量的元素中
Check if element is contained on an unequal element of different length
我正在尝试查找字符向量的一部分是否与另一个字符向量的一部分重叠
# Example inputs: every element is a "/"-separated string of month abbreviations.
# x has two elements and y has three, so the vectors differ in length.
x <- c("OCT/NOV/DEC", "JAN/DEC/AUG")
y <- c("JAN/FEB/MAR", "APR/MAY/JUN", "JUL/AUG/SEP")
# Months should be split into separate characters
So I would use:
# Split each element of x on "/" into a character vector of months.
list_x <- strsplit(x, '/')
list_x
#> [[1]]
#> [1] "OCT" "NOV" "DEC"
#>
#> [[2]]
#> [1] "JAN" "DEC" "AUG"
# Do the same for y, which yields three month vectors.
list_y <- strsplit(y, '/')
list_y
#> [[1]]
#> [1] "JAN" "FEB" "MAR"
#>
#> [[2]]
#> [1] "APR" "MAY" "JUN"
#>
#> [[3]]
#> [1] "JUL" "AUG" "SEP"
正如我们所见,list_x[[1]] 中没有任何元素出现在 list_y 中,因此应返回 FALSE;
list_x[[2]] 含有 “JAN” 和 “AUG”,它们分别出现在 list_y[[1]] 和 list_y[[3]] 中,因此应返回 TRUE。
# Expected result: one logical value per element of x,
# TRUE when that element shares a month with any element of y.
c(FALSE, TRUE) # for each of x elements
# I tried:
# Position-by-position overlap test between two vectors of "/"-separated
# strings: element i of x is compared with element i of y only.
# mapply() recycles the shorter input, and warns when the longer length is
# not a multiple of the shorter one.
detect <- function(x, y){
  tokens_x <- strsplit(x, '/')
  tokens_y <- strsplit(y, '/')
  # TRUE when the two token vectors have at least one value in common.
  shares_token <- function(left, right) {
    any(left %in% right)
  }
  mapply(shares_token, tokens_x, tokens_y)
}
# x has 2 elements and y has 3, so the pairwise mapply() inside detect recycles x.
detect(x,y)
# This emits a warning that the longer argument is not a multiple of the
# length of the shorter one, and returns:
#> [1] FALSE FALSE FALSE
那么如何判断y元素中是否有x元素?
编辑:根据 Akrun 的回答,我尝试了一种更复杂的方法,其中涉及非等值连接(non-equi join):
# For every element of a, report whether its "/"-separated tokens overlap
# with the tokens of at least one element of b.
# The result has one value per element of a and does not depend on which
# element of b sits at the same position, i.e. the comparison is not pairwise.
detect <- function(a,b){
  overlaps_some_b <- function(tokens_a) {
    per_b <- sapply(str_split(b, '/'), function(tokens_b) {
      any(tokens_a %in% tokens_b)
    })
    any(per_b)
  }
  sapply(str_split(a, '/'), overlaps_some_b)
}
# Two sample tables; tibble() must be available (e.g. via library(tibble)).
# a1/a2 and c1/c2 hold "/"-separated tokens, b1/b2 hold numeric keys.
a <- tibble(a1 = c("A/B/C", "F/E/G"),
b1 = c(1,2),
c1 = c("OCT/NOV/DEC", "JAN/DEC/AUG"))
b <- tibble(a2 = c("A/B/C", "D/E/F", "G/H/I"),
b2 = c(1,2,3),
c2 = c("JAN/FEB/MAR", "APR/MAY/JUN", "JUL/AUG/SEP"))
# Left join of a onto b with one match function per column pair:
# detect for a1/a2, `==` for b1/b2, and detect for c1/c2.
fuzzyjoin::fuzzy_left_join(a, b, by = c("a1" = "a2",
"b1" = "b2",
"c1" = "c2"),
match_fun = list(detect, `==`, detect))
## Wrong Result:
#> a1 b1 c1 a2 b2 c2
#> <chr> <int> <chr> <chr> <int> <chr>
#> 1 A/B/C 1 OCT/NOV/DEC NA NA NA
#> 2 F/E/G 2 JAN/DEC/AUG D/E/F 2 APR/MAY/JUN
# Row 2: Although a1 and a2 have matching characters and b1 matches b2, c1 and c2 have no matching characters, so the join shouldn't be possible
## Expected:
#> a1 b1 c1 a2 b2 c2
#> <chr> <int> <chr> <chr> <int> <chr>
#> 1 A/B/C 1 OCT/NOV/DEC NA NA NA
#> 2 F/E/G 2 JAN/DEC/AUG NA NA NA
也许我误解了这个函数中的某些内容?
我们可以使用嵌套的 sapply 和 any:
# For each element of list_x: TRUE if it shares at least one value with
# any element of list_y.
sapply(list_x, function(x) any(sapply(list_y, function(y) any(x %in% y))))
#[1] FALSE TRUE
对于更新后的数据,如果我们将外层的 any 更改为 all,它将给出预期的输出:
# Pairwise overlap test for two vectors of "/"-separated tokens.
#
# a, b: vectors of strings; a[i] is compared with b[i], and mapply()
#       recycles the shorter input.
# Returns an unnamed logical vector that is TRUE where the paired elements
# share at least one token.
#
# fuzzyjoin calls a match_fun with two equal-length vectors of candidate
# pairs, so the test has to be pairwise. Using all() across every element of
# b instead reproduces the expected output for the sample data only by
# coincidence: it is FALSE for every pair there, and it would also reject a
# genuine match whenever some other element of b does not overlap.
detect <- function(a,b){
  tokens_a <- strsplit(as.character(a), '/', fixed = TRUE)
  tokens_b <- strsplit(as.character(b), '/', fixed = TRUE)
  hits <- mapply(function(x, y) any(x %in% y), tokens_a, tokens_b,
                 USE.NAMES = FALSE)
  # mapply() returns an empty list for zero-length input; coerce so the
  # result is always a logical vector.
  as.logical(hits)
}
# Left join of a onto b using the redefined detect for a1/a2 and c1/c2,
# and `==` for b1/b2.
fuzzyjoin::fuzzy_left_join(a, b, by = c("a1" = "a2",
"b1" = "b2",
"c1" = "c2"),
match_fun = list(detect, `==`, detect))
# Neither row of a finds a match in b, so all columns coming from b are NA.
# A tibble: 2 x 6
# a1 b1 c1 a2 b2 c2
# <chr> <dbl> <chr> <chr> <dbl> <chr>
#1 A/B/C 1 OCT/NOV/DEC <NA> NA <NA>
#2 F/E/G 2 JAN/DEC/AUG <NA> NA <NA>
我正在尝试查找字符向量的一部分是否与另一个字符向量的一部分重叠
# Example inputs: every element is a "/"-separated string of month abbreviations.
# x has two elements and y has three, so the vectors differ in length.
x <- c("OCT/NOV/DEC", "JAN/DEC/AUG")
y <- c("JAN/FEB/MAR", "APR/MAY/JUN", "JUL/AUG/SEP")
# Months should be split into separate characters
So I would use:
# Split each element of x on "/" into a character vector of months.
list_x <- strsplit(x, '/')
list_x
#> [[1]]
#> [1] "OCT" "NOV" "DEC"
#>
#> [[2]]
#> [1] "JAN" "DEC" "AUG"
# Do the same for y, which yields three month vectors.
list_y <- strsplit(y, '/')
list_y
#> [[1]]
#> [1] "JAN" "FEB" "MAR"
#>
#> [[2]]
#> [1] "APR" "MAY" "JUN"
#>
#> [[3]]
#> [1] "JUL" "AUG" "SEP"
正如我们所见,list_x[[1]] 中没有任何元素出现在 list_y 中,因此应返回 FALSE;
list_x[[2]] 含有 “JAN” 和 “AUG”,它们分别出现在 list_y[[1]] 和 list_y[[3]] 中,因此应返回 TRUE。
# Expected result: one logical value per element of x,
# TRUE when that element shares a month with any element of y.
c(FALSE, TRUE) # for each of x elements
# I tried:
# Position-by-position overlap test between two vectors of "/"-separated
# strings: element i of x is compared with element i of y only.
# mapply() recycles the shorter input, and warns when the longer length is
# not a multiple of the shorter one.
detect <- function(x, y){
  tokens_x <- strsplit(x, '/')
  tokens_y <- strsplit(y, '/')
  # TRUE when the two token vectors have at least one value in common.
  shares_token <- function(left, right) {
    any(left %in% right)
  }
  mapply(shares_token, tokens_x, tokens_y)
}
# x has 2 elements and y has 3, so the pairwise mapply() inside detect recycles x.
detect(x,y)
# This emits a warning that the longer argument is not a multiple of the
# length of the shorter one, and returns:
#> [1] FALSE FALSE FALSE
那么如何判断y元素中是否有x元素?
编辑:根据 Akrun 的回答,我尝试了一种更复杂的方法,其中涉及非等值连接(non-equi join):
# For every element of a, report whether its "/"-separated tokens overlap
# with the tokens of at least one element of b.
# The result has one value per element of a and does not depend on which
# element of b sits at the same position, i.e. the comparison is not pairwise.
detect <- function(a,b){
  overlaps_some_b <- function(tokens_a) {
    per_b <- sapply(str_split(b, '/'), function(tokens_b) {
      any(tokens_a %in% tokens_b)
    })
    any(per_b)
  }
  sapply(str_split(a, '/'), overlaps_some_b)
}
# Two sample tables; tibble() must be available (e.g. via library(tibble)).
# a1/a2 and c1/c2 hold "/"-separated tokens, b1/b2 hold numeric keys.
a <- tibble(a1 = c("A/B/C", "F/E/G"),
b1 = c(1,2),
c1 = c("OCT/NOV/DEC", "JAN/DEC/AUG"))
b <- tibble(a2 = c("A/B/C", "D/E/F", "G/H/I"),
b2 = c(1,2,3),
c2 = c("JAN/FEB/MAR", "APR/MAY/JUN", "JUL/AUG/SEP"))
# Left join of a onto b with one match function per column pair:
# detect for a1/a2, `==` for b1/b2, and detect for c1/c2.
fuzzyjoin::fuzzy_left_join(a, b, by = c("a1" = "a2",
"b1" = "b2",
"c1" = "c2"),
match_fun = list(detect, `==`, detect))
## Wrong Result:
#> a1 b1 c1 a2 b2 c2
#> <chr> <int> <chr> <chr> <int> <chr>
#> 1 A/B/C 1 OCT/NOV/DEC NA NA NA
#> 2 F/E/G 2 JAN/DEC/AUG D/E/F 2 APR/MAY/JUN
# Row 2: Although a1 and a2 have matching characters and b1 matches b2, c1 and c2 have no matching characters, so the join shouldn't be possible
## Expected:
#> a1 b1 c1 a2 b2 c2
#> <chr> <int> <chr> <chr> <int> <chr>
#> 1 A/B/C 1 OCT/NOV/DEC NA NA NA
#> 2 F/E/G 2 JAN/DEC/AUG NA NA NA
也许我误解了这个函数中的某些内容?
我们可以使用嵌套的 sapply 和 any:
# For each element of list_x: TRUE if it shares at least one value with
# any element of list_y.
sapply(list_x, function(x) any(sapply(list_y, function(y) any(x %in% y))))
#[1] FALSE TRUE
对于更新后的数据,如果我们将外层的 any 更改为 all,它将给出预期的输出:
# Pairwise overlap test for two vectors of "/"-separated tokens.
#
# a, b: vectors of strings; a[i] is compared with b[i], and mapply()
#       recycles the shorter input.
# Returns an unnamed logical vector that is TRUE where the paired elements
# share at least one token.
#
# fuzzyjoin calls a match_fun with two equal-length vectors of candidate
# pairs, so the test has to be pairwise. Using all() across every element of
# b instead reproduces the expected output for the sample data only by
# coincidence: it is FALSE for every pair there, and it would also reject a
# genuine match whenever some other element of b does not overlap.
detect <- function(a,b){
  tokens_a <- strsplit(as.character(a), '/', fixed = TRUE)
  tokens_b <- strsplit(as.character(b), '/', fixed = TRUE)
  hits <- mapply(function(x, y) any(x %in% y), tokens_a, tokens_b,
                 USE.NAMES = FALSE)
  # mapply() returns an empty list for zero-length input; coerce so the
  # result is always a logical vector.
  as.logical(hits)
}
# Left join of a onto b using the redefined detect for a1/a2 and c1/c2,
# and `==` for b1/b2.
fuzzyjoin::fuzzy_left_join(a, b, by = c("a1" = "a2",
"b1" = "b2",
"c1" = "c2"),
match_fun = list(detect, `==`, detect))
# Neither row of a finds a match in b, so all columns coming from b are NA.
# A tibble: 2 x 6
# a1 b1 c1 a2 b2 c2
# <chr> <dbl> <chr> <chr> <dbl> <chr>
#1 A/B/C 1 OCT/NOV/DEC <NA> NA <NA>
#2 F/E/G 2 JAN/DEC/AUG <NA> NA <NA>