在 R 中识别两个字符串中的模式
Identifying patterns in two strings in R
我想评估 ColA 是否包含比 ColB 新的字符串。但是,我对某些类型的字符串不感兴趣,例如 oil。我想要一个指示变量如下:
ColA ColB Ind
-------------------------- ------------------------ -----
coconut+grape+pine grape+coconut TRUE
orange+apple+grape+pine grape+coconut TRUE
grape+pine grape+oil TRUE
oil+grape grape+apple FALSE
grape grape+oil FALSE
grape+pine grape+orange+pine FALSE
有使用 R 的建议吗?
非常感谢!
由于我们需要拆分字符串,我们先从 strsplit 开始:
# Break every ColA entry into its tokens, treating "+" as a literal separator
# (fixed = TRUE turns off regex interpretation).
strsplit(dat[["ColA"]], split = "+", fixed = TRUE)
# [[1]]
# [1] "coconut" "grape" "pine"
# [[2]]
# [1] "orange" "apple" "grape" "pine"
# [[3]]
# [1] "grape" "pine"
# [[4]]
# [1] "oil" "grape"
# [[5]]
# [1] "grape"
# [[6]]
# [1] "grape" "pine"
接下来,我们要确定 ColA 中有哪些词不在 ColB 中。我将使用 Map 对每一对元素运行 setdiff(ColA 的 [[1]] 与 ColB 的 [[1]],依此类推)。
# Pair up the token vectors of ColA and ColB row by row and keep only the
# ColA tokens that are absent from the matching ColB entry.
Map(
  setdiff,
  strsplit(dat[["ColA"]], split = "+", fixed = TRUE),
  strsplit(dat[["ColB"]], split = "+", fixed = TRUE)
)
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# [1] "oil"
# [[5]]
# character(0)
# [[6]]
# character(0)
要确定哪些行含有“新词”,我们可以用 lengths(.) > 0 检查结果的长度是否非零:
# A row is flagged when the per-row set difference (ColA minus ColB) is
# non-empty.
lengths(
  Map(
    setdiff,
    strsplit(dat[["ColA"]], split = "+", fixed = TRUE),
    strsplit(dat[["ColB"]], split = "+", fixed = TRUE)
  )
) > 0
# [1] TRUE TRUE TRUE TRUE FALSE FALSE
但由于您不关心 oil
,我们也需要将其删除。
# Same per-row difference as before, then discard "oil" from every result
# because that token is not of interest.
lapply(
  Map(
    setdiff,
    strsplit(dat[["ColA"]], split = "+", fixed = TRUE),
    strsplit(dat[["ColB"]], split = "+", fixed = TRUE)
  ),
  function(tokens) setdiff(tokens, "oil")
)
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# character(0)
# [[5]]
# character(0)
# [[6]]
# character(0)
# Final indicator: TRUE when ColA has at least one token, other than "oil",
# that does not occur in the matching ColB entry.
lengths(
  lapply(
    Map(
      setdiff,
      strsplit(dat[["ColA"]], split = "+", fixed = TRUE),
      strsplit(dat[["ColB"]], split = "+", fixed = TRUE)
    ),
    function(tokens) setdiff(tokens, "oil")
  )
) > 0
# [1] TRUE TRUE TRUE FALSE FALSE FALSE
@akrun 建议使用 tidyverse 变体:
library(dplyr)
library(purrr) # map2_lgl
library(stringr) # str_extract_all
# Add a logical column `new`: TRUE when ColA holds a token (other than "oil")
# that is missing from the matching ColB entry.
dat %>%
  mutate(
    new = map2_lgl(
      # "\\w+" extracts each run of word characters. The backslash must be
      # doubled inside an R string literal; a single "\w" is an unrecognized
      # escape and the code would not parse.
      str_extract_all(ColB, "\\w+"), str_extract_all(ColA, "\\w+"),
      # .x = tokens of ColB, .y = tokens of ColA
      ~ !all(setdiff(.y, "oil") %in% .x)
    )
  )
# ColA ColB Ind new
# 1 coconut+grape+pine grape+coconut TRUE TRUE
# 2 orange+apple+grape+pine grape+coconut TRUE TRUE
# 3 grape+pine grape+oil TRUE TRUE
# 4 oil+grape grape+apple FALSE FALSE
# 5 grape grape+oil FALSE FALSE
# 6 grape+pine grape+orange+pine FALSE FALSE
数据
# Example data: two "+"-separated token columns plus the expected indicator.
dat <- data.frame(
  ColA = c(
    "coconut+grape+pine", "orange+apple+grape+pine", "grape+pine",
    "oil+grape", "grape", "grape+pine"
  ),
  ColB = c(
    "grape+coconut", "grape+coconut", "grape+oil",
    "grape+apple", "grape+oil", "grape+orange+pine"
  ),
  Ind = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE),
  stringsAsFactors = FALSE
)
这是一个与上述答案类似的解决方案,它借助 do.call,并且只调用一次 strsplit。
# Tokens that must never count as "new". Defined before the helper that
# reads it.
z <- "oil"

# Helper: drop the ignored tokens (`z`) from a character vector.
rid <- function(x) x[!x %in% z]

# Split every column on a literal "+". The backslash has to be doubled inside
# an R string literal; "\+" is an unrecognized escape and does not parse.
# The code below indexes L as a matrix with one row per data row.
L <- sapply(unname(dat), strsplit, "\\+")

# Per row: strip the ignored tokens from both columns, then test whether the
# second column has any token the first one lacks. rev() puts the second
# column first so that setdiff() computes "second minus first".
# seq_len() is safe when there are no rows; vapply() pins the result type.
dat$ind <- vapply(
  seq_len(nrow(L)),
  function(x) length(do.call(setdiff, rev(Map(rid, L[x, ])))),
  integer(1)
) > 0
dat
# V1 V2 ind
# 1 grape+coconut coconut+grape+pine TRUE
# 2 grape+coconut orange+apple+grape+pine TRUE
# 3 grape+oil grape+pine TRUE
# 4 grape+apple oil+grape FALSE
# 5 grape+oil grape FALSE
# 6 grape+orange+pine grape+pine FALSE
数据:
# Example data for this answer: V1 is the reference column, V2 is the column
# checked for new tokens.
dat <- data.frame(
  V1 = c(
    "grape+coconut", "grape+coconut", "grape+oil",
    "grape+apple", "grape+oil", "grape+orange+pine"
  ),
  V2 = c(
    "coconut+grape+pine", "orange+apple+grape+pine", "grape+pine",
    "oil+grape", "grape", "grape+pine"
  ),
  stringsAsFactors = FALSE
)
我想评估 ColA 是否包含比 ColB 新的字符串。但是,我对某些类型的字符串不感兴趣,例如 oil。我想要一个指示变量如下:
ColA ColB Ind
-------------------------- ------------------------ -----
coconut+grape+pine grape+coconut TRUE
orange+apple+grape+pine grape+coconut TRUE
grape+pine grape+oil TRUE
oil+grape grape+apple FALSE
grape grape+oil FALSE
grape+pine grape+orange+pine FALSE
有使用 R 的建议吗?
非常感谢!
由于我们需要拆分字符串,我们先从 strsplit 开始:
# Break every ColA entry into its tokens, treating "+" as a literal separator
# (fixed = TRUE turns off regex interpretation).
strsplit(dat[["ColA"]], split = "+", fixed = TRUE)
# [[1]]
# [1] "coconut" "grape" "pine"
# [[2]]
# [1] "orange" "apple" "grape" "pine"
# [[3]]
# [1] "grape" "pine"
# [[4]]
# [1] "oil" "grape"
# [[5]]
# [1] "grape"
# [[6]]
# [1] "grape" "pine"
接下来,我们要确定 ColA 中有哪些词不在 ColB 中。我将使用 Map 对每一对元素运行 setdiff(ColA 的 [[1]] 与 ColB 的 [[1]],依此类推)。
# Pair up the token vectors of ColA and ColB row by row and keep only the
# ColA tokens that are absent from the matching ColB entry.
Map(
  setdiff,
  strsplit(dat[["ColA"]], split = "+", fixed = TRUE),
  strsplit(dat[["ColB"]], split = "+", fixed = TRUE)
)
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# [1] "oil"
# [[5]]
# character(0)
# [[6]]
# character(0)
要确定哪些行含有“新词”,我们可以用 lengths(.) > 0 检查结果的长度是否非零:
# A row is flagged when the per-row set difference (ColA minus ColB) is
# non-empty.
lengths(
  Map(
    setdiff,
    strsplit(dat[["ColA"]], split = "+", fixed = TRUE),
    strsplit(dat[["ColB"]], split = "+", fixed = TRUE)
  )
) > 0
# [1] TRUE TRUE TRUE TRUE FALSE FALSE
但由于您不关心 oil
,我们也需要将其删除。
# Same per-row difference as before, then discard "oil" from every result
# because that token is not of interest.
lapply(
  Map(
    setdiff,
    strsplit(dat[["ColA"]], split = "+", fixed = TRUE),
    strsplit(dat[["ColB"]], split = "+", fixed = TRUE)
  ),
  function(tokens) setdiff(tokens, "oil")
)
# [[1]]
# [1] "pine"
# [[2]]
# [1] "orange" "apple" "pine"
# [[3]]
# [1] "pine"
# [[4]]
# character(0)
# [[5]]
# character(0)
# [[6]]
# character(0)
# Final indicator: TRUE when ColA has at least one token, other than "oil",
# that does not occur in the matching ColB entry.
lengths(
  lapply(
    Map(
      setdiff,
      strsplit(dat[["ColA"]], split = "+", fixed = TRUE),
      strsplit(dat[["ColB"]], split = "+", fixed = TRUE)
    ),
    function(tokens) setdiff(tokens, "oil")
  )
) > 0
# [1] TRUE TRUE TRUE FALSE FALSE FALSE
@akrun 建议使用 tidyverse 变体:
library(dplyr)
library(purrr) # map2_lgl
library(stringr) # str_extract_all
# Add a logical column `new`: TRUE when ColA holds a token (other than "oil")
# that is missing from the matching ColB entry.
dat %>%
  mutate(
    new = map2_lgl(
      # "\\w+" extracts each run of word characters. The backslash must be
      # doubled inside an R string literal; a single "\w" is an unrecognized
      # escape and the code would not parse.
      str_extract_all(ColB, "\\w+"), str_extract_all(ColA, "\\w+"),
      # .x = tokens of ColB, .y = tokens of ColA
      ~ !all(setdiff(.y, "oil") %in% .x)
    )
  )
# ColA ColB Ind new
# 1 coconut+grape+pine grape+coconut TRUE TRUE
# 2 orange+apple+grape+pine grape+coconut TRUE TRUE
# 3 grape+pine grape+oil TRUE TRUE
# 4 oil+grape grape+apple FALSE FALSE
# 5 grape grape+oil FALSE FALSE
# 6 grape+pine grape+orange+pine FALSE FALSE
数据
# Example data: two "+"-separated token columns plus the expected indicator.
dat <- data.frame(
  ColA = c(
    "coconut+grape+pine", "orange+apple+grape+pine", "grape+pine",
    "oil+grape", "grape", "grape+pine"
  ),
  ColB = c(
    "grape+coconut", "grape+coconut", "grape+oil",
    "grape+apple", "grape+oil", "grape+orange+pine"
  ),
  Ind = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE),
  stringsAsFactors = FALSE
)
这是一个与上述答案类似的解决方案,它借助 do.call,并且只调用一次 strsplit。
# Tokens that must never count as "new". Defined before the helper that
# reads it.
z <- "oil"

# Helper: drop the ignored tokens (`z`) from a character vector.
rid <- function(x) x[!x %in% z]

# Split every column on a literal "+". The backslash has to be doubled inside
# an R string literal; "\+" is an unrecognized escape and does not parse.
# The code below indexes L as a matrix with one row per data row.
L <- sapply(unname(dat), strsplit, "\\+")

# Per row: strip the ignored tokens from both columns, then test whether the
# second column has any token the first one lacks. rev() puts the second
# column first so that setdiff() computes "second minus first".
# seq_len() is safe when there are no rows; vapply() pins the result type.
dat$ind <- vapply(
  seq_len(nrow(L)),
  function(x) length(do.call(setdiff, rev(Map(rid, L[x, ])))),
  integer(1)
) > 0
dat
# V1 V2 ind
# 1 grape+coconut coconut+grape+pine TRUE
# 2 grape+coconut orange+apple+grape+pine TRUE
# 3 grape+oil grape+pine TRUE
# 4 grape+apple oil+grape FALSE
# 5 grape+oil grape FALSE
# 6 grape+orange+pine grape+pine FALSE
数据:
# Example data for this answer: V1 is the reference column, V2 is the column
# checked for new tokens.
dat <- data.frame(
  V1 = c(
    "grape+coconut", "grape+coconut", "grape+oil",
    "grape+apple", "grape+oil", "grape+orange+pine"
  ),
  V2 = c(
    "coconut+grape+pine", "orange+apple+grape+pine", "grape+pine",
    "oil+grape", "grape", "grape+pine"
  ),
  stringsAsFactors = FALSE
)