如何根据其他列中的字符在字符串中找到匹配项?
How to find a match within a string based on character from other column?
我有这些数据:
# Example data: `ca` holds comma-separated "X=Y" letter pairs and `ea` holds
# the letter to look up in those pairs.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  stringsAsFactors = FALSE
)
我想创建一个名为“match”的新列,给出“ca”列中与“ea”列的字母通过“=”配对的那个字母。所以我的输出看起来像这样:
# Desired result: the example data plus a `match` column holding, for each
# row, the letter that is paired with `ea` inside `ca`.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  match = c("T", "C"),
  stringsAsFactors = FALSE
)
这很棘手,因为在第一行中,我要匹配的字母位于“=”之后,而在第二行中,它位于“=”之前。
我相信这对你有用:
# Rebuild the example data from the question.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  stringsAsFactors = FALSE
)

# Return the letter paired with `ea` in row `x` of the global `df1`.
#
# x: a single row index into `df1`.
# Returns a character vector with one element per "<letter>=<ea>" or
# "<ea>=<letter>" pair found in `df1$ca[x]`, or NA_character_ when the row
# contains no such pair.
my_f <- function(x) {
  ea <- df1[x, "ea"]
  ca <- df1[x, "ca"]
  # `ea` may sit on either side of "=", so accept both orders.
  pair_pattern <- paste0("[ACGT]=", ea, "|", ea, "=[ACGT]")
  # Base-R extraction keeps this snippet runnable without loading stringr.
  pairs <- regmatches(ca, gregexpr(pair_pattern, ca))[[1]]
  if (length(pairs) == 0L) {
    return(NA_character_)
  }
  # Dropping `ea` and "=" from each pair leaves only the partner letter.
  gsub(pattern = paste0(ea, "|="), replacement = "", x = pairs)
}

# vapply() gives a plain character column (lapply() produced a list column)
# and raises an error if a row yields more than one pair; seq_len() stays
# empty for a zero-row data frame, unlike 1:nrow().
df1$match <- vapply(seq_len(nrow(df1)), my_f, character(1))
library(tidyverse)
# Rebuild the example data from the question.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  stringsAsFactors = FALSE
)
df1
#>        ca ea
#> 1 A=C,T=G  G
#> 2 T=C,G=G  T

# For each row: split `ca` into its "X=Y" pairs, keep the pair that mentions
# `ea`, then strip "=" and `ea` so only the partner letter is left.
# map2_chr() needs exactly one string per row, so it errors when a row has
# no pair (or several pairs) containing `ea`.
df1 %>%
  mutate(
    match = ea %>% map2_chr(ca, function(ea, ca) {
      ca %>%
        str_split(",") %>%
        # str_split() returns a one-element list here; unlist() flattens it
        # (purrr::simplify() is deprecated since purrr 1.0.0).
        unlist() %>%
        keep(~ str_detect(.x, ea)) %>%
        # "|" is a literal inside a character class, so it is left out.
        str_remove_all(str_glue("[={ea}]"))
    })
  )
#>        ca ea match
#> 1 A=C,T=G  G     T
#> 2 T=C,G=G  T     C
由 reprex package (v2.0.1) 于 2021-12-08 创建
这是另一个 tidyverse 解决方案,使用正则表达式可能更简单一些。您需要 R 4.1 或更高版本才能使用原生管道运算符 |>;如果版本较低,只需将其替换为 %>%。
library(tidyverse)
df1 |>
  # Add a `match` column holding the single letter that is paired with `ea`
  # inside `ca`. The pattern is built per row from that row's `ea`.
  mutate(match = str_extract(
    ca,
    paste0(
      # the letter preceded by "<ea>=" (lookbehind); "=" is not a regex
      # metacharacter, so it needs no backslash ("\=" is not even a valid
      # escape in an R string literal)
      "(?<=", ea, "=)[A-Z]",
      # or
      "|",
      # the letter followed by "=<ea>" (lookahead)
      "[A-Z](?==", ea, ")"
    )
  ))
#        ca ea match
# 1 A=C,T=G  G     T
# 2 T=C,G=G  T     C
这正是使用分支重置组(branch reset group)的好机会:
# Rebuild the example data, including the expected `match` column.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  match = c("T", "C"),
  stringsAsFactors = FALSE
)

# For every row, reduce `ca` to the letter paired with `ea`.
# The branch reset group `(?|...)` makes both alternatives capture into
# group 1, whichever side of "=" the partner letter is on. The trailing `|.`
# consumes every other character with group 1 unset, so those characters are
# replaced by the empty string.
matched <- mapply(
  function(p, x) {
    # "\\1" is the backreference to group 1; a bare "\1" in an R string is
    # the control character U+0001, not a backreference.
    gsub(sprintf("(?|%s=(.)|(.)=%s)|.", p, p), "\\1", x, perl = TRUE)
  },
  df1$ea, df1$ca,
  USE.NAMES = FALSE
)
matched
# [1] "T" "C"
我有这些数据:
# Example data: `ca` holds comma-separated "X=Y" letter pairs and `ea` holds
# the letter to look up in those pairs.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  stringsAsFactors = FALSE
)
我想创建一个名为“match”的新列,给出“ca”列中与“ea”列的字母通过“=”配对的那个字母。所以我的输出看起来像这样:
# Desired result: the example data plus a `match` column holding, for each
# row, the letter that is paired with `ea` inside `ca`.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  match = c("T", "C"),
  stringsAsFactors = FALSE
)
这很棘手,因为在第一行中,我要匹配的字母位于“=”之后,而在第二行中,它位于“=”之前。
我相信这对你有用:
# Rebuild the example data from the question.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  stringsAsFactors = FALSE
)

# Return the letter paired with `ea` in row `x` of the global `df1`.
#
# x: a single row index into `df1`.
# Returns a character vector with one element per "<letter>=<ea>" or
# "<ea>=<letter>" pair found in `df1$ca[x]`, or NA_character_ when the row
# contains no such pair.
my_f <- function(x) {
  ea <- df1[x, "ea"]
  ca <- df1[x, "ca"]
  # `ea` may sit on either side of "=", so accept both orders.
  pair_pattern <- paste0("[ACGT]=", ea, "|", ea, "=[ACGT]")
  # Base-R extraction keeps this snippet runnable without loading stringr.
  pairs <- regmatches(ca, gregexpr(pair_pattern, ca))[[1]]
  if (length(pairs) == 0L) {
    return(NA_character_)
  }
  # Dropping `ea` and "=" from each pair leaves only the partner letter.
  gsub(pattern = paste0(ea, "|="), replacement = "", x = pairs)
}

# vapply() gives a plain character column (lapply() produced a list column)
# and raises an error if a row yields more than one pair; seq_len() stays
# empty for a zero-row data frame, unlike 1:nrow().
df1$match <- vapply(seq_len(nrow(df1)), my_f, character(1))
library(tidyverse)
# Rebuild the example data from the question.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  stringsAsFactors = FALSE
)
df1
#>        ca ea
#> 1 A=C,T=G  G
#> 2 T=C,G=G  T

# For each row: split `ca` into its "X=Y" pairs, keep the pair that mentions
# `ea`, then strip "=" and `ea` so only the partner letter is left.
# map2_chr() needs exactly one string per row, so it errors when a row has
# no pair (or several pairs) containing `ea`.
df1 %>%
  mutate(
    match = ea %>% map2_chr(ca, function(ea, ca) {
      ca %>%
        str_split(",") %>%
        # str_split() returns a one-element list here; unlist() flattens it
        # (purrr::simplify() is deprecated since purrr 1.0.0).
        unlist() %>%
        keep(~ str_detect(.x, ea)) %>%
        # "|" is a literal inside a character class, so it is left out.
        str_remove_all(str_glue("[={ea}]"))
    })
  )
#>        ca ea match
#> 1 A=C,T=G  G     T
#> 2 T=C,G=G  T     C
由 reprex package (v2.0.1) 于 2021-12-08 创建
这是另一个 tidyverse 解决方案,使用正则表达式可能更简单一些。您需要 R 4.1 或更高版本才能使用原生管道运算符 |>;如果版本较低,只需将其替换为 %>%。
library(tidyverse)
df1 |>
  # Add a `match` column holding the single letter that is paired with `ea`
  # inside `ca`. The pattern is built per row from that row's `ea`.
  mutate(match = str_extract(
    ca,
    paste0(
      # the letter preceded by "<ea>=" (lookbehind); "=" is not a regex
      # metacharacter, so it needs no backslash ("\=" is not even a valid
      # escape in an R string literal)
      "(?<=", ea, "=)[A-Z]",
      # or
      "|",
      # the letter followed by "=<ea>" (lookahead)
      "[A-Z](?==", ea, ")"
    )
  ))
#        ca ea match
# 1 A=C,T=G  G     T
# 2 T=C,G=G  T     C
这正是使用分支重置组(branch reset group)的好机会:
# Rebuild the example data, including the expected `match` column.
df1 <- data.frame(
  ca = c("A=C,T=G", "T=C,G=G"),
  ea = c("G", "T"),
  match = c("T", "C"),
  stringsAsFactors = FALSE
)

# For every row, reduce `ca` to the letter paired with `ea`.
# The branch reset group `(?|...)` makes both alternatives capture into
# group 1, whichever side of "=" the partner letter is on. The trailing `|.`
# consumes every other character with group 1 unset, so those characters are
# replaced by the empty string.
matched <- mapply(
  function(p, x) {
    # "\\1" is the backreference to group 1; a bare "\1" in an R string is
    # the control character U+0001, not a backreference.
    gsub(sprintf("(?|%s=(.)|(.)=%s)|.", p, p), "\\1", x, perl = TRUE)
  },
  df1$ea, df1$ca,
  USE.NAMES = FALSE
)
matched
# [1] "T" "C"