查找字符串中重叠的长度
Find length of overlap in strings
你知道有什么现成的方法可以获取两个字符串的重叠部分及其长度吗?只限于 R,也许来自 stringr?我在这里查找过,可惜没有找到。
str1 <- 'ABCDE'
str2 <- 'CDEFG'
str_overlap(str1, str2)
'CDE'
str_overlap_len(str1, str2)
3
其他示例:
str1 <- 'ATTAGACCTG'
str2 <- 'CCTGCCGGAA'
str_overlap(str1, str2)
'CCTG'
str_overlap_len(str1, str2)
4
///
str1 <- 'foobarandfoo'
str2 <- 'barand'
str_overlap(str1, str2)
'barand'
str_overlap_len(str1, str2)
6
/// 是的,这里有两个解(ABCDE 和 CDE),总是选取较长的重叠部分
str1 <- 'EFGABCDE'
str2 <- 'ABCDECDE'
str_overlap(str1, str2)
'ABCDE'
str_overlap_len(str1, str2)
5
我在想是否需要为此自己写一个小函数,例如这个(this one)?
希望对您有所帮助:
library(stringr)
#' Longest character combination(s) of one string found inside another
#'
#' Builds every order-preserving combination of the characters of the first
#' string (the characters need not be adjacent in it), from length 1 upwards,
#' and looks each one up as a literal substring of the second string. The
#' matches for the greatest length that still produced a match are returned.
#'
#' @param x A character vector; `x[1]` supplies the candidates and `x[2]` is
#'   the string that is searched.
#' @return A character vector with the matches of the longest matching
#'   candidate length (one entry per occurrence), or `character(0)` when the
#'   two strings have no character in common.
larsub <- function(x) {
  a <- x[1]
  b <- x[2]
  chars <- strsplit(a, "")[[1]]

  # Matches found for the previous (shorter) length. Starting from an empty
  # vector makes the "nothing in common" case return cleanly.
  prior <- character(0)

  # seq_along() yields no iterations for an empty `a`.
  for (n in seq_along(chars)) {
    # All distinct n-character combinations of `a`, in original order.
    sb <- unique(combn(chars, n, FUN = paste, collapse = ""))

    # Literal (non-regex) lookup so characters such as "." or "(" in `a`
    # are matched as themselves.
    hits <- unlist(lapply(sb, function(p) {
      regmatches(b, gregexpr(p, b, fixed = TRUE))[[1]]
    }))

    if (length(hits) == 0) {
      return(prior)
    }
    prior <- hits
  }

  # Every length up to nchar(a) matched: the last set of matches is the answer.
  prior
}
# Run larsub() on four sample pairs; each call receives c(first, second).
c1 <- larsub(c("ABCD", "BCDE"))
c2 <- larsub(c("ABDFD", "BCDE"))
c3 <- larsub(c("CDEWQ", "DEQ"))
c4 <- larsub(c("BNEOYJBELMGY", "BELM"))

# Show the four results in the same order they were computed.
print(c1)
print(c2)
print(c3)
print(c4)
输出:
> print(c1)
[1] "BCD"
> print(c2)
[1] "B" "D"
> print(c3)
[1] "DEQ"
> print(c4)
[1] "BELM"
免责声明:此逻辑借用自 @Rick Scriven 发布的 LCS 回答:longest common substring in R finding non-contiguous matches between the two strings。
在我看来,您 (OP) 并不太关心代码的性能,而是更想了解在没有现成函数的情况下解决这个问题的可能方法。所以下面是我想出的一个计算最长公共子串的示例。需要注意的是,它只返回找到的第一个最长公共子串,即使可能存在多个长度相同的子串。您可以根据自己的需要修改它。请不要指望它会很快——它并不快。
#' Print (and return) the first longest common substring of two strings
#'
#' Every contiguous substring of the shorter string is tried against the
#' longer string, longest first and, within one length, left to right. The
#' first candidate that occurs literally in the longer string is reported.
#'
#' @param str1,str2 Single character strings to compare.
#' @param ignore.case If `TRUE`, both strings are lower-cased before the
#'   comparison, so the reported substring is lower-case too.
#' @param verbose If `TRUE`, print each candidate before it is checked.
#' @return Invisibly, the substring that was found, or `""` when the strings
#'   share no character. When a substring is found, a message naming it and
#'   its length is printed with `cat()`.
foo <- function(str1, str2, ignore.case = FALSE, verbose = FALSE) {
  if (ignore.case) {
    str1 <- tolower(str1)
    str2 <- tolower(str2)
  }

  # Enumerate candidates from the shorter string and search the longer one.
  if (nchar(str1) < nchar(str2)) {
    tmp <- str2
    str2 <- str1
    str1 <- tmp
  }

  n <- nchar(str2)

  # Longest candidates first, so the first hit is a longest common substring.
  # rev(seq_len(n)) is empty for n == 0 and includes length 1 down to the
  # very last character, which the previous `i < len_s` loop skipped.
  for (len in rev(seq_len(n))) {
    starts <- seq_len(n - len + 1L)
    candidates <- substring(str2, starts, starts + len - 1L)

    for (lcs in candidates) {
      if (verbose) cat("now checking:", lcs, "\n")
      if (grepl(lcs, str1, fixed = TRUE)) {
        cat(
          "the (first) longest common substring is:", lcs,
          "of length", nchar(lcs), "\n"
        )
        return(invisible(lcs))
      }
    }
  }

  invisible("")
}
# Example 1: the shared part ends one string and starts the other.
str1 <- "ABCDE"
str2 <- "CDEFG"
foo(str1, str2)
# the (first) longest common substring is: CDE of length 3

# Example 2: same situation with longer strings.
str1 <- "ATTAGACCTG"
str2 <- "CCTGCCGGAA"
foo(str1, str2)
# the (first) longest common substring is: CCTG of length 4

# Example 3: the second string is fully contained in the first.
str1 <- "foobarandfoo"
str2 <- "barand"
foo(str1, str2)
# the (first) longest common substring is: barand of length 6

# Example 4: two common substrings exist (ABCDE and CDE).
str1 <- "EFGABCDE"
str2 <- "ABCDECDE"
foo(str1, str2)
# the (first) longest common substring is: ABCDE of length 5

# Example 5: random strings of 500 and 250 letters, with and without case.
# NOTE(review): the results below presumably come from the R version the
# author used; sample() output for a given seed may differ on R >= 3.6.
set.seed(2018)
str1 <- paste(
  sample(c(LETTERS, letters), size = 500, replace = TRUE),
  collapse = ""
)
str2 <- paste(
  sample(c(LETTERS, letters), size = 250, replace = TRUE),
  collapse = ""
)
foo(str1, str2, ignore.case = TRUE)
# the (first) longest common substring is: oba of length 3
foo(str1, str2, ignore.case = FALSE)
# the (first) longest common substring is: Vh of length 2
你知道有什么现成的方法可以获取两个字符串的重叠部分及其长度吗?只限于 R,也许来自 stringr?我在这里查找过,可惜没有找到。
str1 <- 'ABCDE'
str2 <- 'CDEFG'
str_overlap(str1, str2)
'CDE'
str_overlap_len(str1, str2)
3
其他示例:
str1 <- 'ATTAGACCTG'
str2 <- 'CCTGCCGGAA'
str_overlap(str1, str2)
'CCTG'
str_overlap_len(str1, str2)
4
///
str1 <- 'foobarandfoo'
str2 <- 'barand'
str_overlap(str1, str2)
'barand'
str_overlap_len(str1, str2)
6
/// 是的,这里有两个解(ABCDE 和 CDE),总是选取较长的重叠部分
str1 <- 'EFGABCDE'
str2 <- 'ABCDECDE'
str_overlap(str1, str2)
'ABCDE'
str_overlap_len(str1, str2)
5
我在想是否需要为此自己写一个小函数,例如这个(this one)?
希望对您有所帮助:
library(stringr)
#' Longest character combination(s) of one string found inside another
#'
#' Builds every order-preserving combination of the characters of the first
#' string (the characters need not be adjacent in it), from length 1 upwards,
#' and looks each one up as a literal substring of the second string. The
#' matches for the greatest length that still produced a match are returned.
#'
#' @param x A character vector; `x[1]` supplies the candidates and `x[2]` is
#'   the string that is searched.
#' @return A character vector with the matches of the longest matching
#'   candidate length (one entry per occurrence), or `character(0)` when the
#'   two strings have no character in common.
larsub <- function(x) {
  a <- x[1]
  b <- x[2]
  chars <- strsplit(a, "")[[1]]

  # Matches found for the previous (shorter) length. Starting from an empty
  # vector makes the "nothing in common" case return cleanly.
  prior <- character(0)

  # seq_along() yields no iterations for an empty `a`.
  for (n in seq_along(chars)) {
    # All distinct n-character combinations of `a`, in original order.
    sb <- unique(combn(chars, n, FUN = paste, collapse = ""))

    # Literal (non-regex) lookup so characters such as "." or "(" in `a`
    # are matched as themselves.
    hits <- unlist(lapply(sb, function(p) {
      regmatches(b, gregexpr(p, b, fixed = TRUE))[[1]]
    }))

    if (length(hits) == 0) {
      return(prior)
    }
    prior <- hits
  }

  # Every length up to nchar(a) matched: the last set of matches is the answer.
  prior
}
# Run larsub() on four sample pairs; each call receives c(first, second).
c1 <- larsub(c("ABCD", "BCDE"))
c2 <- larsub(c("ABDFD", "BCDE"))
c3 <- larsub(c("CDEWQ", "DEQ"))
c4 <- larsub(c("BNEOYJBELMGY", "BELM"))

# Show the four results in the same order they were computed.
print(c1)
print(c2)
print(c3)
print(c4)
输出:
> print(c1)
[1] "BCD"
> print(c2)
[1] "B" "D"
> print(c3)
[1] "DEQ"
> print(c4)
[1] "BELM"
免责声明:此逻辑借用自 @Rick Scriven 发布的 LCS 回答:longest common substring in R finding non-contiguous matches between the two strings。
在我看来,您 (OP) 并不太关心代码的性能,而是更想了解在没有现成函数的情况下解决这个问题的可能方法。所以下面是我想出的一个计算最长公共子串的示例。需要注意的是,它只返回找到的第一个最长公共子串,即使可能存在多个长度相同的子串。您可以根据自己的需要修改它。请不要指望它会很快——它并不快。
#' Print (and return) the first longest common substring of two strings
#'
#' Every contiguous substring of the shorter string is tried against the
#' longer string, longest first and, within one length, left to right. The
#' first candidate that occurs literally in the longer string is reported.
#'
#' @param str1,str2 Single character strings to compare.
#' @param ignore.case If `TRUE`, both strings are lower-cased before the
#'   comparison, so the reported substring is lower-case too.
#' @param verbose If `TRUE`, print each candidate before it is checked.
#' @return Invisibly, the substring that was found, or `""` when the strings
#'   share no character. When a substring is found, a message naming it and
#'   its length is printed with `cat()`.
foo <- function(str1, str2, ignore.case = FALSE, verbose = FALSE) {
  if (ignore.case) {
    str1 <- tolower(str1)
    str2 <- tolower(str2)
  }

  # Enumerate candidates from the shorter string and search the longer one.
  if (nchar(str1) < nchar(str2)) {
    tmp <- str2
    str2 <- str1
    str1 <- tmp
  }

  n <- nchar(str2)

  # Longest candidates first, so the first hit is a longest common substring.
  # rev(seq_len(n)) is empty for n == 0 and includes length 1 down to the
  # very last character, which the previous `i < len_s` loop skipped.
  for (len in rev(seq_len(n))) {
    starts <- seq_len(n - len + 1L)
    candidates <- substring(str2, starts, starts + len - 1L)

    for (lcs in candidates) {
      if (verbose) cat("now checking:", lcs, "\n")
      if (grepl(lcs, str1, fixed = TRUE)) {
        cat(
          "the (first) longest common substring is:", lcs,
          "of length", nchar(lcs), "\n"
        )
        return(invisible(lcs))
      }
    }
  }

  invisible("")
}
# Example 1: the shared part ends one string and starts the other.
str1 <- "ABCDE"
str2 <- "CDEFG"
foo(str1, str2)
# the (first) longest common substring is: CDE of length 3

# Example 2: same situation with longer strings.
str1 <- "ATTAGACCTG"
str2 <- "CCTGCCGGAA"
foo(str1, str2)
# the (first) longest common substring is: CCTG of length 4

# Example 3: the second string is fully contained in the first.
str1 <- "foobarandfoo"
str2 <- "barand"
foo(str1, str2)
# the (first) longest common substring is: barand of length 6

# Example 4: two common substrings exist (ABCDE and CDE).
str1 <- "EFGABCDE"
str2 <- "ABCDECDE"
foo(str1, str2)
# the (first) longest common substring is: ABCDE of length 5

# Example 5: random strings of 500 and 250 letters, with and without case.
# NOTE(review): the results below presumably come from the R version the
# author used; sample() output for a given seed may differ on R >= 3.6.
set.seed(2018)
str1 <- paste(
  sample(c(LETTERS, letters), size = 500, replace = TRUE),
  collapse = ""
)
str2 <- paste(
  sample(c(LETTERS, letters), size = 250, replace = TRUE),
  collapse = ""
)
foo(str1, str2, ignore.case = TRUE)
# the (first) longest common substring is: oba of length 3
foo(str1, str2, ignore.case = FALSE)
# the (first) longest common substring is: Vh of length 2