Python: 查找子字符串是否存在于给定条件的字符串中

Python: Find If Substring Exists in String Given Condition

我正在尝试针对接受 2 个参数的函数优化此解决方案:fullstringsubstring。如果子字符串存在于完整字符串中,函数将 return True,如果不存在,则函数将 False。子串中可以输入一个特殊的通配符,表示前一个符号的0或1,子串中可以有多个通配符。

例如"a*"表示"""a"

我的解决方案工作正常,但我正在尝试减少 for 循环 (3) 的数量并优化时间复杂度。不允许使用正则表达式。有没有更pythonic的方法来做到这一点?

当前解决方案:

def complex_search(fullstring, substring):
    patterns = []
    if "*" in substring:
        index = substring.index("*")
        patterns.append(substring[:index-1] + substring[index+1:])
        patterns.append(substring[:index] + substring[index+1:])
    else:
        patterns.append(substring)

    def check(s1, s2):
        for a, b in zip(s1, s2):
            if a != b:
                return False
        return True

    for pattern in patterns:
        for i in range(len(fullstring) - len(pattern) + 1):
            if check(fullstring[i:i+len(pattern)], pattern):
                return True
    return False

>> print(complex_search("dogandcats", "dogs*andcats"))
>> True

接近

  1. 根据子字符串中的“”创建子字符串的所有替代项(子字符串中可以有零个或多个“”)

    参见下面的函数 combs(...)

  2. 使用Aho-Corasick检查字符串中是否有子字符串模式之一。 Aho-Corasick 是一种非常有效的算法,用于检查一个或多个子字符串是否出现在字符串中,并作为原始 Unix 命令 fgrep 的基础形成。

  3. 出于说明目的,下面使用了 Aho-Corasik 的 Python 版本,但 pyahocorasick 提供了 C 实现(带有 Python 包装器)更高的性能。

    参见下面的 class Aho-Corasick。

代码

# Note: This is a modification of code explained in https://carshen.github.io/data-structures/algorithms/2014/04/07/aho-corasick-implementation-in-python.html

from collections import deque

class Aho_Corasick():
    def __init__(self, keywords):
        self.adj_list = []
        # creates a trie of keywords, then sets fail transitions
        self.create_empty_trie()
        self.add_keywords(keywords)
        self.set_fail_transitions()

    def create_empty_trie(self):
        """ initalize the root of the trie """
        self.adj_list.append({'value':'', 'next_states':[],'fail_state':0,'output':[]})

    def add_keywords(self, keywords):
        """ add all keywords in list of keywords """
        for keyword in keywords:
            self.add_keyword(keyword)

    def find_next_state(self, current_state, value):
        for node in self.adj_list[current_state]["next_states"]:
            if self.adj_list[node]["value"] == value:
                return node
        return None

    def add_keyword(self, keyword):
        """ add a keyword to the trie and mark output at the last node """
        current_state = 0
        j = 0
        keyword = keyword.lower()
        child = self.find_next_state(current_state, keyword[j])
        while child != None:
            current_state = child
            j = j + 1
            if j < len(keyword):
                child = self.find_next_state(current_state, keyword[j])
            else:
                break

        for i in range(j, len(keyword)):
            node = {'value':keyword[i],'next_states':[],'fail_state':0,'output':[]}
            self.adj_list.append(node)
            self.adj_list[current_state]["next_states"].append(len(self.adj_list) - 1)
            current_state = len(self.adj_list) - 1
        self.adj_list[current_state]["output"].append(keyword)

    def set_fail_transitions(self):
        q = deque()
        child = 0
        for node in self.adj_list[0]["next_states"]:
           q.append(node)
           self.adj_list[node]["fail_state"] = 0
        while q:
            r = q.popleft()
            for child in self.adj_list[r]["next_states"]:
                q.append(child)
                state = self.adj_list[r]["fail_state"]
                while (self.find_next_state(state, self.adj_list[child]["value"]) == None
                      and state != 0):
                    state = self.adj_list[state]["fail_state"]
                self.adj_list[child]["fail_state"] = self.find_next_state(state, self.adj_list[child]["value"])
                if self.adj_list[child]["fail_state"] is None:
                    self.adj_list[child]["fail_state"] = 0
                self.adj_list[child]["output"] = self.adj_list[child]["output"] + self.adj_list[self.adj_list[child]["fail_state"]]["output"]

    def get_keywords_found(self, line):
        """ returns keywords in trie from line """
        line = line.lower()
        current_state = 0
        keywords_found = []

        for i, c in enumerate(line):
            while self.find_next_state(current_state, c) is None and current_state != 0:
                current_state = self.adj_list[current_state]["fail_state"]
            current_state = self.find_next_state(current_state, c)
            if current_state is None:
                current_state = 0
            else:
                for j in self.adj_list[current_state]["output"]:
                    yield {"index":i-len(j) + 1,"word":j}
    
    def pattern_found(self, line):
        ''' Returns true when the pattern is found '''
        return next(self.get_keywords_found(line), None) is not None
    
        
def combs(word, n = 0, path = ""):
    ''' Generate all combinations of words with star
    
        e.g. list(combs("he*lp*")) = ['help', 'helpp', 'heelp', 'heelpp']
    '''
    if n == len(word):
        yield path
    elif word[n] == '*':
        # Next letter
        yield from combs(word, n+1, path)            # don't add * to path
    else:
        if n < len(word) - 1 and word[n+1] == '*':
            yield from combs(word, n+1, path)        # Not including letter at n
        yield from combs(word, n+1, path + word[n])  # including letter at n
    

测试

patterns = combs("dogs*andcats")                    # ['dogandcats', 'dogsandcats']
aho = Aho_Corasick(patterns)                        # Aho-Corasick structure to recognize patterns
print(aho.pattern_found("dogandcats"))              # Output: True
print(aho.pattern_found("dogsandcats"))             # Output: True