Python: 查找子字符串是否存在于给定条件的字符串中
Python: Find If Substring Exists in String Given Condition
我正在尝试针对接受 2 个参数的函数优化此解决方案:fullstring
和 substring
。如果子字符串存在于完整字符串中,函数将 return True
,如果不存在,则函数将 False
。子串中可以输入一个特殊的通配符,表示前一个符号的0或1,子串中可以有多个通配符。
例如"a*"
表示""
或"a"
我的解决方案工作正常,但我正在尝试减少 for 循环 (3) 的数量并优化时间复杂度。不允许使用正则表达式。有没有更pythonic的方法来做到这一点?
当前解决方案:
def complex_search(fullstring, substring):
patterns = []
if "*" in substring:
index = substring.index("*")
patterns.append(substring[:index-1] + substring[index+1:])
patterns.append(substring[:index] + substring[index+1:])
else:
patterns.append(substring)
def check(s1, s2):
for a, b in zip(s1, s2):
if a != b:
return False
return True
for pattern in patterns:
for i in range(len(fullstring) - len(pattern) + 1):
if check(fullstring[i:i+len(pattern)], pattern):
return True
return False
>> print(complex_search("dogandcats", "dogs*andcats"))
>> True
接近
根据子字符串中的“”创建子字符串的所有替代项(子字符串中可以有零个或多个“”)
参见下面的函数 combs(...)
使用Aho-Corasick检查字符串中是否有子字符串模式之一。 Aho-Corasick 是一种非常有效的算法,用于检查一个或多个子字符串是否出现在字符串中,并作为原始 Unix 命令 fgrep 的基础形成。
出于说明目的,下面使用了 Aho-Corasik 的 Python 版本,但 pyahocorasick 提供了 C 实现(带有 Python 包装器)更高的性能。
参见下面的 class Aho-Corasick。
代码
# Note: This is a modification of code explained in https://carshen.github.io/data-structures/algorithms/2014/04/07/aho-corasick-implementation-in-python.html
from collections import deque
class Aho_Corasick():
def __init__(self, keywords):
self.adj_list = []
# creates a trie of keywords, then sets fail transitions
self.create_empty_trie()
self.add_keywords(keywords)
self.set_fail_transitions()
def create_empty_trie(self):
""" initalize the root of the trie """
self.adj_list.append({'value':'', 'next_states':[],'fail_state':0,'output':[]})
def add_keywords(self, keywords):
""" add all keywords in list of keywords """
for keyword in keywords:
self.add_keyword(keyword)
def find_next_state(self, current_state, value):
for node in self.adj_list[current_state]["next_states"]:
if self.adj_list[node]["value"] == value:
return node
return None
def add_keyword(self, keyword):
""" add a keyword to the trie and mark output at the last node """
current_state = 0
j = 0
keyword = keyword.lower()
child = self.find_next_state(current_state, keyword[j])
while child != None:
current_state = child
j = j + 1
if j < len(keyword):
child = self.find_next_state(current_state, keyword[j])
else:
break
for i in range(j, len(keyword)):
node = {'value':keyword[i],'next_states':[],'fail_state':0,'output':[]}
self.adj_list.append(node)
self.adj_list[current_state]["next_states"].append(len(self.adj_list) - 1)
current_state = len(self.adj_list) - 1
self.adj_list[current_state]["output"].append(keyword)
def set_fail_transitions(self):
q = deque()
child = 0
for node in self.adj_list[0]["next_states"]:
q.append(node)
self.adj_list[node]["fail_state"] = 0
while q:
r = q.popleft()
for child in self.adj_list[r]["next_states"]:
q.append(child)
state = self.adj_list[r]["fail_state"]
while (self.find_next_state(state, self.adj_list[child]["value"]) == None
and state != 0):
state = self.adj_list[state]["fail_state"]
self.adj_list[child]["fail_state"] = self.find_next_state(state, self.adj_list[child]["value"])
if self.adj_list[child]["fail_state"] is None:
self.adj_list[child]["fail_state"] = 0
self.adj_list[child]["output"] = self.adj_list[child]["output"] + self.adj_list[self.adj_list[child]["fail_state"]]["output"]
def get_keywords_found(self, line):
""" returns keywords in trie from line """
line = line.lower()
current_state = 0
keywords_found = []
for i, c in enumerate(line):
while self.find_next_state(current_state, c) is None and current_state != 0:
current_state = self.adj_list[current_state]["fail_state"]
current_state = self.find_next_state(current_state, c)
if current_state is None:
current_state = 0
else:
for j in self.adj_list[current_state]["output"]:
yield {"index":i-len(j) + 1,"word":j}
def pattern_found(self, line):
''' Returns true when the pattern is found '''
return next(self.get_keywords_found(line), None) is not None
def combs(word, n = 0, path = ""):
''' Generate all combinations of words with star
e.g. list(combs("he*lp*")) = ['help', 'helpp', 'heelp', 'heelpp']
'''
if n == len(word):
yield path
elif word[n] == '*':
# Next letter
yield from combs(word, n+1, path) # don't add * to path
else:
if n < len(word) - 1 and word[n+1] == '*':
yield from combs(word, n+1, path) # Not including letter at n
yield from combs(word, n+1, path + word[n]) # including letter at n
测试
patterns = combs("dogs*andcats") # ['dogandcats', 'dogsandcats']
aho = Aho_Corasick(patterns) # Aho-Corasick structure to recognize patterns
print(aho.pattern_found("dogandcats")) # Output: True
print(aho.pattern_found("dogsandcats")) # Output: True
我正在尝试针对接受 2 个参数的函数优化此解决方案:fullstring
和 substring
。如果子字符串存在于完整字符串中,函数将 return True
,如果不存在,则函数将 False
。子串中可以输入一个特殊的通配符,表示前一个符号的0或1,子串中可以有多个通配符。
例如"a*"
表示""
或"a"
我的解决方案工作正常,但我正在尝试减少 for 循环 (3) 的数量并优化时间复杂度。不允许使用正则表达式。有没有更pythonic的方法来做到这一点?
当前解决方案:
def complex_search(fullstring, substring):
patterns = []
if "*" in substring:
index = substring.index("*")
patterns.append(substring[:index-1] + substring[index+1:])
patterns.append(substring[:index] + substring[index+1:])
else:
patterns.append(substring)
def check(s1, s2):
for a, b in zip(s1, s2):
if a != b:
return False
return True
for pattern in patterns:
for i in range(len(fullstring) - len(pattern) + 1):
if check(fullstring[i:i+len(pattern)], pattern):
return True
return False
>> print(complex_search("dogandcats", "dogs*andcats"))
>> True
接近
根据子字符串中的“”创建子字符串的所有替代项(子字符串中可以有零个或多个“”)
参见下面的函数 combs(...)
使用Aho-Corasick检查字符串中是否有子字符串模式之一。 Aho-Corasick 是一种非常有效的算法,用于检查一个或多个子字符串是否出现在字符串中,并作为原始 Unix 命令 fgrep 的基础形成。
出于说明目的,下面使用了 Aho-Corasik 的 Python 版本,但 pyahocorasick 提供了 C 实现(带有 Python 包装器)更高的性能。
参见下面的 class Aho-Corasick。
代码
# Note: This is a modification of code explained in https://carshen.github.io/data-structures/algorithms/2014/04/07/aho-corasick-implementation-in-python.html
from collections import deque
class Aho_Corasick():
def __init__(self, keywords):
self.adj_list = []
# creates a trie of keywords, then sets fail transitions
self.create_empty_trie()
self.add_keywords(keywords)
self.set_fail_transitions()
def create_empty_trie(self):
""" initalize the root of the trie """
self.adj_list.append({'value':'', 'next_states':[],'fail_state':0,'output':[]})
def add_keywords(self, keywords):
""" add all keywords in list of keywords """
for keyword in keywords:
self.add_keyword(keyword)
def find_next_state(self, current_state, value):
for node in self.adj_list[current_state]["next_states"]:
if self.adj_list[node]["value"] == value:
return node
return None
def add_keyword(self, keyword):
""" add a keyword to the trie and mark output at the last node """
current_state = 0
j = 0
keyword = keyword.lower()
child = self.find_next_state(current_state, keyword[j])
while child != None:
current_state = child
j = j + 1
if j < len(keyword):
child = self.find_next_state(current_state, keyword[j])
else:
break
for i in range(j, len(keyword)):
node = {'value':keyword[i],'next_states':[],'fail_state':0,'output':[]}
self.adj_list.append(node)
self.adj_list[current_state]["next_states"].append(len(self.adj_list) - 1)
current_state = len(self.adj_list) - 1
self.adj_list[current_state]["output"].append(keyword)
def set_fail_transitions(self):
q = deque()
child = 0
for node in self.adj_list[0]["next_states"]:
q.append(node)
self.adj_list[node]["fail_state"] = 0
while q:
r = q.popleft()
for child in self.adj_list[r]["next_states"]:
q.append(child)
state = self.adj_list[r]["fail_state"]
while (self.find_next_state(state, self.adj_list[child]["value"]) == None
and state != 0):
state = self.adj_list[state]["fail_state"]
self.adj_list[child]["fail_state"] = self.find_next_state(state, self.adj_list[child]["value"])
if self.adj_list[child]["fail_state"] is None:
self.adj_list[child]["fail_state"] = 0
self.adj_list[child]["output"] = self.adj_list[child]["output"] + self.adj_list[self.adj_list[child]["fail_state"]]["output"]
def get_keywords_found(self, line):
""" returns keywords in trie from line """
line = line.lower()
current_state = 0
keywords_found = []
for i, c in enumerate(line):
while self.find_next_state(current_state, c) is None and current_state != 0:
current_state = self.adj_list[current_state]["fail_state"]
current_state = self.find_next_state(current_state, c)
if current_state is None:
current_state = 0
else:
for j in self.adj_list[current_state]["output"]:
yield {"index":i-len(j) + 1,"word":j}
def pattern_found(self, line):
''' Returns true when the pattern is found '''
return next(self.get_keywords_found(line), None) is not None
def combs(word, n = 0, path = ""):
''' Generate all combinations of words with star
e.g. list(combs("he*lp*")) = ['help', 'helpp', 'heelp', 'heelpp']
'''
if n == len(word):
yield path
elif word[n] == '*':
# Next letter
yield from combs(word, n+1, path) # don't add * to path
else:
if n < len(word) - 1 and word[n+1] == '*':
yield from combs(word, n+1, path) # Not including letter at n
yield from combs(word, n+1, path + word[n]) # including letter at n
测试
patterns = combs("dogs*andcats") # ['dogandcats', 'dogsandcats']
aho = Aho_Corasick(patterns) # Aho-Corasick structure to recognize patterns
print(aho.pattern_found("dogandcats")) # Output: True
print(aho.pattern_found("dogsandcats")) # Output: True