将字符串列表拆分为 "similar string" 组
Split list of string into groups of "similar string"
我有一个字符串列表(大约 300 个字符串)。
其中一些字符串以相同的字符开头。
例如:
- "ACTUATOR_alim5V0ResetErrorRatio_1",
- "ACTUATOR_alim5V0ResetErrorRatio_2",
- "ACTUATOR_alim5V0ResetErrorSensors_1",
- "SENSOR_inputPwm2DutyCycle",
- "SENSOR_inputPwm2DigitalLevel",
- "SENSOR_inputPwm2RisingEdgeCounter",
- 等...
我想对以相同字符开头的字符串进行分组。
例如:
- "ACTUATOR_alim5V0ResetErrorRatio_1",
- "ACTUATOR_alim5V0ResetErrorRatio_2",
- "ACTUATOR_alim5V0ResetErrorSensors_1"
将属于“ACTUATOR_alim5V0ResetError”组。
在这个例子中,属于该组的所有字符串都以“ACTUATOR_alim5V0ResetError”开头,因此
该组的名称是“ACTUATOR_alim5V0ResetError”。
每个组不得包含超过预定义数量的字符串 (MAX_NUMBER_STRINGS_PER_GROUP)。
我正在寻找可以解决这个问题的算法。
我必须把这个算法翻译成Python.
任何帮助都是有用的。
我认为应该这样做。只需将您的字符串添加到数组中,字典就会自行排序。 :)
将要赋值的字符串放入AssignString函数中,需要时可以在StringDescriptorGroups中找到。
StringDescriptors = [
"ACTUATOR_alim5V0ResetErrorRatio_1",
"ACTUATOR_alim5V0ResetErrorRatio_2",
"ACTUATOR_alim5V0ResetErrorSensors_1"
# . . . < You can add more but here are the first few (they dont need to be in order)
]
StringDescriptorGroups = {}
# Here we set up our dictionary to assign these strings to their group via
substring
for i in StringDescriptors:
StringDescriptorGroups.update( {i : []} )
#Create disctionary from string descriptors with empty arrays to assign these
inputs
def AssignString(s):
# Input an arbitary string s
for i in StringDescriptors:
# Go trough each one in a linear fashion
if i in s:
StringDescriptorGroups[i].append(s)
#Add to group if it contains descriptor
return True
# Work is done so return True
return False
#Couldnt find so return False
正在测试trie算法,感谢georg的指点:
trie.py
# Script Name : trie.py
# Author : The_Average_Engineer
# Created : 26th April 2021
# Description : Prefix tree
# useful link : https://en.wikipedia.org/wiki/Trie
# useful link : https://towardsdatascience.com/implementing-a-trie-data-structure-in-python-in-less-than-100-lines-of-code-a877ea23c1a1
class TrieNode(object):
"""
Trie node implementation.
"""
def __init__(self, char, parent):
self.char = char
self.parent = parent
self.children = []
# Is it the last character of the string.`
self.is_string_finished = False
# How many times this character appeared in the addition process
self.counter = 1
def add(node, string):
"""
Adding a string in the trie structure
"""
for char in string:
found_in_child = False
# Search for the character in the children of the present `node`
for child in node.children:
if child.char == char:
# We found it, increase the counter by 1 to keep track that another
# word has it as well
child.counter += 1
# And point the node to the child that contains this char
node = child
found_in_child = True
break
# We did not find it so add a new child
if not found_in_child:
new_node = TrieNode(char, node)
node.children.append(new_node)
# And then point node to the new child
node = new_node
# Everything finished. Mark it as the end of a word.
node.is_string_finished = True
def chunk_into_clusters(node, threshold, clusters_nodes):
"""
Chunk trie into clusters.
threshold is maximum number of string a cluster can contain.
clusters_nodes is the output/resulting list of nodes.
"""
for child in node.children:
if (child.counter < threshold) and (len(child.children) > 1):
clusters_nodes.append(child)
elif child.is_string_finished:
clusters_nodes.append(child)
else:
chunk_into_clusters(child, threshold, clusters_nodes) # recursive call
def find_end_nodes(node, end_nodes):
"""
Find nodes which end strings (nodes having the attribute "is_string_finished" set to True).
threshold is the maximum number of string a cluster can contain.
clusters_nodes is the output/resulting list of nodes.
"""
for child in node.children:
if child.is_string_finished:
end_nodes.append(child)
find_end_nodes(child, end_nodes) # recursive call
def retrieve_string(node):
"""
retrieve string from node.
"""
string = ""
while node is not None: # while node is not root
string = node.char + string
node = node.parent
return string
main.py
# Script Name : main.py
# Author : The_Average_Engineer
# Created : 25th April 2021
# Description : Testing string list clustering method
import string_list
import trie
if __name__ == '__main__':
r"""
script entry point
"""
# max number of strings per cluster
MAX_NB_STRINGS_PER_CLUSTER = 30
# result dict
clusters = {}
# add strings to trie
root = trie.TrieNode('', None)
for string in string_list.strings:
trie.add(root, string)
# get clusters from trie
clusters_nodes = []
trie.chunk_into_clusters(root, MAX_NB_STRINGS_PER_CLUSTER, clusters_nodes)
# get strings associated with each clusters nodes
for cluster_node in clusters_nodes:
cluster_node_string = trie.retrieve_string(cluster_node)
clusters[cluster_node_string] = []
# get strings contained in each cluster
end_nodes = []
trie.find_end_nodes(cluster_node, end_nodes)
if cluster_node.is_string_finished:
clusters[cluster_node_string].append(cluster_node_string)
for end_node in end_nodes:
end_node_string = trie.retrieve_string(end_node)
clusters[cluster_node_string].append(end_node_string)
# print results
for cluster_name, cluster_strings in clusters.items():
print("\n{}:".format(cluster_name))
for string in cluster_strings:
print("\t{}".format(string))
输出
SENSOR_inDigitalPD:
SENSOR_inDigitalPD1Lvl
SENSOR_inDigitalPD2Lvl
SENSOR_inputPwm1:
SENSOR_inputPwm1FrequencyMeasure
SENSOR_inputPwm1FrequencyMeasureValid
SENSOR_inputPwm1FallingEdgeCounter
SENSOR_inputPwm1DutyCycle
SENSOR_inputPwm1DigitalLevel
SENSOR_inputPwm1RisingEdgeCounter
SENSOR_inputPwm2:
SENSOR_inputPwm2FrequencyMeasure
SENSOR_inputPwm2FrequencyMeasureValid
SENSOR_inputPwm2FallingEdgeCounter
SENSOR_inputPwm2DutyCycle
SENSOR_inputPwm2DigitalLevel
SENSOR_inputPwm2RisingEdgeCounter
etc...
它似乎工作得很好。
如果你们中的一些人有简化此实现的想法,我非常感兴趣!
我有一个字符串列表(大约 300 个字符串)。 其中一些字符串以相同的字符开头。
例如:
- "ACTUATOR_alim5V0ResetErrorRatio_1",
- "ACTUATOR_alim5V0ResetErrorRatio_2",
- "ACTUATOR_alim5V0ResetErrorSensors_1",
- "SENSOR_inputPwm2DutyCycle",
- "SENSOR_inputPwm2DigitalLevel",
- "SENSOR_inputPwm2RisingEdgeCounter",
- 等...
我想对以相同字符开头的字符串进行分组。 例如:
- "ACTUATOR_alim5V0ResetErrorRatio_1",
- "ACTUATOR_alim5V0ResetErrorRatio_2",
- "ACTUATOR_alim5V0ResetErrorSensors_1"
将属于“ACTUATOR_alim5V0ResetError”组。
在这个例子中,属于该组的所有字符串都以“ACTUATOR_alim5V0ResetError”开头,因此 该组的名称是“ACTUATOR_alim5V0ResetError”。
每个组不得包含超过预定义数量的字符串 (MAX_NUMBER_STRINGS_PER_GROUP)。
我正在寻找可以解决这个问题的算法。 我必须把这个算法翻译成Python.
任何帮助都是有用的。
我认为应该这样做。只需将您的字符串添加到数组中,字典就会自行排序。 :)
将要赋值的字符串放入AssignString函数中,需要时可以在StringDescriptorGroups中找到。
StringDescriptors = [
"ACTUATOR_alim5V0ResetErrorRatio_1",
"ACTUATOR_alim5V0ResetErrorRatio_2",
"ACTUATOR_alim5V0ResetErrorSensors_1"
# . . . < You can add more but here are the first few (they dont need to be in order)
]
StringDescriptorGroups = {}
# Here we set up our dictionary to assign these strings to their group via
substring
for i in StringDescriptors:
StringDescriptorGroups.update( {i : []} )
#Create disctionary from string descriptors with empty arrays to assign these
inputs
def AssignString(s):
# Input an arbitary string s
for i in StringDescriptors:
# Go trough each one in a linear fashion
if i in s:
StringDescriptorGroups[i].append(s)
#Add to group if it contains descriptor
return True
# Work is done so return True
return False
#Couldnt find so return False
正在测试trie算法,感谢georg的指点:
trie.py
# Script Name : trie.py
# Author : The_Average_Engineer
# Created : 26th April 2021
# Description : Prefix tree
# useful link : https://en.wikipedia.org/wiki/Trie
# useful link : https://towardsdatascience.com/implementing-a-trie-data-structure-in-python-in-less-than-100-lines-of-code-a877ea23c1a1
class TrieNode(object):
"""
Trie node implementation.
"""
def __init__(self, char, parent):
self.char = char
self.parent = parent
self.children = []
# Is it the last character of the string.`
self.is_string_finished = False
# How many times this character appeared in the addition process
self.counter = 1
def add(node, string):
"""
Adding a string in the trie structure
"""
for char in string:
found_in_child = False
# Search for the character in the children of the present `node`
for child in node.children:
if child.char == char:
# We found it, increase the counter by 1 to keep track that another
# word has it as well
child.counter += 1
# And point the node to the child that contains this char
node = child
found_in_child = True
break
# We did not find it so add a new child
if not found_in_child:
new_node = TrieNode(char, node)
node.children.append(new_node)
# And then point node to the new child
node = new_node
# Everything finished. Mark it as the end of a word.
node.is_string_finished = True
def chunk_into_clusters(node, threshold, clusters_nodes):
"""
Chunk trie into clusters.
threshold is maximum number of string a cluster can contain.
clusters_nodes is the output/resulting list of nodes.
"""
for child in node.children:
if (child.counter < threshold) and (len(child.children) > 1):
clusters_nodes.append(child)
elif child.is_string_finished:
clusters_nodes.append(child)
else:
chunk_into_clusters(child, threshold, clusters_nodes) # recursive call
def find_end_nodes(node, end_nodes):
"""
Find nodes which end strings (nodes having the attribute "is_string_finished" set to True).
threshold is the maximum number of string a cluster can contain.
clusters_nodes is the output/resulting list of nodes.
"""
for child in node.children:
if child.is_string_finished:
end_nodes.append(child)
find_end_nodes(child, end_nodes) # recursive call
def retrieve_string(node):
"""
retrieve string from node.
"""
string = ""
while node is not None: # while node is not root
string = node.char + string
node = node.parent
return string
main.py
# Script Name : main.py
# Author : The_Average_Engineer
# Created : 25th April 2021
# Description : Testing string list clustering method
import string_list
import trie
if __name__ == '__main__':
r"""
script entry point
"""
# max number of strings per cluster
MAX_NB_STRINGS_PER_CLUSTER = 30
# result dict
clusters = {}
# add strings to trie
root = trie.TrieNode('', None)
for string in string_list.strings:
trie.add(root, string)
# get clusters from trie
clusters_nodes = []
trie.chunk_into_clusters(root, MAX_NB_STRINGS_PER_CLUSTER, clusters_nodes)
# get strings associated with each clusters nodes
for cluster_node in clusters_nodes:
cluster_node_string = trie.retrieve_string(cluster_node)
clusters[cluster_node_string] = []
# get strings contained in each cluster
end_nodes = []
trie.find_end_nodes(cluster_node, end_nodes)
if cluster_node.is_string_finished:
clusters[cluster_node_string].append(cluster_node_string)
for end_node in end_nodes:
end_node_string = trie.retrieve_string(end_node)
clusters[cluster_node_string].append(end_node_string)
# print results
for cluster_name, cluster_strings in clusters.items():
print("\n{}:".format(cluster_name))
for string in cluster_strings:
print("\t{}".format(string))
输出
SENSOR_inDigitalPD:
SENSOR_inDigitalPD1Lvl
SENSOR_inDigitalPD2Lvl
SENSOR_inputPwm1:
SENSOR_inputPwm1FrequencyMeasure
SENSOR_inputPwm1FrequencyMeasureValid
SENSOR_inputPwm1FallingEdgeCounter
SENSOR_inputPwm1DutyCycle
SENSOR_inputPwm1DigitalLevel
SENSOR_inputPwm1RisingEdgeCounter
SENSOR_inputPwm2:
SENSOR_inputPwm2FrequencyMeasure
SENSOR_inputPwm2FrequencyMeasureValid
SENSOR_inputPwm2FallingEdgeCounter
SENSOR_inputPwm2DutyCycle
SENSOR_inputPwm2DigitalLevel
SENSOR_inputPwm2RisingEdgeCounter
etc...
它似乎工作得很好。 如果你们中的一些人有简化此实现的想法,我非常感兴趣!