在字符串列表中查找差异
Find differences in list of strings
输入列表:
['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']
字符串是用点连接的变量。所以在第一个字符串中,变量是 A B C D,但在最后一个字符串中变量是 A B E GG(变量可以有不同的长度),但是字符串总是有相同数量的变量,用点分隔。我想将那些只有一个不同变量的字符串组合在一起。所以上面会产生2组。
['A.B.C.D','A.A.C.D']
['A.B.E.F','A.B.E.GG']
差异必须在同一个变量中。所有变量之间没有一个差异。每个字符串应该只包含一次,而不是跨组多次。
真实数据最多可以有 20 个变量(但每个列表中的所有项目总是有相同数量的变量),列表可以有多个 minion 字符串,因此性能也是一个问题。
我写了一个算法可以做到,但是太复杂了。我也尝试通过 itertools groupby 但无法生成正确的结果:
import itertools
import difflib
class Grouper:
def __init__(self, diff):
self.last = None
self.diff = diff
def get_diffs(self, curr_key):
if self.last == None:
return []
dims = curr_key.split('.')
previous_dims = self.last.split('.')
diffs = list((dm for dm in difflib.ndiff(dims, previous_dims) if '-' not in dm and '?' not in dm))
return [n for n, d in enumerate(diffs) if '+' in d]
def __call__(self, item):
result = self.get_diffs(item)
print(item)
self.last = item
return result
grp = Grouper(data)
groups = []
uniquekeys = []
for k, g in itertools.groupby(data, grp):
groups.append(list(g)) # Store group iterator as a list
uniquekeys.append(k)
更新:
更多示例输入:
['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']
我又添加了一个字符串“A.B.C.F”,这个字符串可以匹配“A.B.C.D”和“A.B.E.F”:
import itertools
def main(l: list) -> list:
splits = [tuple(s.split(".")) for s in l]
groups = {}
# Dict of {(tuple, index of difference): list of tuples that match the key}
for split in splits:
matched = False
for (t, i) in groups.keys():
# We match two splits if they only have one difference
diffs = [i for i in range(len(split)) if split[i] != t[i]]
if len(diffs) == 1:
# The strings match but is the first match of 't'?
if i is None:
groups.pop((t, i))
groups[(t, diffs[0])] = [split]
# We found a match, stop the matching of this tuple
matched = True
break
elif diffs[0] == i:
groups[(t, i)].append(split)
matched = True
break
if not matched:
# Let's add this split as a candidate for future splits
groups[(split, None)] = []
return [[".".join(k)] + [".".join(s) for s in v] for (k, i), v in groups.items()]
print(main(["A.B.C.D", "A.A.C.D", "A.B.E.F", "A.B.E.GG", "A.B.C.F"]))
# [['A.B.C.D', 'A.A.C.D'], ['A.B.E.F', 'A.B.E.GG'], ['A.B.C.F']]
print(main(["A.B.C", "A.B.D", "A.E.D"]))
# [['A.B.C', 'A.B.D'], ['A.E.D']]
使用贪心算法求解set cover problem:
string_list = ['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG', 'A.B.C.F']
list_list = [s.split('.') for s in string_list]
# [['A', 'B', 'C', 'D'], ['A', 'A', 'C', 'D'], ...]
universe = set(range(len(string_list)))
# {0, 1, 2, 3, 4}
# represents string_list by index
# for instance, 3 represents 'A.B.E.GG'
subsets = {frozenset(j for j,l in enumerate(list_list) if l[:i]==l0[:i] and l[i+1:]==l0[i+1:]) for l0 in list_list for i in range(len(l0))}
# {{2}, {2, 3}, {0, 1}, {0, 4}, {3}, {2, 4}, {1}, {4}, {0}}
# represents possible groupings by index
# for instance, {2, 3} represents {'A.B.E.F', 'A.B.E.GG'}
solution = []
while universe:
max_subset = max(subsets, key=len)
solution.append(max_subset)
universe -= max_subset
subsets.remove(max_subset)
subsets = {subset & universe for subset in subsets}
print(solution)
# {{2, 3}, {0, 1}, {4}}
clusters = [[string_list[i] for i in subset] for subset in solution]
print(clusters)
# [['A.B.E.F', 'A.B.E.GG'], ['A.B.C.D', 'A.A.C.D'], ['A.B.C.F']]
您可以使用生成器函数:
from collections import defaultdict
def m(a, b):
return max(map(len, [a - b, b - a])) < 2
def options(l, ind, a):
for i in a:
yield '.'.join(l[:ind]+[i]+l[ind+1:])
def groups(d, ch):
d1, v, last_ind, ind_set = defaultdict(list), [], None, set()
for i in d:
d1[i].append(i.split('.'))
d1 = dict(d1)
while d1:
if not v:
v.extend([(d[0], i) for i in d1.pop(d[0])])
elif not ind_set:
k1 = iter(d1)
while (n:=next(k1)) is not None and not m(set(v[0][-1]), set(n.split('.'))):
pass
if n is not None:
v.extend([(n, i) for i in d1.pop(n)])
(last_ind, ind_set), *_ = [(i, j) for i, a in enumerate(zip(*[j for _, j in v])) if len((j:=set(a))) != 1]
else:
yield [x for x, _ in v]
n = next(iter(d1))
v = [(n, i) for i in d1.pop(n)]
ind_set, last_ind = set(), None
else:
if not (o:=[i for i in options(v[0][-1], last_ind, ch[last_ind] - ind_set) if i in d1]):
yield [x for x, _ in v]
n = next(iter(d1))
v = [(n, i) for i in d1.pop(n)]
ind_set, last_ind = set(), None
else:
v.extend([(y, i) for y in o for i in d1.pop(y)])
yield from ([] if not v else [[x for x, _ in v]])
def get_groups(d):
ch = list(map(set, zip(*[i.split('.') for i in d])))
return list(groups(d, ch))
print(get_groups(['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']))
print(get_groups(["A.B.C", "A.B.D", "A.E.D"]))
print(get_groups(['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']))
输出:
[['A.A.C.D', 'A.B.C.D'], ['A.B.E.F', 'A.B.E.GG']]
[['A.B.C', 'A.B.D'], ['A.E.D']]
[['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C'], ['D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.8.6.A.1.B.C']]
时间安排:
import string, time
import contextlib, itertools as it
r_s = map('.'.join, it.product(*[string.ascii_uppercase[:10] if not i%2 else string.digits for i in range(6)]))
@contextlib.contextmanager
def timeit(f):
t = time.time()
yield
print(f'{f} time: {time.time() - t}')
with timeit('groups'):
_ = [*get_groups([*r_s])]
输出:
groups time: 43.55898189544678
输入列表:
['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']
字符串是用点连接的变量。所以在第一个字符串中,变量是 A B C D,但在最后一个字符串中变量是 A B E GG(变量可以有不同的长度),但是字符串总是有相同数量的变量,用点分隔。我想将那些只有一个不同变量的字符串组合在一起。所以上面会产生2组。
['A.B.C.D','A.A.C.D'] ['A.B.E.F','A.B.E.GG']
差异必须在同一个变量中。所有变量之间没有一个差异。每个字符串应该只包含一次,而不是跨组多次。
真实数据最多可以有 20 个变量(但每个列表中的所有项目总是有相同数量的变量),列表可以有多个 minion 字符串,因此性能也是一个问题。
我写了一个算法可以做到,但是太复杂了。我也尝试通过 itertools groupby 但无法生成正确的结果:
import itertools
import difflib
class Grouper:
def __init__(self, diff):
self.last = None
self.diff = diff
def get_diffs(self, curr_key):
if self.last == None:
return []
dims = curr_key.split('.')
previous_dims = self.last.split('.')
diffs = list((dm for dm in difflib.ndiff(dims, previous_dims) if '-' not in dm and '?' not in dm))
return [n for n, d in enumerate(diffs) if '+' in d]
def __call__(self, item):
result = self.get_diffs(item)
print(item)
self.last = item
return result
grp = Grouper(data)
groups = []
uniquekeys = []
for k, g in itertools.groupby(data, grp):
groups.append(list(g)) # Store group iterator as a list
uniquekeys.append(k)
更新:
更多示例输入:
['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']
我又添加了一个字符串“A.B.C.F”,这个字符串可以匹配“A.B.C.D”和“A.B.E.F”:
import itertools
def main(l: list) -> list:
splits = [tuple(s.split(".")) for s in l]
groups = {}
# Dict of {(tuple, index of difference): list of tuples that match the key}
for split in splits:
matched = False
for (t, i) in groups.keys():
# We match two splits if they only have one difference
diffs = [i for i in range(len(split)) if split[i] != t[i]]
if len(diffs) == 1:
# The strings match but is the first match of 't'?
if i is None:
groups.pop((t, i))
groups[(t, diffs[0])] = [split]
# We found a match, stop the matching of this tuple
matched = True
break
elif diffs[0] == i:
groups[(t, i)].append(split)
matched = True
break
if not matched:
# Let's add this split as a candidate for future splits
groups[(split, None)] = []
return [[".".join(k)] + [".".join(s) for s in v] for (k, i), v in groups.items()]
print(main(["A.B.C.D", "A.A.C.D", "A.B.E.F", "A.B.E.GG", "A.B.C.F"]))
# [['A.B.C.D', 'A.A.C.D'], ['A.B.E.F', 'A.B.E.GG'], ['A.B.C.F']]
print(main(["A.B.C", "A.B.D", "A.E.D"]))
# [['A.B.C', 'A.B.D'], ['A.E.D']]
使用贪心算法求解set cover problem:
string_list = ['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG', 'A.B.C.F']
list_list = [s.split('.') for s in string_list]
# [['A', 'B', 'C', 'D'], ['A', 'A', 'C', 'D'], ...]
universe = set(range(len(string_list)))
# {0, 1, 2, 3, 4}
# represents string_list by index
# for instance, 3 represents 'A.B.E.GG'
subsets = {frozenset(j for j,l in enumerate(list_list) if l[:i]==l0[:i] and l[i+1:]==l0[i+1:]) for l0 in list_list for i in range(len(l0))}
# {{2}, {2, 3}, {0, 1}, {0, 4}, {3}, {2, 4}, {1}, {4}, {0}}
# represents possible groupings by index
# for instance, {2, 3} represents {'A.B.E.F', 'A.B.E.GG'}
solution = []
while universe:
max_subset = max(subsets, key=len)
solution.append(max_subset)
universe -= max_subset
subsets.remove(max_subset)
subsets = {subset & universe for subset in subsets}
print(solution)
# {{2, 3}, {0, 1}, {4}}
clusters = [[string_list[i] for i in subset] for subset in solution]
print(clusters)
# [['A.B.E.F', 'A.B.E.GG'], ['A.B.C.D', 'A.A.C.D'], ['A.B.C.F']]
您可以使用生成器函数:
from collections import defaultdict
def m(a, b):
return max(map(len, [a - b, b - a])) < 2
def options(l, ind, a):
for i in a:
yield '.'.join(l[:ind]+[i]+l[ind+1:])
def groups(d, ch):
d1, v, last_ind, ind_set = defaultdict(list), [], None, set()
for i in d:
d1[i].append(i.split('.'))
d1 = dict(d1)
while d1:
if not v:
v.extend([(d[0], i) for i in d1.pop(d[0])])
elif not ind_set:
k1 = iter(d1)
while (n:=next(k1)) is not None and not m(set(v[0][-1]), set(n.split('.'))):
pass
if n is not None:
v.extend([(n, i) for i in d1.pop(n)])
(last_ind, ind_set), *_ = [(i, j) for i, a in enumerate(zip(*[j for _, j in v])) if len((j:=set(a))) != 1]
else:
yield [x for x, _ in v]
n = next(iter(d1))
v = [(n, i) for i in d1.pop(n)]
ind_set, last_ind = set(), None
else:
if not (o:=[i for i in options(v[0][-1], last_ind, ch[last_ind] - ind_set) if i in d1]):
yield [x for x, _ in v]
n = next(iter(d1))
v = [(n, i) for i in d1.pop(n)]
ind_set, last_ind = set(), None
else:
v.extend([(y, i) for y in o for i in d1.pop(y)])
yield from ([] if not v else [[x for x, _ in v]])
def get_groups(d):
ch = list(map(set, zip(*[i.split('.') for i in d])))
return list(groups(d, ch))
print(get_groups(['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']))
print(get_groups(["A.B.C", "A.B.D", "A.E.D"]))
print(get_groups(['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']))
输出:
[['A.A.C.D', 'A.B.C.D'], ['A.B.E.F', 'A.B.E.GG']]
[['A.B.C', 'A.B.D'], ['A.E.D']]
[['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C'], ['D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.8.6.A.1.B.C']]
时间安排:
import string, time
import contextlib, itertools as it
r_s = map('.'.join, it.product(*[string.ascii_uppercase[:10] if not i%2 else string.digits for i in range(6)]))
@contextlib.contextmanager
def timeit(f):
t = time.time()
yield
print(f'{f} time: {time.time() - t}')
with timeit('groups'):
_ = [*get_groups([*r_s])]
输出:
groups time: 43.55898189544678