在字符串列表中查找差异

Question

输入列表：

['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']

字符串是用点连接的变量。所以在第一个字符串中，变量是 A B C D，但在最后一个字符串中变量是 A B E GG（变量可以有不同的长度），但是字符串总是有相同数量的变量，用点分隔。我想将那些只有一个不同变量的字符串组合在一起。所以上面会产生2组。

['A.B.C.D','A.A.C.D'] ['A.B.E.F','A.B.E.GG']

差异必须在同一个变量中。所有变量之间没有一个差异。每个字符串应该只包含一次，而不是跨组多次。

真实数据最多可以有 20 个变量（但每个列表中的所有项目总是有相同数量的变量），列表可以有多个 minion 字符串，因此性能也是一个问题。

我写了一个算法可以做到，但是太复杂了。我也尝试通过 itertools groupby 但无法生成正确的结果：

import itertools
import difflib

class Grouper:
    def __init__(self, diff):
        self.last = None
        self.diff = diff
    def get_diffs(self, curr_key):
        if self.last == None:
            return []
        dims = curr_key.split('.')
        previous_dims = self.last.split('.')
        diffs = list((dm for dm in difflib.ndiff(dims, previous_dims) if '-' not in dm and '?' not in dm))
        return [n for n, d in enumerate(diffs) if '+' in d]
    
    def __call__(self, item):
        result = self.get_diffs(item)
        print(item)
        self.last = item
        return result


grp = Grouper(data)
groups = []
uniquekeys = []

for k, g in itertools.groupby(data, grp):
    groups.append(list(g))      # Store group iterator as a list
    uniquekeys.append(k)

更新：

更多示例输入：

['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']

Answer 1

我又添加了一个字符串“A.B.C.F”，这个字符串可以匹配“A.B.C.D”和“A.B.E.F”：

import itertools


def main(l: list) -> list:
    splits = [tuple(s.split(".")) for s in l]

    groups = {}
    # Dict of {(tuple, index of difference): list of tuples that match the key}

    for split in splits:
        matched = False
        for (t, i) in groups.keys():
            # We match two splits if they only have one difference
            diffs = [i for i in range(len(split)) if split[i] != t[i]]
            if len(diffs) == 1:
                # The strings match but is the first match of 't'?
                if i is None:
                    groups.pop((t, i))
                    groups[(t, diffs[0])] = [split]
                    # We found a match, stop the matching of this tuple
                    matched = True
                    break
                elif diffs[0] == i:
                    groups[(t, i)].append(split)
                    matched = True
                    break
        if not matched:
            # Let's add this split as a candidate for future splits
            groups[(split, None)] = []

    return [[".".join(k)] + [".".join(s) for s in v] for (k, i), v in groups.items()]


print(main(["A.B.C.D", "A.A.C.D", "A.B.E.F", "A.B.E.GG", "A.B.C.F"]))
# [['A.B.C.D', 'A.A.C.D'], ['A.B.E.F', 'A.B.E.GG'], ['A.B.C.F']]
print(main(["A.B.C", "A.B.D", "A.E.D"]))
# [['A.B.C', 'A.B.D'], ['A.E.D']]

Answer 2

使用贪心算法求解set cover problem：

string_list = ['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG', 'A.B.C.F']
list_list = [s.split('.') for s in string_list]
# [['A', 'B', 'C', 'D'], ['A', 'A', 'C', 'D'], ...]

universe = set(range(len(string_list)))
# {0, 1, 2, 3, 4}
# represents string_list by index
# for instance, 3 represents 'A.B.E.GG'

subsets = {frozenset(j for j,l in enumerate(list_list) if l[:i]==l0[:i] and l[i+1:]==l0[i+1:]) for l0 in list_list for i in range(len(l0))}
# {{2}, {2, 3}, {0, 1}, {0, 4}, {3}, {2, 4}, {1}, {4}, {0}}
# represents possible groupings by index
# for instance, {2, 3} represents {'A.B.E.F', 'A.B.E.GG'}

solution = []
while universe:
  max_subset = max(subsets, key=len)
  solution.append(max_subset)
  universe -= max_subset
  subsets.remove(max_subset)
  subsets = {subset & universe for subset in subsets}

print(solution)
# {{2, 3}, {0, 1}, {4}}

clusters = [[string_list[i] for i in subset] for subset in solution]
print(clusters)
# [['A.B.E.F', 'A.B.E.GG'], ['A.B.C.D', 'A.A.C.D'], ['A.B.C.F']]

Answer 3

您可以使用生成器函数：

from collections import defaultdict
def m(a, b):
   return max(map(len, [a - b, b - a])) < 2

def options(l, ind, a):
   for i in a:
      yield '.'.join(l[:ind]+[i]+l[ind+1:])

def groups(d, ch):
   d1, v, last_ind, ind_set = defaultdict(list), [], None, set()
   for i in d:
      d1[i].append(i.split('.'))
   d1 = dict(d1)
   while d1:
      if not v:
         v.extend([(d[0], i) for i in d1.pop(d[0])])
      elif not ind_set:
         k1 = iter(d1)
         while (n:=next(k1)) is not None and not m(set(v[0][-1]), set(n.split('.'))):
            pass
         if n is not None:
             v.extend([(n, i) for i in d1.pop(n)])
             (last_ind, ind_set), *_ = [(i, j) for i, a in enumerate(zip(*[j for _, j in v])) if len((j:=set(a))) != 1]
         else:
             yield [x for x, _ in v]
             n = next(iter(d1))
             v = [(n, i) for i in d1.pop(n)]
             ind_set, last_ind = set(), None
      else:
         if not (o:=[i for i in options(v[0][-1], last_ind, ch[last_ind] - ind_set) if i in d1]):
             yield [x for x, _ in v]
             n = next(iter(d1))
             v = [(n, i) for i in d1.pop(n)]
             ind_set, last_ind = set(), None
         else:
             v.extend([(y, i) for y in o for i in d1.pop(y)])
   yield from ([] if not v else [[x for x, _ in v]])

def get_groups(d):
   ch = list(map(set, zip(*[i.split('.') for i in d])))
   return list(groups(d, ch))

print(get_groups(['A.B.C.D','A.A.C.D','A.B.E.F','A.B.E.GG']))
print(get_groups(["A.B.C", "A.B.D", "A.E.D"]))
print(get_groups(['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.7.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.8.6.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.15.6.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.25.6.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C']))

输出：

[['A.A.C.D', 'A.B.C.D'], ['A.B.E.F', 'A.B.E.GG']]
[['A.B.C', 'A.B.D'], ['A.E.D']] 
[['D.1.2.A.1.B.C', 'D.7.2.A.1.B.C', 'D.21.2.A.1.B.C', 'D.8.2.A.1.B.C', 'D.10.2.A.1.B.C', 'D.14.2.A.1.B.C', 'D.15.2.A.1.B.C', 'D.16.2.A.1.B.C', 'D.17.2.A.1.B.C', 'D.18.2.A.1.B.C', 'D.19.2.A.1.B.C', 'D.20.2.A.1.B.C', 'D.22.2.A.1.B.C', 'D.23.2.A.1.B.C', 'D.25.2.A.1.B.C', 'D.26.2.A.1.B.C', 'D.27.2.A.1.B.C'], ['D.15.6.A.1.B.C', 'D.25.6.A.1.B.C', 'D.8.6.A.1.B.C']]

时间安排：

import string, time
import contextlib, itertools as it
r_s = map('.'.join, it.product(*[string.ascii_uppercase[:10] if not i%2 else string.digits for i in range(6)]))
@contextlib.contextmanager
def timeit(f):
    t = time.time()
    yield 
    print(f'{f} time: {time.time() - t}')

with timeit('groups'):
   _ = [*get_groups([*r_s])]

输出：

groups time: 43.55898189544678

在字符串列表中查找差异

Find differences in list of strings

python

sorting

algorithm

group-by

itertools