C# 字符串列表中的所有常见序列

C# all common sequences in lists of strings

我试图在提供的数组中找到最长的公共字符串序列。

我有 25,000 个带序列的列表,总共有 450,000 个单词,我需要按长度排序,然后按计数排序。

List<string> listA = new List<string>() {"Step1", "Step3", "Process", "System", "Process"};
List<string> listB = new List<string>() {"Process", "System", "Process"};
List<string> listC = new List<string>() {"Terminal", "Step1", "Step3"};
...

打印所有可能序列及其长度和计数的所需输出是:

Sequence                               Length Count
Step1->Step3->Process->System->Process 5      1
Step1->Step3->Process->System          4      1
Step3->Process->System->Process        4      1
Process->System->Process               3      2
Step1->Step3->Process                  3      1
Step3->Process->System                 3      1
Terminal->Step1->Step3                 3      1
Step1->Step3                           2      2
Process->System                        2      2
System->Process                        2      2
Step3->Process                         2      1
Terminal->Step1                        2      1
Process                                1      4
Step1                                  1      2
Step3                                  1      2
System                                 1      2
Terminal                               1      1

我只能找到子字符串的实现,而不是可以将多个列表作为输入的整个单词。

好的,所以您实际上可以重载 GetHashCodeEquals 来处理字符串,就像字符串中的字符一样。此外,创建列表段以防止运行时间被多个集合淹没也可能是合理的。

public class ListSegment<T>
{
    private sealed class ListSegmentEqualityComparer : IEqualityComparer<ListSegment<T>>
    {
        public bool Equals(ListSegment<T> x, ListSegment<T> y)
        {
            if (x.Length != y.Length)
            {
                return false;
            }
            return x.Lst.Skip(x.Offset).Take(x.Length)
                .SequenceEqual(y.Lst.Skip(y.Offset).Take(y.Length));
        }

        public int GetHashCode(ListSegment<T> obj)
        {
            unchecked
            {
                int hash = 17;
                for (int i = obj.Offset; i < obj.Offset + obj.Length; i++)
                {
                    hash = hash * 31 + obj.Lst[i].GetHashCode();
                }
                return hash;
            }
        }
    }

    public static IEqualityComparer<ListSegment<T>> Default { get; } = new ListSegmentEqualityComparer();

    public List<T> Lst { get; set; }
    public int Offset { get; set; }
    public int Length { get; set; }

    public IEnumerable<T> GetEnumerable()
    {
        return Lst.Skip(Offset).Take(Length);
    }

    public override string ToString()
    {
        return string.Join("->", GetEnumerable());
    }
}

然后你运行通过列表列表计算出现次数

    public List<KeyValuePair<ListSegment<string>, int>> GetOrderedPairs(List<List<string>> data)
    {
        var segmentsDictionary = new Dictionary<ListSegment<string>, int>(ListSegment<string>.Default);
        foreach (var list in data)
        {
            for (int i = 0; i < list.Count; i++)
            for (int j = i + 1; j <= list.Count; j++)
            {
                var segment = new ListSegment<string>
                {
                    Lst = list,
                    Length = j-i,
                    Offset = i,
                };
                if (segmentsDictionary.TryGetValue(segment, out var val))
                {
                    segmentsDictionary[segment] = val + 1;
                }
                else
                {
                    segmentsDictionary[segment] = 1;
                }
            }
        }

        return segmentsDictionary.OrderByDescending(pair => pair.Key.Length).ToList();
    }

待测试运行关注

        List<string> listA = new List<string>() { "Step1", "Step3", "Process", "System", "Process" };
        List<string> listB = new List<string>() { "Process", "System", "Process" };
        List<string> listC = new List<string>() { "Terminal", "Step1", "Step3" };
        var pairs = GetOrderedPairs(new List<List<string>>()
        {
            listA, listB, listC
        });
        foreach (var keyValuePair in pairs)
        {
            Console.WriteLine(keyValuePair.Key + " " + keyValuePair.Key.Length + " " + keyValuePair.Value);
        }

使用一些扩展方法,您可以创建一个 IEQualityComparer 来比较 IEnumerable 序列。使用它,您可以使用 LINQ Distinct 按序列进行比较:

public static class IEnumerableExt {
    public static IEnumerable<IEnumerable<T>> DistinctIE<T>(this IEnumerable<IEnumerable<T>> src) => src.Distinct(Make.IESequenceEqualityComparer<T>());

    // IEnumerable<string>
    public static string Join(this IEnumerable<string> src, string sep) => String.Join(sep, src);
}

public static class Make {
    public static IEqualityComparer<IEnumerable<T>> IESequenceEqualityComparer<T>() => new IEnumerableSequenceEqualityComparer<T>();
    public static IEqualityComparer<IEnumerable<T>> IESequenceEqualityComparer<T>(T _) => new IEnumerableSequenceEqualityComparer<T>();

    public class IEnumerableSequenceEqualityComparer<T> : IEqualityComparer<IEnumerable<T>> {
        public bool Equals(IEnumerable<T> x, IEnumerable<T> y) =>
            Object.ReferenceEquals(x, y) || (x != null && y != null && (x.SequenceEqual(y)));

        public int GetHashCode(IEnumerable<T> src) {
            var hc = new HashCode();
            foreach (var v in src)
                hc.Add(v);
            return hc.ToHashCode();
        }
    }
}

使用这些工具,您可以创建扩展方法来生成 List 的所有子序列和所有不同的子序列:

public static class ListExt {
    public static IEnumerable<IEnumerable<T>> Subsequences<T>(this List<T> src) {
        IEnumerable<T> Helper(int start, int end) {
            for (int j3 = start; j3 <= end; ++j3)
                yield return src[j3];
        }

        for (int j1 = 0; j1 < src.Count; ++j1) {
            for (int j2 = j1; j2 < src.Count; ++j2)
                yield return Helper(j1, j2);
        }
    }

    public static IEnumerable<IEnumerable<T>> DistinctSubsequences<T>(this List<T> src) => src.Subsequences().DistinctIE();
}

现在您可以计算答案了。

首先,计算所有子序列并将它们组合起来:

var ssA = listA.DistinctSubsequences();
var ssB = listB.DistinctSubsequences();
var ssC = listC.DistinctSubsequences();
var ssAll = ssA.Concat(ssB).Concat(ssC).DistinctIE();

然后,创建一些用于计算出现次数的助手:

var hA = ssA.ToHashSet(Make.IESequenceEqualityComparer<string>());
var hB = ssB.ToHashSet(Make.IESequenceEqualityComparer<string>());
var hC = ssC.ToHashSet(Make.IESequenceEqualityComparer<string>());

Func<IEnumerable<string>, HashSet<IEnumerable<string>>, int> testIn = (s, h) => h.Contains(s) ? 1 : 0;
Func<IEnumerable<string>,int> countIn = s => testIn(s,hA)+testIn(s,hB)+testIn(s,hC);

最后,计算答案:

var ans = ssAll.Select(ss => new { Sequence = ss.Join("->"), Length = ss.Count(), Count = countIn(ss) }).OrderByDescending(sc => sc.Sequence.Length);