C# 字符串列表中的所有常见序列
C# all common sequences in lists of strings
我试图在提供的数组中找到最长的公共字符串序列。
我有 25,000 个带序列的列表,总共有 450,000 个单词,我需要按长度排序,然后按计数排序。
List<string> listA = new List<string>() {"Step1", "Step3", "Process", "System", "Process"};
List<string> listB = new List<string>() {"Process", "System", "Process"};
List<string> listC = new List<string>() {"Terminal", "Step1", "Step3"};
...
打印所有可能序列及其长度和计数的所需输出是:
Sequence Length Count
Step1->Step3->Process->System->Process 5 1
Step1->Step3->Process->System 4 1
Step3->Process->System->Process 4 1
Process->System->Process 3 2
Step1->Step3->Process 3 1
Step3->Process->System 3 1
Terminal->Step1->Step3 3 1
Step1->Step3 2 2
Process->System 2 2
System->Process 2 2
Step3->Process 2 1
Terminal->Step1 2 1
Process 1 4
Step1 1 2
Step3 1 2
System 1 2
Terminal 1 1
我只能找到子字符串的实现,而不是可以将多个列表作为输入的整个单词。
好的,所以您实际上可以重载 GetHashCode
和 Equals
来处理字符串,就像字符串中的字符一样。此外,创建列表段以防止运行时间被多个集合淹没也可能是合理的。
public class ListSegment<T>
{
private sealed class ListSegmentEqualityComparer : IEqualityComparer<ListSegment<T>>
{
public bool Equals(ListSegment<T> x, ListSegment<T> y)
{
if (x.Length != y.Length)
{
return false;
}
return x.Lst.Skip(x.Offset).Take(x.Length)
.SequenceEqual(y.Lst.Skip(y.Offset).Take(y.Length));
}
public int GetHashCode(ListSegment<T> obj)
{
unchecked
{
int hash = 17;
for (int i = obj.Offset; i < obj.Offset + obj.Length; i++)
{
hash = hash * 31 + obj.Lst[i].GetHashCode();
}
return hash;
}
}
}
public static IEqualityComparer<ListSegment<T>> Default { get; } = new ListSegmentEqualityComparer();
public List<T> Lst { get; set; }
public int Offset { get; set; }
public int Length { get; set; }
public IEnumerable<T> GetEnumerable()
{
return Lst.Skip(Offset).Take(Length);
}
public override string ToString()
{
return string.Join("->", GetEnumerable());
}
}
然后你运行通过列表列表计算出现次数
public List<KeyValuePair<ListSegment<string>, int>> GetOrderedPairs(List<List<string>> data)
{
var segmentsDictionary = new Dictionary<ListSegment<string>, int>(ListSegment<string>.Default);
foreach (var list in data)
{
for (int i = 0; i < list.Count; i++)
for (int j = i + 1; j <= list.Count; j++)
{
var segment = new ListSegment<string>
{
Lst = list,
Length = j-i,
Offset = i,
};
if (segmentsDictionary.TryGetValue(segment, out var val))
{
segmentsDictionary[segment] = val + 1;
}
else
{
segmentsDictionary[segment] = 1;
}
}
}
return segmentsDictionary.OrderByDescending(pair => pair.Key.Length).ToList();
}
待测试运行关注
List<string> listA = new List<string>() { "Step1", "Step3", "Process", "System", "Process" };
List<string> listB = new List<string>() { "Process", "System", "Process" };
List<string> listC = new List<string>() { "Terminal", "Step1", "Step3" };
var pairs = GetOrderedPairs(new List<List<string>>()
{
listA, listB, listC
});
foreach (var keyValuePair in pairs)
{
Console.WriteLine(keyValuePair.Key + " " + keyValuePair.Key.Length + " " + keyValuePair.Value);
}
使用一些扩展方法,您可以创建一个 IEQualityComparer
来比较 IEnumerable
序列。使用它,您可以使用 LINQ Distinct
按序列进行比较:
public static class IEnumerableExt {
public static IEnumerable<IEnumerable<T>> DistinctIE<T>(this IEnumerable<IEnumerable<T>> src) => src.Distinct(Make.IESequenceEqualityComparer<T>());
// IEnumerable<string>
public static string Join(this IEnumerable<string> src, string sep) => String.Join(sep, src);
}
public static class Make {
public static IEqualityComparer<IEnumerable<T>> IESequenceEqualityComparer<T>() => new IEnumerableSequenceEqualityComparer<T>();
public static IEqualityComparer<IEnumerable<T>> IESequenceEqualityComparer<T>(T _) => new IEnumerableSequenceEqualityComparer<T>();
public class IEnumerableSequenceEqualityComparer<T> : IEqualityComparer<IEnumerable<T>> {
public bool Equals(IEnumerable<T> x, IEnumerable<T> y) =>
Object.ReferenceEquals(x, y) || (x != null && y != null && (x.SequenceEqual(y)));
public int GetHashCode(IEnumerable<T> src) {
var hc = new HashCode();
foreach (var v in src)
hc.Add(v);
return hc.ToHashCode();
}
}
}
使用这些工具,您可以创建扩展方法来生成 List
的所有子序列和所有不同的子序列:
public static class ListExt {
public static IEnumerable<IEnumerable<T>> Subsequences<T>(this List<T> src) {
IEnumerable<T> Helper(int start, int end) {
for (int j3 = start; j3 <= end; ++j3)
yield return src[j3];
}
for (int j1 = 0; j1 < src.Count; ++j1) {
for (int j2 = j1; j2 < src.Count; ++j2)
yield return Helper(j1, j2);
}
}
public static IEnumerable<IEnumerable<T>> DistinctSubsequences<T>(this List<T> src) => src.Subsequences().DistinctIE();
}
现在您可以计算答案了。
首先,计算所有子序列并将它们组合起来:
var ssA = listA.DistinctSubsequences();
var ssB = listB.DistinctSubsequences();
var ssC = listC.DistinctSubsequences();
var ssAll = ssA.Concat(ssB).Concat(ssC).DistinctIE();
然后,创建一些用于计算出现次数的助手:
var hA = ssA.ToHashSet(Make.IESequenceEqualityComparer<string>());
var hB = ssB.ToHashSet(Make.IESequenceEqualityComparer<string>());
var hC = ssC.ToHashSet(Make.IESequenceEqualityComparer<string>());
Func<IEnumerable<string>, HashSet<IEnumerable<string>>, int> testIn = (s, h) => h.Contains(s) ? 1 : 0;
Func<IEnumerable<string>,int> countIn = s => testIn(s,hA)+testIn(s,hB)+testIn(s,hC);
最后,计算答案:
var ans = ssAll.Select(ss => new { Sequence = ss.Join("->"), Length = ss.Count(), Count = countIn(ss) }).OrderByDescending(sc => sc.Sequence.Length);
我试图在提供的数组中找到最长的公共字符串序列。
我有 25,000 个带序列的列表,总共有 450,000 个单词,我需要按长度排序,然后按计数排序。
List<string> listA = new List<string>() {"Step1", "Step3", "Process", "System", "Process"};
List<string> listB = new List<string>() {"Process", "System", "Process"};
List<string> listC = new List<string>() {"Terminal", "Step1", "Step3"};
...
打印所有可能序列及其长度和计数的所需输出是:
Sequence Length Count
Step1->Step3->Process->System->Process 5 1
Step1->Step3->Process->System 4 1
Step3->Process->System->Process 4 1
Process->System->Process 3 2
Step1->Step3->Process 3 1
Step3->Process->System 3 1
Terminal->Step1->Step3 3 1
Step1->Step3 2 2
Process->System 2 2
System->Process 2 2
Step3->Process 2 1
Terminal->Step1 2 1
Process 1 4
Step1 1 2
Step3 1 2
System 1 2
Terminal 1 1
我只能找到子字符串的实现,而不是可以将多个列表作为输入的整个单词。
好的,所以您实际上可以重载 GetHashCode
和 Equals
来处理字符串,就像字符串中的字符一样。此外,创建列表段以防止运行时间被多个集合淹没也可能是合理的。
public class ListSegment<T>
{
private sealed class ListSegmentEqualityComparer : IEqualityComparer<ListSegment<T>>
{
public bool Equals(ListSegment<T> x, ListSegment<T> y)
{
if (x.Length != y.Length)
{
return false;
}
return x.Lst.Skip(x.Offset).Take(x.Length)
.SequenceEqual(y.Lst.Skip(y.Offset).Take(y.Length));
}
public int GetHashCode(ListSegment<T> obj)
{
unchecked
{
int hash = 17;
for (int i = obj.Offset; i < obj.Offset + obj.Length; i++)
{
hash = hash * 31 + obj.Lst[i].GetHashCode();
}
return hash;
}
}
}
public static IEqualityComparer<ListSegment<T>> Default { get; } = new ListSegmentEqualityComparer();
public List<T> Lst { get; set; }
public int Offset { get; set; }
public int Length { get; set; }
public IEnumerable<T> GetEnumerable()
{
return Lst.Skip(Offset).Take(Length);
}
public override string ToString()
{
return string.Join("->", GetEnumerable());
}
}
然后你运行通过列表列表计算出现次数
public List<KeyValuePair<ListSegment<string>, int>> GetOrderedPairs(List<List<string>> data)
{
var segmentsDictionary = new Dictionary<ListSegment<string>, int>(ListSegment<string>.Default);
foreach (var list in data)
{
for (int i = 0; i < list.Count; i++)
for (int j = i + 1; j <= list.Count; j++)
{
var segment = new ListSegment<string>
{
Lst = list,
Length = j-i,
Offset = i,
};
if (segmentsDictionary.TryGetValue(segment, out var val))
{
segmentsDictionary[segment] = val + 1;
}
else
{
segmentsDictionary[segment] = 1;
}
}
}
return segmentsDictionary.OrderByDescending(pair => pair.Key.Length).ToList();
}
待测试运行关注
List<string> listA = new List<string>() { "Step1", "Step3", "Process", "System", "Process" };
List<string> listB = new List<string>() { "Process", "System", "Process" };
List<string> listC = new List<string>() { "Terminal", "Step1", "Step3" };
var pairs = GetOrderedPairs(new List<List<string>>()
{
listA, listB, listC
});
foreach (var keyValuePair in pairs)
{
Console.WriteLine(keyValuePair.Key + " " + keyValuePair.Key.Length + " " + keyValuePair.Value);
}
使用一些扩展方法,您可以创建一个 IEQualityComparer
来比较 IEnumerable
序列。使用它,您可以使用 LINQ Distinct
按序列进行比较:
public static class IEnumerableExt {
public static IEnumerable<IEnumerable<T>> DistinctIE<T>(this IEnumerable<IEnumerable<T>> src) => src.Distinct(Make.IESequenceEqualityComparer<T>());
// IEnumerable<string>
public static string Join(this IEnumerable<string> src, string sep) => String.Join(sep, src);
}
public static class Make {
public static IEqualityComparer<IEnumerable<T>> IESequenceEqualityComparer<T>() => new IEnumerableSequenceEqualityComparer<T>();
public static IEqualityComparer<IEnumerable<T>> IESequenceEqualityComparer<T>(T _) => new IEnumerableSequenceEqualityComparer<T>();
public class IEnumerableSequenceEqualityComparer<T> : IEqualityComparer<IEnumerable<T>> {
public bool Equals(IEnumerable<T> x, IEnumerable<T> y) =>
Object.ReferenceEquals(x, y) || (x != null && y != null && (x.SequenceEqual(y)));
public int GetHashCode(IEnumerable<T> src) {
var hc = new HashCode();
foreach (var v in src)
hc.Add(v);
return hc.ToHashCode();
}
}
}
使用这些工具,您可以创建扩展方法来生成 List
的所有子序列和所有不同的子序列:
public static class ListExt {
public static IEnumerable<IEnumerable<T>> Subsequences<T>(this List<T> src) {
IEnumerable<T> Helper(int start, int end) {
for (int j3 = start; j3 <= end; ++j3)
yield return src[j3];
}
for (int j1 = 0; j1 < src.Count; ++j1) {
for (int j2 = j1; j2 < src.Count; ++j2)
yield return Helper(j1, j2);
}
}
public static IEnumerable<IEnumerable<T>> DistinctSubsequences<T>(this List<T> src) => src.Subsequences().DistinctIE();
}
现在您可以计算答案了。
首先,计算所有子序列并将它们组合起来:
var ssA = listA.DistinctSubsequences();
var ssB = listB.DistinctSubsequences();
var ssC = listC.DistinctSubsequences();
var ssAll = ssA.Concat(ssB).Concat(ssC).DistinctIE();
然后,创建一些用于计算出现次数的助手:
var hA = ssA.ToHashSet(Make.IESequenceEqualityComparer<string>());
var hB = ssB.ToHashSet(Make.IESequenceEqualityComparer<string>());
var hC = ssC.ToHashSet(Make.IESequenceEqualityComparer<string>());
Func<IEnumerable<string>, HashSet<IEnumerable<string>>, int> testIn = (s, h) => h.Contains(s) ? 1 : 0;
Func<IEnumerable<string>,int> countIn = s => testIn(s,hA)+testIn(s,hB)+testIn(s,hC);
最后,计算答案:
var ans = ssAll.Select(ss => new { Sequence = ss.Join("->"), Length = ss.Count(), Count = countIn(ss) }).OrderByDescending(sc => sc.Sequence.Length);