用于子串匹配的 Levenshtein 算法的 C# 实现

C# implementation of Levenshtein algorithm for substring matching

我正在尝试 Levenshtein distance 以获得 C# 实现,它不仅可以判断两个字符串是否相似,还可以找到相似的字符串(needle) 在一个更大的字符串中 (haystack).

为此,我尝试遵循 this excellent post 底部的建议,但遇到了一些问题。

首先,我采用了this implementation, changing it to fit my additional requirements. I also added some diagnostic dump support to let me understand the algorithm better, inspired by this other post

我的实现 returns 一个具有分数和(在请求时)索引和长度的对象,以及对用于诊断目的的计算矩阵的引用:

public class LevenshteinMatch
{
    public int Score { get; }
    public int Index { get; }
    public int Length { get; }
    public int[,] Matrix { get; set; }

    public LevenshteinMatch(int score, int index = 0, int length = 0)
    {
        Score = score;
        Index = index;
        Length = length;
    }

    public override string ToString()
    {
        return $"{Score} @{Index}x{Length}";
    }
}

这是我的实现:如果 sub 为假,Distance 方法“正常”工作;否则,它会找到一个相似的子串。 DumpMatrix 只是一种诊断辅助方法。

public static class Levenshtein
{
    public static string DumpMatrix(int[,] d, string a, string b)
    {
        if (d == null) throw new ArgumentNullException(nameof(d));
        if (a == null) throw new ArgumentNullException(nameof(a));
        if (b == null) throw new ArgumentNullException(nameof(b));

        //      #  k  i  t  t  e  n  
        //      00 01 02 03 04 05 06 
        // # 00 .. .. .. .. .. .. .. 
        // s 01 .. .. .. .. .. .. .. 
        // ...etc (sitting)

        StringBuilder sb = new StringBuilder();
        int n = a.Length;
        int m = b.Length;

        // b-legend
        sb.Append("     #  ");
        for (int j = 0; j < m; j++) sb.Append(b[j]).Append("  ");
        sb.AppendLine();
        sb.Append("     00 ");
        for (int j = 1; j < m; j++) sb.AppendFormat("{0:00}", j).Append(' ');
        sb.AppendFormat("{0:00} ", m).AppendLine();

        // matrix
        for (int i = 0; i <= n; i++)
        {
            // a-legend
            if (i == 0)
            {
                sb.Append("# 00 ");
            }
            else
            {
                sb.Append(a[i - 1])
                  .Append(' ')
                  .AppendFormat("{0:00}", i)
                  .Append(' ');
            }

            // row of values
            for (int j = 0; j <= m; j++)
                sb.AppendFormat("{0,2} ", d[i, j]);
            sb.AppendLine();
        }
        return sb.ToString();
    }

    private static LevenshteinMatch BuildMatch(string a, string b, int[,] d)
    {
        int n = a.Length;
        int m = b.Length;

        // take the min rightmost score instead of the bottom-right corner
        int min = 0, rightMinIndex = -1;
        for (int j = m; j > -1; j--)
        {
            if (rightMinIndex == -1 || d[n, j] < min)
            {
                min = d[n, j];
                rightMinIndex = j;
            }
        }

        // corner case: perfect match, just collect m chars from score=0
        if (min == 0)
        {
            return new LevenshteinMatch(min,
                rightMinIndex - n,
                n);
        }

        // collect all the lowest scores on the bottom row leftwards,
        // up to the length of the needle
        int count = n, leftMinIndex = rightMinIndex;
        while (leftMinIndex > -1)
        {
            if (d[n, leftMinIndex] == min && --count == 0) break;
            leftMinIndex--;
        }

        return new LevenshteinMatch(min,
            leftMinIndex - 1,
            rightMinIndex + 1 - leftMinIndex);
    }

    public static LevenshteinMatch Distance(string a, string b,
        bool sub = false, bool withMatrix = false)
    {
        if (a is null) throw new ArgumentNullException(nameof(a));
        if (b == null) throw new ArgumentNullException(nameof(b));

        int n = a.Length;
        int m = b.Length;
        int[,] d = new int[n + 1, m + 1];

        if (n == 0) return new LevenshteinMatch(m);
        if (m == 0) return new LevenshteinMatch(n);

        for (int i = 0; i <= n; i++) d[i, 0] = i;
        // if matching substring, leave the top row to 0
        if (!sub)
        {
            for (int j = 0; j <= m; j++) d[0, j] = j;
        }

        for (int j = 1; j <= m; j++)
        {
            for (int i = 1; i <= n; i++)
            {
                if (a[i - 1] == b[j - 1])
                {
                    d[i, j] = d[i - 1, j - 1];  // no operation
                }
                else
                {
                    d[i, j] = Math.Min(Math.Min(
                        d[i - 1, j] + 1,    // a deletion
                        d[i, j - 1] + 1),   // an insertion
                        d[i - 1, j - 1] + 1 // a substitution
                        );
                }
            }
        }

        LevenshteinMatch match = sub
            ? BuildMatch(a, b, d)
            : new LevenshteinMatch(d[n, m]);
        if (withMatrix) match.Matrix = d;
        return match;
    }
}

为了更完整,这里是使用它的演示控制台程序。这只是提示用户匹配模式(是否为子字符串)和两个字符串,然后调用 Distance 方法,转储结果矩阵,并在需要时显示子字符串。

internal static class Program
{
    private static string ReadLine(string defaultLine)
    {
        string s = Console.ReadLine();
        return string.IsNullOrEmpty(s) ? defaultLine ?? s : s;
    }

    private static void Main()
    {
        Console.WriteLine("Fuzzy Levenshtein Matcher");

        string a = "sitting", b = "kitten";
        bool sub = false;
        LevenshteinMatch match;

        while (true)
        {
            Console.Write("sub [y/n]? ");
            string yn = Console.ReadLine();
            if (!string.IsNullOrEmpty(yn)) sub = yn == "y" || yn == "Y";

            Console.Write(sub? $"needle ({a}): " : $"A ({a}): ");
            a = ReadLine(a);

            Console.Write(sub? $"haystack ({b}): " : $"B ({b}): ");
            b = ReadLine(b);

            match = Levenshtein.Distance(a, b, sub, true);
            Console.WriteLine($"{a} - {b}: {match}");
            Console.WriteLine(Levenshtein.DumpMatrix(match.Matrix, a, b));
            if (sub) Console.WriteLine(b.Substring(match.Index, match.Length));
        }
    }
}

现在,对于子字符串匹配,这适用于“c abba c”中的“aba”这样的情况。这是矩阵:

aba - c abba c: 1 @3x3
     #  c     a  b  b  a     c
     00 01 02 03 04 05 06 07 08
# 00  0  0  0  0  0  0  0  0  0
a 01  1  1  1  0  1  1  0  1  1
b 02  2  2  2  1  0  1  1  1  2
a 03  3  3  3  2  1  1  1  2  2

然而,在其他情况下,例如“ego sum abbas Cucaniensis”中的“abas”,我未能从底行收集到最低分数:

abas - ego sum abbas Cucaniensis: 1 @-2x15
     #  e  g  o     s  u  m     a  b  b  a  s     C  u  c  a  n  i  e  n  s  i  s
     00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
# 00  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
a 01  1  1  1  1  1  1  1  1  1  0  1  1  0  1  1  1  1  1  0  1  1  1  1  1  1  1
b 02  2  2  2  2  2  2  2  2  2  1  0  1  1  1  2  2  2  2  1  1  2  2  2  2  2  2
a 03  3  3  3  3  3  3  3  3  3  2  1  1  1  2  2  3  3  3  2  2  2  3  3  3  3  3
s 04  4  4  4  4  4  3  4  4  4  3  2  2  2  1  2  3  4  4  3  3  3  3  4  3  4  3

这里最后一行只有一个score=1。在完美匹配(分数=0)的情况下,我的代码只从最右边的最低分数开始取左边的 N 个字符(其中 N 是针的长度);但这里我的分数大于 0。可能我只是误解了上面 post 中的提示,因为我是这个算法的新手。谁能建议大海捞针的索引和长度的正确方法?

您从底行的最佳分数开始:(13,4) 处的 1

然后您会发现可以让您到达那里的前任状态和转换:

  • (12,4) - 不可能,因为它有更高的差异
  • (13,3) - 不可能,因为它有更高的差异
  • (12,3) - 相同的差异和字符匹配,所以这有效

从 (12,3) 开始,您按照相同的步骤到达 (11,2),然后到达 (10,1)

在 (10,1) 处字母不匹配,因此您不可能来自 (9,0)。您可以对类似的字符串“bas”使用 (10,0),或者对类似的字符串“abbas”使用 (9,1) 然后 (8,0),两者的距离均为 1.