计算 LevenshteinDistance,为不同的操作设置自定义成本

Calculate LevenshteinDistance, set custom cost for different operations

有没有办法为不同的操作设置自定义成本?喜欢:

Replace = 1
Insert = 1.5
Delete = 1.2

我找到的最接近的是 https://github.com/glienard/StringSimilarity.NET with Weighted Levenshtein,但我找不到如何实际使用它。

我的(简化)版本是这样的。

帮助classes:

  [Flags]
  public enum EditOperationKind {
    None = 0,
    Insert = 1,
    Delete = 2,
    Edit = Insert | Delete
  }

  public sealed class EditOperation<T>
    : IEquatable<EditOperation<T>> {

    internal string ToReport() {
      switch (Kind) {
        case EditOperationKind.None: return $"Keep   '{Before}', ({Cost})";
        case EditOperationKind.Insert: return $"Insert '{After}', ({Cost})";
        case EditOperationKind.Delete: return $"Delete '{Before}', ({Cost})";
        case EditOperationKind.Edit: return $"Edit   '{Before}' into '{After}', ({Cost})";
        default: return $"???    '{Before}' into '{After}', ({Cost})";
      };
    }

    internal EditOperation(EditOperationKind kind, T before, T after, double cost)
      : base() {

      Kind = kind;
      Before = before;
      After = after;
      Cost = cost;
    }

    public EditOperationKind Kind { get; }
    public T Before { get; }
    public T After { get; }
    public double Cost { get; private set;    }

    public override string ToString() {
      switch (Kind)
      {
        case EditOperationKind.None: return $"Keep '{Before}'";
        case EditOperationKind.Insert: return $"Insert '{After}'";
        case EditOperationKind.Delete: return $"Delete '{Before}'";
        case EditOperationKind.Edit: return $"Edit '{Before}' into '{After}'";
        default: return $"Unknown '{Before}' into '{After}'";
      };
    }

    public static bool operator ==(EditOperation<T> left, EditOperation<T> right) {
      if (ReferenceEquals(left, right))
        return true;
      else if (null == left || null == right)
        return false;

      return left.Equals(right);
    }

    public static bool operator !=(EditOperation<T> left, EditOperation<T> right) {
      if (ReferenceEquals(left, right))
        return false;
      else if (null == left || null == right)
        return true;

      return !left.Equals(right);
    }

    public bool Equals(EditOperation<T> other) {
      if (ReferenceEquals(this, other))
        return true;
      else if (null == other)
        return false;

      return object.Equals(this.Before, other.Before) &&
             object.Equals(this.After, other.After) &&
             this.Kind == other.Kind;
    }

    public override bool Equals(object obj) => Equals(obj as EditOperation<T>);
    
    public override int GetHashCode() {
      return (Before == null ? 0 : Before.GetHashCode()) ^
             (After == null ? 0 : After.GetHashCode()) ^
             (int)Kind;
    }
  }

主要class:

public sealed class EditProcedure<T> : IReadOnlyList<EditOperation<T>> {
    private List<EditOperation<T>> m_Sequence;

    private void CorePerform(T[] source,
                             T[] target,
                             Func<T, double> insertCost,
                             Func<T, double> deleteCost,
                             Func<T, T, double> editCost) {
      // Best operation (among insert, update, delete) to perform 
      EditOperationKind[][] M = Enumerable
        .Range(0, source.Length + 1)
        .Select(line => new EditOperationKind[target.Length + 1])
        .ToArray();

      // Minimum cost so far
      double[][] D = Enumerable
        .Range(0, source.Length + 1)
        .Select(line => new double[target.Length + 1])
        .ToArray();

      // Edge: all removes
      double sum = 0.0;

      for (int i = 1; i <= source.Length; ++i) {
        M[i][0] = EditOperationKind.Delete;
        D[i][0] = (sum += deleteCost(source[i - 1]));
      }

      // Edge: all inserts
      sum = 0.0;

      for (int i = 1; i <= target.Length; ++i) {
        M[0][i] = EditOperationKind.Insert;
        D[0][i] = (sum += insertCost(target[i - 1]));
      }

      // Having fit N - 1, K - 1 characters let's fit N, K
      for (int i = 1; i <= source.Length; ++i)
        for (int j = 1; j <= target.Length; ++j) {
          // here we choose the operation with the least cost
          double insert = D[i][j - 1] + insertCost(target[j - 1]);
          double delete = D[i - 1][j] + deleteCost(source[i - 1]);
          double edit = D[i - 1][j - 1] + editCost(source[i - 1], target[j - 1]);

          double min = Math.Min(Math.Min(insert, delete), edit);

          if (min == insert)
            M[i][j] = EditOperationKind.Insert;
          else if (min == delete)
            M[i][j] = EditOperationKind.Delete;
          else if (min == edit)
            M[i][j] = object.Equals(source[i - 1], target[j - 1])
              ? EditOperationKind.None
              : EditOperationKind.Edit;

          D[i][j] = min;
        }

      EditDistance = D[source.Length][target.Length];

      // Backward: knowing scores (D) and actions (M) let's building edit sequence
      m_Sequence =
        new List<EditOperation<T>>(source.Length + target.Length);

      for (int x = target.Length, y = source.Length; (x > 0) || (y > 0);) {
        EditOperationKind op = M[y][x];

        if (op == EditOperationKind.Insert) {
          x -= 1;
          m_Sequence.Add(new EditOperation<T>(op, default, target[x], D[y][x + 1] - D[y][x]));
        }
        else if (op == EditOperationKind.Delete) {
          y -= 1;
          m_Sequence.Add(new EditOperation<T>(op, source[y], default, D[y + 1][x] - D[y][x]));
        }
        else if (op == EditOperationKind.Edit || op == EditOperationKind.None) {
          x -= 1;
          y -= 1;
          m_Sequence.Add(new EditOperation<T>(op, source[y], target[x], D[y + 1][x + 1] - D[y][x]));
        }
        else // Start of the matching (EditOperationKind.None)
          break;
      }

      m_Sequence.Reverse();
    }

    public EditProcedure(IEnumerable<T> source,
                         IEnumerable<T> target,
                         Func<T, double> insertCost,
                         Func<T, double> deleteCost,
                         Func<T, T, double> editCost) {
      if (null == source)
        throw new ArgumentNullException(nameof(source));
      else if (null == target)
        throw new ArgumentNullException(nameof(target));
      else if (null == insertCost)
        throw new ArgumentNullException(nameof(insertCost));
      else if (null == deleteCost)
        throw new ArgumentNullException(nameof(deleteCost));
      else if (null == editCost)
        throw new ArgumentNullException(nameof(editCost));

      CorePerform(source.ToArray(),
                  target.ToArray(),
                  insertCost,
                  deleteCost,
                  editCost);
    }

    public double EditDistance { get; private set; }

    public IReadOnlyList<EditOperation<T>> EditSequence => m_Sequence;
      
    public override string ToString() {
      return string.Join(Environment.NewLine,
        $"Distance: {EditDistance}",
        $"Sequence ({m_Sequence.Count} steps):",
          string.Join(Environment.NewLine, m_Sequence
            .Select(item => $"  {item.ToReport()}")));
    }
        
    public int Count => m_Sequence.Count;

    public EditOperation<T> this[int index] => m_Sequence[index];

    public IEnumerator<EditOperation<T>> GetEnumerator() => m_Sequence.GetEnumerator();

    System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() => m_Sequence.GetEnumerator();
  }

演示:

  string source = "abracadabra";
  string target = "alakazam";

  var edit = new EditProcedure<char>(
    source, 
    target,
    insertChar => 1.5,
    deleteChar => 1.2,
    (fromChar, toChar) => fromChar == toChar ? 0 : 1.0);

  Console.WriteLine(edit.EditDistance);
  Console.WriteLine();
  Console.WriteLine(edit.ToString());

结果:

7.6

Distance: 7.6
Sequence (11 steps):
  Keep   'a', (0)
  Edit   'b' into 'l', (1)
  Delete 'r', (1.2)
  Keep   'a', (0)
  Edit   'c' into 'k', (1)
  Keep   'a', (0)
  Edit   'd' into 'z', (1)
  Keep   'a', (0)
  Edit   'b' into 'm', (1)
  Delete 'r', (1.2)
  Delete 'a', (1.2)

有人发布了一个答案,只是在 1 小时后将其删除。我稍微修改了一下,结果如下:

    public class Levenshtein
    {
        public double ReplaceCost { get; set; } = 1;
        public double DeleteCost { get; set; } = 1;
        public double InsertCost { get; set; } = 1;

        public double Distance(string source, string target, bool caseSensitive = true)
        {
            if (!caseSensitive)
            {
                source = source.ToUpperInvariant();
                target = target.ToUpperInvariant();
            }

            int xLength = source.Length;
            int yLength = target.Length;

            //short circuit
            if (xLength == 0) return yLength;
            if (yLength == 0) return xLength;

            //create path matrix
            var d = new double[xLength + 1, yLength + 1];
            for (int i = 0; i <= xLength; i++)
            {
                d[i, 0] = i;
            }
            for (int j = 0; j <= yLength; j++)
            {
                d[0, j] = j;
            }

            //navigate best route
            for (int i = 1; i <= xLength; i++)
            {
                for (int j = 1; j <= yLength; j++)
                {
                    double diagCost = (target[j - 1] == source[i - 1]) ? 0 : ReplaceCost;
                    double diagMovement = d[i - 1, j - 1] + diagCost; //replace
                    double horzMovement = d[i - 1, j] + DeleteCost; //delete
                    double vertMovement = d[i, j - 1] + InsertCost; //insert (does not appear in source)
                    d[i, j] = Math.Min(Math.Min(horzMovement, vertMovement), diagMovement);
                }
            }
            return d[xLength, yLength];
        }
    }