计算 LevenshteinDistance,为不同的操作设置自定义成本
Calculate LevenshteinDistance, set custom cost for different operations
有没有办法为不同的操作设置自定义成本?喜欢:
Replace = 1
Insert = 1.5
Delete = 1.2
我找到的最接近的是 https://github.com/glienard/StringSimilarity.NET with Weighted Levenshtein,但我找不到如何实际使用它。
我的(简化)版本是这样的。
帮助classes:
[Flags]
public enum EditOperationKind {
None = 0,
Insert = 1,
Delete = 2,
Edit = Insert | Delete
}
public sealed class EditOperation<T>
: IEquatable<EditOperation<T>> {
internal string ToReport() {
switch (Kind) {
case EditOperationKind.None: return $"Keep '{Before}', ({Cost})";
case EditOperationKind.Insert: return $"Insert '{After}', ({Cost})";
case EditOperationKind.Delete: return $"Delete '{Before}', ({Cost})";
case EditOperationKind.Edit: return $"Edit '{Before}' into '{After}', ({Cost})";
default: return $"??? '{Before}' into '{After}', ({Cost})";
};
}
internal EditOperation(EditOperationKind kind, T before, T after, double cost)
: base() {
Kind = kind;
Before = before;
After = after;
Cost = cost;
}
public EditOperationKind Kind { get; }
public T Before { get; }
public T After { get; }
public double Cost { get; private set; }
public override string ToString() {
switch (Kind)
{
case EditOperationKind.None: return $"Keep '{Before}'";
case EditOperationKind.Insert: return $"Insert '{After}'";
case EditOperationKind.Delete: return $"Delete '{Before}'";
case EditOperationKind.Edit: return $"Edit '{Before}' into '{After}'";
default: return $"Unknown '{Before}' into '{After}'";
};
}
public static bool operator ==(EditOperation<T> left, EditOperation<T> right) {
if (ReferenceEquals(left, right))
return true;
else if (null == left || null == right)
return false;
return left.Equals(right);
}
public static bool operator !=(EditOperation<T> left, EditOperation<T> right) {
if (ReferenceEquals(left, right))
return false;
else if (null == left || null == right)
return true;
return !left.Equals(right);
}
public bool Equals(EditOperation<T> other) {
if (ReferenceEquals(this, other))
return true;
else if (null == other)
return false;
return object.Equals(this.Before, other.Before) &&
object.Equals(this.After, other.After) &&
this.Kind == other.Kind;
}
public override bool Equals(object obj) => Equals(obj as EditOperation<T>);
public override int GetHashCode() {
return (Before == null ? 0 : Before.GetHashCode()) ^
(After == null ? 0 : After.GetHashCode()) ^
(int)Kind;
}
}
主要class:
public sealed class EditProcedure<T> : IReadOnlyList<EditOperation<T>> {
private List<EditOperation<T>> m_Sequence;
private void CorePerform(T[] source,
T[] target,
Func<T, double> insertCost,
Func<T, double> deleteCost,
Func<T, T, double> editCost) {
// Best operation (among insert, update, delete) to perform
EditOperationKind[][] M = Enumerable
.Range(0, source.Length + 1)
.Select(line => new EditOperationKind[target.Length + 1])
.ToArray();
// Minimum cost so far
double[][] D = Enumerable
.Range(0, source.Length + 1)
.Select(line => new double[target.Length + 1])
.ToArray();
// Edge: all removes
double sum = 0.0;
for (int i = 1; i <= source.Length; ++i) {
M[i][0] = EditOperationKind.Delete;
D[i][0] = (sum += deleteCost(source[i - 1]));
}
// Edge: all inserts
sum = 0.0;
for (int i = 1; i <= target.Length; ++i) {
M[0][i] = EditOperationKind.Insert;
D[0][i] = (sum += insertCost(target[i - 1]));
}
// Having fit N - 1, K - 1 characters let's fit N, K
for (int i = 1; i <= source.Length; ++i)
for (int j = 1; j <= target.Length; ++j) {
// here we choose the operation with the least cost
double insert = D[i][j - 1] + insertCost(target[j - 1]);
double delete = D[i - 1][j] + deleteCost(source[i - 1]);
double edit = D[i - 1][j - 1] + editCost(source[i - 1], target[j - 1]);
double min = Math.Min(Math.Min(insert, delete), edit);
if (min == insert)
M[i][j] = EditOperationKind.Insert;
else if (min == delete)
M[i][j] = EditOperationKind.Delete;
else if (min == edit)
M[i][j] = object.Equals(source[i - 1], target[j - 1])
? EditOperationKind.None
: EditOperationKind.Edit;
D[i][j] = min;
}
EditDistance = D[source.Length][target.Length];
// Backward: knowing scores (D) and actions (M) let's building edit sequence
m_Sequence =
new List<EditOperation<T>>(source.Length + target.Length);
for (int x = target.Length, y = source.Length; (x > 0) || (y > 0);) {
EditOperationKind op = M[y][x];
if (op == EditOperationKind.Insert) {
x -= 1;
m_Sequence.Add(new EditOperation<T>(op, default, target[x], D[y][x + 1] - D[y][x]));
}
else if (op == EditOperationKind.Delete) {
y -= 1;
m_Sequence.Add(new EditOperation<T>(op, source[y], default, D[y + 1][x] - D[y][x]));
}
else if (op == EditOperationKind.Edit || op == EditOperationKind.None) {
x -= 1;
y -= 1;
m_Sequence.Add(new EditOperation<T>(op, source[y], target[x], D[y + 1][x + 1] - D[y][x]));
}
else // Start of the matching (EditOperationKind.None)
break;
}
m_Sequence.Reverse();
}
public EditProcedure(IEnumerable<T> source,
IEnumerable<T> target,
Func<T, double> insertCost,
Func<T, double> deleteCost,
Func<T, T, double> editCost) {
if (null == source)
throw new ArgumentNullException(nameof(source));
else if (null == target)
throw new ArgumentNullException(nameof(target));
else if (null == insertCost)
throw new ArgumentNullException(nameof(insertCost));
else if (null == deleteCost)
throw new ArgumentNullException(nameof(deleteCost));
else if (null == editCost)
throw new ArgumentNullException(nameof(editCost));
CorePerform(source.ToArray(),
target.ToArray(),
insertCost,
deleteCost,
editCost);
}
public double EditDistance { get; private set; }
public IReadOnlyList<EditOperation<T>> EditSequence => m_Sequence;
public override string ToString() {
return string.Join(Environment.NewLine,
$"Distance: {EditDistance}",
$"Sequence ({m_Sequence.Count} steps):",
string.Join(Environment.NewLine, m_Sequence
.Select(item => $" {item.ToReport()}")));
}
public int Count => m_Sequence.Count;
public EditOperation<T> this[int index] => m_Sequence[index];
public IEnumerator<EditOperation<T>> GetEnumerator() => m_Sequence.GetEnumerator();
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() => m_Sequence.GetEnumerator();
}
演示:
string source = "abracadabra";
string target = "alakazam";
var edit = new EditProcedure<char>(
source,
target,
insertChar => 1.5,
deleteChar => 1.2,
(fromChar, toChar) => fromChar == toChar ? 0 : 1.0);
Console.WriteLine(edit.EditDistance);
Console.WriteLine();
Console.WriteLine(edit.ToString());
结果:
7.6
Distance: 7.6
Sequence (11 steps):
Keep 'a', (0)
Edit 'b' into 'l', (1)
Delete 'r', (1.2)
Keep 'a', (0)
Edit 'c' into 'k', (1)
Keep 'a', (0)
Edit 'd' into 'z', (1)
Keep 'a', (0)
Edit 'b' into 'm', (1)
Delete 'r', (1.2)
Delete 'a', (1.2)
有人发布了一个答案,只是在 1 小时后将其删除。我稍微修改了一下,结果如下:
public class Levenshtein
{
public double ReplaceCost { get; set; } = 1;
public double DeleteCost { get; set; } = 1;
public double InsertCost { get; set; } = 1;
public double Distance(string source, string target, bool caseSensitive = true)
{
if (!caseSensitive)
{
source = source.ToUpperInvariant();
target = target.ToUpperInvariant();
}
int xLength = source.Length;
int yLength = target.Length;
//short circuit
if (xLength == 0) return yLength;
if (yLength == 0) return xLength;
//create path matrix
var d = new double[xLength + 1, yLength + 1];
for (int i = 0; i <= xLength; i++)
{
d[i, 0] = i;
}
for (int j = 0; j <= yLength; j++)
{
d[0, j] = j;
}
//navigate best route
for (int i = 1; i <= xLength; i++)
{
for (int j = 1; j <= yLength; j++)
{
double diagCost = (target[j - 1] == source[i - 1]) ? 0 : ReplaceCost;
double diagMovement = d[i - 1, j - 1] + diagCost; //replace
double horzMovement = d[i - 1, j] + DeleteCost; //delete
double vertMovement = d[i, j - 1] + InsertCost; //insert (does not appear in source)
d[i, j] = Math.Min(Math.Min(horzMovement, vertMovement), diagMovement);
}
}
return d[xLength, yLength];
}
}
有没有办法为不同的操作设置自定义成本?喜欢:
Replace = 1
Insert = 1.5
Delete = 1.2
我找到的最接近的是 https://github.com/glienard/StringSimilarity.NET with Weighted Levenshtein,但我找不到如何实际使用它。
我的(简化)版本是这样的。
帮助classes:
[Flags]
public enum EditOperationKind {
None = 0,
Insert = 1,
Delete = 2,
Edit = Insert | Delete
}
public sealed class EditOperation<T>
: IEquatable<EditOperation<T>> {
internal string ToReport() {
switch (Kind) {
case EditOperationKind.None: return $"Keep '{Before}', ({Cost})";
case EditOperationKind.Insert: return $"Insert '{After}', ({Cost})";
case EditOperationKind.Delete: return $"Delete '{Before}', ({Cost})";
case EditOperationKind.Edit: return $"Edit '{Before}' into '{After}', ({Cost})";
default: return $"??? '{Before}' into '{After}', ({Cost})";
};
}
internal EditOperation(EditOperationKind kind, T before, T after, double cost)
: base() {
Kind = kind;
Before = before;
After = after;
Cost = cost;
}
public EditOperationKind Kind { get; }
public T Before { get; }
public T After { get; }
public double Cost { get; private set; }
public override string ToString() {
switch (Kind)
{
case EditOperationKind.None: return $"Keep '{Before}'";
case EditOperationKind.Insert: return $"Insert '{After}'";
case EditOperationKind.Delete: return $"Delete '{Before}'";
case EditOperationKind.Edit: return $"Edit '{Before}' into '{After}'";
default: return $"Unknown '{Before}' into '{After}'";
};
}
public static bool operator ==(EditOperation<T> left, EditOperation<T> right) {
if (ReferenceEquals(left, right))
return true;
else if (null == left || null == right)
return false;
return left.Equals(right);
}
public static bool operator !=(EditOperation<T> left, EditOperation<T> right) {
if (ReferenceEquals(left, right))
return false;
else if (null == left || null == right)
return true;
return !left.Equals(right);
}
public bool Equals(EditOperation<T> other) {
if (ReferenceEquals(this, other))
return true;
else if (null == other)
return false;
return object.Equals(this.Before, other.Before) &&
object.Equals(this.After, other.After) &&
this.Kind == other.Kind;
}
public override bool Equals(object obj) => Equals(obj as EditOperation<T>);
public override int GetHashCode() {
return (Before == null ? 0 : Before.GetHashCode()) ^
(After == null ? 0 : After.GetHashCode()) ^
(int)Kind;
}
}
主要class:
public sealed class EditProcedure<T> : IReadOnlyList<EditOperation<T>> {
private List<EditOperation<T>> m_Sequence;
private void CorePerform(T[] source,
T[] target,
Func<T, double> insertCost,
Func<T, double> deleteCost,
Func<T, T, double> editCost) {
// Best operation (among insert, update, delete) to perform
EditOperationKind[][] M = Enumerable
.Range(0, source.Length + 1)
.Select(line => new EditOperationKind[target.Length + 1])
.ToArray();
// Minimum cost so far
double[][] D = Enumerable
.Range(0, source.Length + 1)
.Select(line => new double[target.Length + 1])
.ToArray();
// Edge: all removes
double sum = 0.0;
for (int i = 1; i <= source.Length; ++i) {
M[i][0] = EditOperationKind.Delete;
D[i][0] = (sum += deleteCost(source[i - 1]));
}
// Edge: all inserts
sum = 0.0;
for (int i = 1; i <= target.Length; ++i) {
M[0][i] = EditOperationKind.Insert;
D[0][i] = (sum += insertCost(target[i - 1]));
}
// Having fit N - 1, K - 1 characters let's fit N, K
for (int i = 1; i <= source.Length; ++i)
for (int j = 1; j <= target.Length; ++j) {
// here we choose the operation with the least cost
double insert = D[i][j - 1] + insertCost(target[j - 1]);
double delete = D[i - 1][j] + deleteCost(source[i - 1]);
double edit = D[i - 1][j - 1] + editCost(source[i - 1], target[j - 1]);
double min = Math.Min(Math.Min(insert, delete), edit);
if (min == insert)
M[i][j] = EditOperationKind.Insert;
else if (min == delete)
M[i][j] = EditOperationKind.Delete;
else if (min == edit)
M[i][j] = object.Equals(source[i - 1], target[j - 1])
? EditOperationKind.None
: EditOperationKind.Edit;
D[i][j] = min;
}
EditDistance = D[source.Length][target.Length];
// Backward: knowing scores (D) and actions (M) let's building edit sequence
m_Sequence =
new List<EditOperation<T>>(source.Length + target.Length);
for (int x = target.Length, y = source.Length; (x > 0) || (y > 0);) {
EditOperationKind op = M[y][x];
if (op == EditOperationKind.Insert) {
x -= 1;
m_Sequence.Add(new EditOperation<T>(op, default, target[x], D[y][x + 1] - D[y][x]));
}
else if (op == EditOperationKind.Delete) {
y -= 1;
m_Sequence.Add(new EditOperation<T>(op, source[y], default, D[y + 1][x] - D[y][x]));
}
else if (op == EditOperationKind.Edit || op == EditOperationKind.None) {
x -= 1;
y -= 1;
m_Sequence.Add(new EditOperation<T>(op, source[y], target[x], D[y + 1][x + 1] - D[y][x]));
}
else // Start of the matching (EditOperationKind.None)
break;
}
m_Sequence.Reverse();
}
public EditProcedure(IEnumerable<T> source,
IEnumerable<T> target,
Func<T, double> insertCost,
Func<T, double> deleteCost,
Func<T, T, double> editCost) {
if (null == source)
throw new ArgumentNullException(nameof(source));
else if (null == target)
throw new ArgumentNullException(nameof(target));
else if (null == insertCost)
throw new ArgumentNullException(nameof(insertCost));
else if (null == deleteCost)
throw new ArgumentNullException(nameof(deleteCost));
else if (null == editCost)
throw new ArgumentNullException(nameof(editCost));
CorePerform(source.ToArray(),
target.ToArray(),
insertCost,
deleteCost,
editCost);
}
public double EditDistance { get; private set; }
public IReadOnlyList<EditOperation<T>> EditSequence => m_Sequence;
public override string ToString() {
return string.Join(Environment.NewLine,
$"Distance: {EditDistance}",
$"Sequence ({m_Sequence.Count} steps):",
string.Join(Environment.NewLine, m_Sequence
.Select(item => $" {item.ToReport()}")));
}
public int Count => m_Sequence.Count;
public EditOperation<T> this[int index] => m_Sequence[index];
public IEnumerator<EditOperation<T>> GetEnumerator() => m_Sequence.GetEnumerator();
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() => m_Sequence.GetEnumerator();
}
演示:
string source = "abracadabra";
string target = "alakazam";
var edit = new EditProcedure<char>(
source,
target,
insertChar => 1.5,
deleteChar => 1.2,
(fromChar, toChar) => fromChar == toChar ? 0 : 1.0);
Console.WriteLine(edit.EditDistance);
Console.WriteLine();
Console.WriteLine(edit.ToString());
结果:
7.6
Distance: 7.6
Sequence (11 steps):
Keep 'a', (0)
Edit 'b' into 'l', (1)
Delete 'r', (1.2)
Keep 'a', (0)
Edit 'c' into 'k', (1)
Keep 'a', (0)
Edit 'd' into 'z', (1)
Keep 'a', (0)
Edit 'b' into 'm', (1)
Delete 'r', (1.2)
Delete 'a', (1.2)
有人发布了一个答案,只是在 1 小时后将其删除。我稍微修改了一下,结果如下:
public class Levenshtein
{
public double ReplaceCost { get; set; } = 1;
public double DeleteCost { get; set; } = 1;
public double InsertCost { get; set; } = 1;
public double Distance(string source, string target, bool caseSensitive = true)
{
if (!caseSensitive)
{
source = source.ToUpperInvariant();
target = target.ToUpperInvariant();
}
int xLength = source.Length;
int yLength = target.Length;
//short circuit
if (xLength == 0) return yLength;
if (yLength == 0) return xLength;
//create path matrix
var d = new double[xLength + 1, yLength + 1];
for (int i = 0; i <= xLength; i++)
{
d[i, 0] = i;
}
for (int j = 0; j <= yLength; j++)
{
d[0, j] = j;
}
//navigate best route
for (int i = 1; i <= xLength; i++)
{
for (int j = 1; j <= yLength; j++)
{
double diagCost = (target[j - 1] == source[i - 1]) ? 0 : ReplaceCost;
double diagMovement = d[i - 1, j - 1] + diagCost; //replace
double horzMovement = d[i - 1, j] + DeleteCost; //delete
double vertMovement = d[i, j - 1] + InsertCost; //insert (does not appear in source)
d[i, j] = Math.Min(Math.Min(horzMovement, vertMovement), diagMovement);
}
}
return d[xLength, yLength];
}
}