为什么我得到不正确的 Levenshtein 距离?
Why Am I Getting The Incorrect Levenshtein Distance?
我正在尝试在 C# 中实现 Levenshtein 距离算法(用于练习,因为它很方便)。我使用了 Wikipedia page 的实现,但由于某种原因,我在一组单词上的距离错误。这是代码(来自 LinqPad):
void Main()
{
var ld = new LevenshteinDistance();
int dist = ld.LevenshteinDistanceCalc("sitting","kitten");
dist.Dump();
}
// Define other methods and classes here
public class LevenshteinDistance
{
private int[,] distance;
public int LevenshteinDistanceCalc(string source, string target)
{
int sourceSize = source.Length, targetSize = target.Length;
distance = new int[sourceSize, targetSize];
for (int sIndex = 0; sIndex < sourceSize; sIndex++)
{
distance[sIndex, 0] = sIndex;
}
for (int tIndex = 0; tIndex < targetSize; tIndex++)
{
distance[0,tIndex] = tIndex;
}
// for j from 1 to n:
// for i from 1 to m:
// if s[i] = t[j]:
// substitutionCost:= 0
// else:
// substitutionCost:= 1
// d[i, j] := minimum(d[i - 1, j] + 1, // deletion
// d[i, j - 1] + 1, // insertion
// d[i - 1, j - 1] + substitutionCost) // substitution
//
//
// return d[m, n]
for (int tIndex = 1; tIndex < targetSize; tIndex++)
{
for (int sIndex = 1; sIndex < sourceSize; sIndex++)
{
int substitutionCost = source[sIndex] == target[tIndex] ? 0 : 1;
int deletion = distance[sIndex-1, tIndex]+1;
int insertion = distance[sIndex,tIndex-1]+1;
int substitution = distance[sIndex-1, tIndex-1] + substitutionCost;
distance[sIndex, tIndex] = leastOfThree(deletion, insertion, substitution);
}
}
return distance[sourceSize-1,targetSize-1];
}
private int leastOfThree(int a, int b, int c)
{
return Math.Min(a,(Math.Min(b,c)));
}
}
当我尝试 "sitting" 和 "kitten" 时,我得到的 LD 为 2(应该是 3)。然而,当我尝试 "Saturday" 和 "Sunday" 时,我得到的 LD 为 3(这是正确的)。我知道出了什么问题,但我不知道我错过了什么。
维基百科上的示例使用从 1 开始的字符串。在 C# 中,我们使用基于 0 的字符串。
在他们的矩阵中确实存在 0 行和 0 列。所以他们矩阵的大小是 [source.Length + 1, source.Length + 1] 在你的代码中它不存在。
public int LevenshteinDistanceCalc(string source, string target)
{
int sourceSize = source.Length, targetSize = target.Length;
distance = new int[sourceSize + 1, targetSize + 1];
for (int sIndex = 1; sIndex <= sourceSize; sIndex++)
distance[sIndex, 0] = sIndex;
for (int tIndex = 1; tIndex <= targetSize; tIndex++)
distance[0, tIndex] = tIndex;
for (int tIndex = 1; tIndex <= targetSize; tIndex++)
{
for (int sIndex = 1; sIndex <= sourceSize; sIndex++)
{
int substitutionCost = source[sIndex-1] == target[tIndex-1] ? 0 : 1;
int deletion = distance[sIndex - 1, tIndex] + 1;
int insertion = distance[sIndex, tIndex - 1] + 1;
int substitution = distance[sIndex - 1, tIndex - 1] + substitutionCost;
distance[sIndex, tIndex] = leastOfThree(deletion, insertion, substitution);
}
}
return distance[sourceSize, targetSize];
}
你的矩阵不够大。
在pseudo-code中,s
和t
的长度分别为m
和n
(char s[1..m], char t[1..n]
)。然而,该矩阵具有维度 [0..m, 0..n]
- 即比每个方向上的字符串长度多一个。您可以在 pseudo-code.
下方的 table 中看到它
所以 "sitting" 和 "kitten" 的矩阵是 7x8,但你的矩阵只有 6x7。
您还对字符串进行了错误的索引,因为 pseudo-code 中的字符串是 1 索引的,而 C# 的字符串是 0 索引的。
修复这些后,您将获得此代码,它适用于 "sitting" 和 "kitten":
public static class LevenshteinDistance
{
public static int LevenshteinDistanceCalc(string source, string target)
{
int sourceSize = source.Length + 1, targetSize = target.Length + 1;
int[,] distance = new int[sourceSize, targetSize];
for (int sIndex = 0; sIndex < sourceSize; sIndex++)
{
distance[sIndex, 0] = sIndex;
}
for (int tIndex = 0; tIndex < targetSize; tIndex++)
{
distance[0, tIndex] = tIndex;
}
// for j from 1 to n:
// for i from 1 to m:
// if s[i] = t[j]:
// substitutionCost:= 0
// else:
// substitutionCost:= 1
// d[i, j] := minimum(d[i - 1, j] + 1, // deletion
// d[i, j - 1] + 1, // insertion
// d[i - 1, j - 1] + substitutionCost) // substitution
//
//
// return d[m, n]
for (int tIndex = 1; tIndex < targetSize; tIndex++)
{
for (int sIndex = 1; sIndex < sourceSize; sIndex++)
{
int substitutionCost = source[sIndex - 1] == target[tIndex - 1] ? 0 : 1;
int deletion = distance[sIndex - 1, tIndex] + 1;
int insertion = distance[sIndex, tIndex - 1] + 1;
int substitution = distance[sIndex - 1, tIndex - 1] + substitutionCost;
distance[sIndex, tIndex] = leastOfThree(deletion, insertion, substitution);
}
}
return distance[sourceSize - 1, targetSize - 1];
}
private static int leastOfThree(int a, int b, int c)
{
return Math.Min(a, (Math.Min(b, c)));
}
}
(我还冒昧地将 distance
设为局部变量,因为它不需要成为一个字段(它只会让你的 class non-threadsafe),并且将其设为静态以避免不必要的实例化)。
为了调试它,我在 return distance[sourceSize - 1, targetSize - 1]
上放置了一个断点,并将 distance
与维基百科上的 table 进行了比较。很明显是太小了
我正在尝试在 C# 中实现 Levenshtein 距离算法(用于练习,因为它很方便)。我使用了 Wikipedia page 的实现,但由于某种原因,我在一组单词上的距离错误。这是代码(来自 LinqPad):
void Main()
{
var ld = new LevenshteinDistance();
int dist = ld.LevenshteinDistanceCalc("sitting","kitten");
dist.Dump();
}
// Define other methods and classes here
public class LevenshteinDistance
{
private int[,] distance;
public int LevenshteinDistanceCalc(string source, string target)
{
int sourceSize = source.Length, targetSize = target.Length;
distance = new int[sourceSize, targetSize];
for (int sIndex = 0; sIndex < sourceSize; sIndex++)
{
distance[sIndex, 0] = sIndex;
}
for (int tIndex = 0; tIndex < targetSize; tIndex++)
{
distance[0,tIndex] = tIndex;
}
// for j from 1 to n:
// for i from 1 to m:
// if s[i] = t[j]:
// substitutionCost:= 0
// else:
// substitutionCost:= 1
// d[i, j] := minimum(d[i - 1, j] + 1, // deletion
// d[i, j - 1] + 1, // insertion
// d[i - 1, j - 1] + substitutionCost) // substitution
//
//
// return d[m, n]
for (int tIndex = 1; tIndex < targetSize; tIndex++)
{
for (int sIndex = 1; sIndex < sourceSize; sIndex++)
{
int substitutionCost = source[sIndex] == target[tIndex] ? 0 : 1;
int deletion = distance[sIndex-1, tIndex]+1;
int insertion = distance[sIndex,tIndex-1]+1;
int substitution = distance[sIndex-1, tIndex-1] + substitutionCost;
distance[sIndex, tIndex] = leastOfThree(deletion, insertion, substitution);
}
}
return distance[sourceSize-1,targetSize-1];
}
private int leastOfThree(int a, int b, int c)
{
return Math.Min(a,(Math.Min(b,c)));
}
}
当我尝试 "sitting" 和 "kitten" 时,我得到的 LD 为 2(应该是 3)。然而,当我尝试 "Saturday" 和 "Sunday" 时,我得到的 LD 为 3(这是正确的)。我知道出了什么问题,但我不知道我错过了什么。
维基百科上的示例使用从 1 开始的字符串。在 C# 中,我们使用基于 0 的字符串。
在他们的矩阵中确实存在 0 行和 0 列。所以他们矩阵的大小是 [source.Length + 1, source.Length + 1] 在你的代码中它不存在。
public int LevenshteinDistanceCalc(string source, string target)
{
int sourceSize = source.Length, targetSize = target.Length;
distance = new int[sourceSize + 1, targetSize + 1];
for (int sIndex = 1; sIndex <= sourceSize; sIndex++)
distance[sIndex, 0] = sIndex;
for (int tIndex = 1; tIndex <= targetSize; tIndex++)
distance[0, tIndex] = tIndex;
for (int tIndex = 1; tIndex <= targetSize; tIndex++)
{
for (int sIndex = 1; sIndex <= sourceSize; sIndex++)
{
int substitutionCost = source[sIndex-1] == target[tIndex-1] ? 0 : 1;
int deletion = distance[sIndex - 1, tIndex] + 1;
int insertion = distance[sIndex, tIndex - 1] + 1;
int substitution = distance[sIndex - 1, tIndex - 1] + substitutionCost;
distance[sIndex, tIndex] = leastOfThree(deletion, insertion, substitution);
}
}
return distance[sourceSize, targetSize];
}
你的矩阵不够大。
在pseudo-code中,s
和t
的长度分别为m
和n
(char s[1..m], char t[1..n]
)。然而,该矩阵具有维度 [0..m, 0..n]
- 即比每个方向上的字符串长度多一个。您可以在 pseudo-code.
所以 "sitting" 和 "kitten" 的矩阵是 7x8,但你的矩阵只有 6x7。
您还对字符串进行了错误的索引,因为 pseudo-code 中的字符串是 1 索引的,而 C# 的字符串是 0 索引的。
修复这些后,您将获得此代码,它适用于 "sitting" 和 "kitten":
public static class LevenshteinDistance
{
public static int LevenshteinDistanceCalc(string source, string target)
{
int sourceSize = source.Length + 1, targetSize = target.Length + 1;
int[,] distance = new int[sourceSize, targetSize];
for (int sIndex = 0; sIndex < sourceSize; sIndex++)
{
distance[sIndex, 0] = sIndex;
}
for (int tIndex = 0; tIndex < targetSize; tIndex++)
{
distance[0, tIndex] = tIndex;
}
// for j from 1 to n:
// for i from 1 to m:
// if s[i] = t[j]:
// substitutionCost:= 0
// else:
// substitutionCost:= 1
// d[i, j] := minimum(d[i - 1, j] + 1, // deletion
// d[i, j - 1] + 1, // insertion
// d[i - 1, j - 1] + substitutionCost) // substitution
//
//
// return d[m, n]
for (int tIndex = 1; tIndex < targetSize; tIndex++)
{
for (int sIndex = 1; sIndex < sourceSize; sIndex++)
{
int substitutionCost = source[sIndex - 1] == target[tIndex - 1] ? 0 : 1;
int deletion = distance[sIndex - 1, tIndex] + 1;
int insertion = distance[sIndex, tIndex - 1] + 1;
int substitution = distance[sIndex - 1, tIndex - 1] + substitutionCost;
distance[sIndex, tIndex] = leastOfThree(deletion, insertion, substitution);
}
}
return distance[sourceSize - 1, targetSize - 1];
}
private static int leastOfThree(int a, int b, int c)
{
return Math.Min(a, (Math.Min(b, c)));
}
}
(我还冒昧地将 distance
设为局部变量,因为它不需要成为一个字段(它只会让你的 class non-threadsafe),并且将其设为静态以避免不必要的实例化)。
为了调试它,我在 return distance[sourceSize - 1, targetSize - 1]
上放置了一个断点,并将 distance
与维基百科上的 table 进行了比较。很明显是太小了