修改现有的 Levenshtein 距离代码以适应不同的操作成本
Revise existing Levenshtein distance code to accommodate different operation costs
我找到了很多确定两个字符串之间编辑距离 (LD) 的来源。然而,它们都假设替换、插入和删除操作的成本都设置为 1。
C++ 的 source 非常高效,我正在尝试在下面对其进行调整以允许每次操作的不同成本。
#include <vector>
#include <string>
#include <iostream>
size_t levenshtein_distance(const std::string &a, const std::string &b);
int main()
{
std::string a, b;
a = "roger"; b = "Roger";
std::cout << levenshtein_distance(a, b) << std::endl;
a = "roger"; b = "oger";
std::cout << levenshtein_distance(a, b) << std::endl;
a = "oger"; b = "roger";
std::cout << levenshtein_distance(a, b) << std::endl;
return 0;
}
size_t levenshtein_distance(const std::string &a, const std::string &b)
{
// Costs of operations
const size_t substitution = 5;
const size_t deletion = 2;
const size_t insertion = 2;
size_t a_size = a.size(), b_size = b.size();
std::vector<size_t> P(b_size + 1), Q(b_size + 1);
for (size_t i = 0; i < Q.size(); i++)
Q[i] = i;
for (size_t i = 0; i < a_size; i++)
{
P[0] = i + 1;
for (size_t j = 0; j < b_size; j++)
P[j + 1] = std::min(
std::min(Q[j + 1] + 1, P[j] + 1), Q[j] + ((a[i] == b[j])? 0: substitution));
P.swap(Q);
}
return Q[b_size];
}
我想我 substitution
在正确的地方。如果我将它更改为 5,它会为该操作提供相应大的 LD,但似乎无法找到应用 insertion
或 deletion
的位置。我尝试更改文字 1
,但它们似乎没有任何效果 - 对于插入或删除操作,结果始终为 1。
您可以按如下方式改编来自维基百科的算法:
size_t levenshtein_distance(const std::string &s1, const std::string &s2)
{
const size_t substitution = 5;
const size_t deletion = 2;
const size_t insertion = 3;
const size_t len1 = s1.size(), len2 = s2.size();
vector<vector<unsigned int> > d(len1 + 1, vector<unsigned int>(len2 + 1));
d[0][0] = 0;
for(unsigned int i = 1; i <= len1; ++i) d[i][0] = deletion * i;
for(unsigned int i = 1; i <= len2; ++i) d[0][i] = insertion * i;
for(unsigned int i = 1; i <= len1; ++i)
for(unsigned int j = 1; j <= len2; ++j)
d[i][j] = std::min(
std::min(d[i - 1][j] + deletion, d[i][j - 1] + insertion),
d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : substitution)
);
return d[len1][len2];
}
例如 d[i][0]
表示将第一个字符串的前 i
个字符转换为第二个字符串的前零个字符的成本。这显然来自删除等 d[i][0] = deletion * i
。同样,d[0][i] = insertion * i
.
如果不想使用二维数组,那么只能保留最后一行:
int levenshtein_distance(const std::string &s1, const std::string &s2)
{
const int substitution = 5;
const int deletion = 2;
const int insertion = 3;
const int len1 = s1.size(), len2 = s2.size();
std::vector<int> P(len2 + 1), Q(len2 + 1);
for(int j = 1; j <= len2; ++j) P[j] = insertion * j;
for(int i = 1; i <= len1; ++i) {
Q[0] = deletion * i;
for(int j = 1; j <= len2; ++j) {
Q[j] = std::min(Q[j - 1] + insertion, P[j] + deletion);
Q[j] = std::min(Q[j], P[j - 1] +
(s1[i - 1] == s2[j - 1] ? 0 : substitution)
);
}
std::swap(P, Q);
}
return P[len2];
}
我找到了很多确定两个字符串之间编辑距离 (LD) 的来源。然而,它们都假设替换、插入和删除操作的成本都设置为 1。
C++ 的 source 非常高效,我正在尝试在下面对其进行调整以允许每次操作的不同成本。
#include <vector>
#include <string>
#include <iostream>
size_t levenshtein_distance(const std::string &a, const std::string &b);
int main()
{
std::string a, b;
a = "roger"; b = "Roger";
std::cout << levenshtein_distance(a, b) << std::endl;
a = "roger"; b = "oger";
std::cout << levenshtein_distance(a, b) << std::endl;
a = "oger"; b = "roger";
std::cout << levenshtein_distance(a, b) << std::endl;
return 0;
}
size_t levenshtein_distance(const std::string &a, const std::string &b)
{
// Costs of operations
const size_t substitution = 5;
const size_t deletion = 2;
const size_t insertion = 2;
size_t a_size = a.size(), b_size = b.size();
std::vector<size_t> P(b_size + 1), Q(b_size + 1);
for (size_t i = 0; i < Q.size(); i++)
Q[i] = i;
for (size_t i = 0; i < a_size; i++)
{
P[0] = i + 1;
for (size_t j = 0; j < b_size; j++)
P[j + 1] = std::min(
std::min(Q[j + 1] + 1, P[j] + 1), Q[j] + ((a[i] == b[j])? 0: substitution));
P.swap(Q);
}
return Q[b_size];
}
我想我 substitution
在正确的地方。如果我将它更改为 5,它会为该操作提供相应大的 LD,但似乎无法找到应用 insertion
或 deletion
的位置。我尝试更改文字 1
,但它们似乎没有任何效果 - 对于插入或删除操作,结果始终为 1。
您可以按如下方式改编来自维基百科的算法:
size_t levenshtein_distance(const std::string &s1, const std::string &s2)
{
const size_t substitution = 5;
const size_t deletion = 2;
const size_t insertion = 3;
const size_t len1 = s1.size(), len2 = s2.size();
vector<vector<unsigned int> > d(len1 + 1, vector<unsigned int>(len2 + 1));
d[0][0] = 0;
for(unsigned int i = 1; i <= len1; ++i) d[i][0] = deletion * i;
for(unsigned int i = 1; i <= len2; ++i) d[0][i] = insertion * i;
for(unsigned int i = 1; i <= len1; ++i)
for(unsigned int j = 1; j <= len2; ++j)
d[i][j] = std::min(
std::min(d[i - 1][j] + deletion, d[i][j - 1] + insertion),
d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : substitution)
);
return d[len1][len2];
}
例如 d[i][0]
表示将第一个字符串的前 i
个字符转换为第二个字符串的前零个字符的成本。这显然来自删除等 d[i][0] = deletion * i
。同样,d[0][i] = insertion * i
.
如果不想使用二维数组,那么只能保留最后一行:
int levenshtein_distance(const std::string &s1, const std::string &s2)
{
const int substitution = 5;
const int deletion = 2;
const int insertion = 3;
const int len1 = s1.size(), len2 = s2.size();
std::vector<int> P(len2 + 1), Q(len2 + 1);
for(int j = 1; j <= len2; ++j) P[j] = insertion * j;
for(int i = 1; i <= len1; ++i) {
Q[0] = deletion * i;
for(int j = 1; j <= len2; ++j) {
Q[j] = std::min(Q[j - 1] + insertion, P[j] + deletion);
Q[j] = std::min(Q[j], P[j - 1] +
(s1[i - 1] == s2[j - 1] ? 0 : substitution)
);
}
std::swap(P, Q);
}
return P[len2];
}