Javascript (MDLD) 版本改进的 Damerau-Levenshtein 距离算法
Javascript Version of (MDLD) Modified Damerau-Levenshtein Distance Algorithm
我想测试 MDLD 的性能,以便将一些浏览器内字符串比较集成到 Web 应用程序中。用例涉及比较字符串,如“300mm,Packed Wall”和 "Packed Wall - 300mm",因此我正在寻找模糊字符串匹配,它对标点符号和错别字有一定的容忍度,并允许块字符换位。
我无法在网上找到 Javascript 的实现。我在 CSIRO's Taxamatch Wiki 找到了为 PL/SQL 编写的版本。
这是我尝试将代码转换为 JS;基本函数的结果看起来相当准确,但是块转置计算并没有给出预期的结果。例如。 "Hi There" vs "There Hi" returns “6”,无论块限制设置为多少。
如果有人知道有效的实施方式,可以指点一下吗?或者,我的改编或源代码本身有什么问题?我所做的唯一主要更改是在源似乎使用整数除法的两个实例中使用 "Math.ceil()",这将始终占据发言权——这会导致输入出现奇怪的问题,导致 1 个字符串——但似乎并没有影响我测试过的其他案例的行为。
function mdld(str1, str2, block_lim)
{
mycol = [];
cost = 0;
len1 = str1.length;
len2 = str2.length;
if( str1 === str2 )
return 0;
else if ( len1 === 0 || len2 === 0 )
return Math.max(len1, len2);
else if ( len1 === 1 && len2 === 1 && str1 !== str2 )
return 1;
else
{
// Temporary strings which will be pre-processed
// Speeds up calcs & retains correct measurement
temp1 = str1;
temp2 = str2;
// Trim any common initial characters
while ( temp1.substr(0,1) === temp2.substr(0,1) )
{
temp1 = temp1.substr(1, temp1.length);
temp2 = temp2.substr(1, temp2.length);
}
// Trim any trailing characters
while ( temp1.substr(-1,1) === temp2.substr(-1,1) )
{
temp1 = temp1.substr(0,temp1.length-1);
temp2 = temp2.substr(0,temp2.length-1);
}
len1 = temp1.length;
len2 = temp2.length;
// Calc Levenshtein Distance
if (len1 === 0 || len2 === 0)
return Math.max(len1, len2);
else if (len1 === 1 && len2 === 1 && str1 !== str2)
return 1;
else
{
// Create columns
var s, t;
for(s = 0; s <= len1; s++)
mycol[s] = [];
// Enter values into leftmost column
for(t = 0; t <= len2; t++)
mycol[0][t] = t;
// Populate remaining columns
for(s = 1; s <= len1; s++)
{
mycol[s][0] = s;
// Populate first row (each cell of one column)
for(t = 1; t <= len2; t++)
{
//Calculate cost
if (temp1.substr(s-1,1) === temp2.substr(t-1,1))
cost = 0;
else
cost = 1;
// extension to cover multiple character transpositions
// that includes calculation of original Levenshtein distance when no transposition
tempBlockLen = Math.min( Math.ceil(len1/2), Math.ceil(len2/2), !block_lim ? 1 : block_lim );
while (tempBlockLen >= 1)
{
if ((s >= tempBlockLen * 2) &&
(t >= tempBlockLen * 2) &&
(temp1.substr(s-tempBlockLen*2, tempBlockLen) === temp2.substr(t-tempBlockLen, tempBlockLen)) &&
(temp1.substr(s-tempBlockLen, tempBlockLen) === temp2.substr(t-tempBlockLen*2, tempBlockLen)))
{
// Transposition found
mycol[s][t] = Math.min( mycol[s][t-1] + 1,
mycol[s-1][t] + 1,
mycol[s-tempBlockLen*2][t-tempBlockLen*2] + cost + tempBlockLen - 1 );
tempBlockLen = 0;
}
else if (tempBlockLen === 1)
{
// No Transposition
mycol[s][t] = Math.min( mycol[s][t-1] + 1,
mycol[s-1][t] + 1,
mycol[s-1][t-1] + cost );
}
tempBlockLen = tempBlockLen - 1;
}
}
}
}
return mycol[len1][len2];
}
}
最后我也没弄清楚我是从CSIRO改编的代码有什么问题。找到了一个 github repo,它在 C 中实现了带有 Ruby 扩展的函数,https://github.com/GlobalNamesArchitecture/damerau-levenshtein.
对其进行调整以获得功能实现。似乎工作正常,但不适合我的用例。 MDLD 可以交换文本块,但仅限于不需要多次连续交换来构造源字符串的情况。转而查看 N-Grams。
对于那些感兴趣的人,这是我的最终结果。性能方面,块限制为 5,它在大约 5 秒内比较了大约 1000、20-40 个字符串。
function method_internal_distance(s, t, block_size, max_distance)
{
s = s.toLocaleLowerCase();
t = t.toLocaleLowerCase();
var pure_levenshtein = false;
var stop_execution = false;
var sl = s.length;
var tl = t.length;
var i, i1, j, j1, k, half_sl, half_tl, cost;
var distance, del, ins, subs, transp, block;
var current_distance = 0;
var min = 0;
if (block_size == 0)
{
pure_levenshtein = true;
block_size = 1;
}
if (sl == 0)
return tl;
if (tl == 0)
return sl;
// case of lengths 1 must present or it will break further in the code
if( sl == 1 && tl ==1 && s != t)
return 1;
// Increment length value of both strings, to include 0 for matrix
sl++;
tl++;
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
d = [];
//populate 'vertical' row starting from the 2nd position (first one is filled already)
for(i=0; i < tl; i++)
{
d[i*sl] = i;
}
//fill up array with scores
for(i = 1; i<sl; i++)
{
d[i] = i;
if (stop_execution)
break;
current_distance = 10000;
for(j=1; j<tl; j++)
{
cost = 1;
if(s[i-1] == t[j-1])
cost = 0;
half_sl = Math.floor((sl-1)/2);
half_tl = Math.floor((tl-1)/2);
block = block_size < half_sl || half_sl == 0 ? block_size : half_sl;
block = block < half_tl || half_tl == 0 ? block : half_tl;
while (block >=1)
{
var swap1 = 1;
var swap2 = 1;
i1 = i - (block * 2);
j1 = j - (block * 2);
for (k = i1; k < (i1 + block); k++)
{
if(s[k] != t[k+block])
{
swap = 0;
break;
}
}
for (k = j1; k < (j1 + block); k++)
{
if (t[k] != s[k+block])
{
swap2 = 0;
break;
}
}
del = d[j*sl + i - 1] + 1;
ins = d[(j-1)*sl + i] + 1;
min = del;
if (ins < min)
min = ins;
if (!pure_levenshtein && i >= 2*block && j >= 2*block && swap1 == 1 && swap2 == 1)
{
transp = d[(j-block*2)*sl+i-block*2] + cost + block - 1;
if (transp < min)
min = transp;
block = 0;
}
else if (block == 1)
{
subs = d[(j-1)*sl + i - 1] + cost;
if (subs < min)
min = subs;
}
block--;
}
d[j*sl+i] = min;
if (current_distance > d[j*sl+i])
current_distance = d[j*sl+i];
}
if (current_distance > max_distance)
stop_execution = true;
}
distance = d[sl*tl-1];
if (stop_execution)
distance = current_distance;
return distance;
}
我想测试 MDLD 的性能,以便将一些浏览器内字符串比较集成到 Web 应用程序中。用例涉及比较字符串,如“300mm,Packed Wall”和 "Packed Wall - 300mm",因此我正在寻找模糊字符串匹配,它对标点符号和错别字有一定的容忍度,并允许块字符换位。
我无法在网上找到 Javascript 的实现。我在 CSIRO's Taxamatch Wiki 找到了为 PL/SQL 编写的版本。
这是我尝试将代码转换为 JS;基本函数的结果看起来相当准确,但是块转置计算并没有给出预期的结果。例如。 "Hi There" vs "There Hi" returns “6”,无论块限制设置为多少。
如果有人知道有效的实施方式,可以指点一下吗?或者,我的改编或源代码本身有什么问题?我所做的唯一主要更改是在源似乎使用整数除法的两个实例中使用 "Math.ceil()",这将始终占据发言权——这会导致输入出现奇怪的问题,导致 1 个字符串——但似乎并没有影响我测试过的其他案例的行为。
function mdld(str1, str2, block_lim)
{
mycol = [];
cost = 0;
len1 = str1.length;
len2 = str2.length;
if( str1 === str2 )
return 0;
else if ( len1 === 0 || len2 === 0 )
return Math.max(len1, len2);
else if ( len1 === 1 && len2 === 1 && str1 !== str2 )
return 1;
else
{
// Temporary strings which will be pre-processed
// Speeds up calcs & retains correct measurement
temp1 = str1;
temp2 = str2;
// Trim any common initial characters
while ( temp1.substr(0,1) === temp2.substr(0,1) )
{
temp1 = temp1.substr(1, temp1.length);
temp2 = temp2.substr(1, temp2.length);
}
// Trim any trailing characters
while ( temp1.substr(-1,1) === temp2.substr(-1,1) )
{
temp1 = temp1.substr(0,temp1.length-1);
temp2 = temp2.substr(0,temp2.length-1);
}
len1 = temp1.length;
len2 = temp2.length;
// Calc Levenshtein Distance
if (len1 === 0 || len2 === 0)
return Math.max(len1, len2);
else if (len1 === 1 && len2 === 1 && str1 !== str2)
return 1;
else
{
// Create columns
var s, t;
for(s = 0; s <= len1; s++)
mycol[s] = [];
// Enter values into leftmost column
for(t = 0; t <= len2; t++)
mycol[0][t] = t;
// Populate remaining columns
for(s = 1; s <= len1; s++)
{
mycol[s][0] = s;
// Populate first row (each cell of one column)
for(t = 1; t <= len2; t++)
{
//Calculate cost
if (temp1.substr(s-1,1) === temp2.substr(t-1,1))
cost = 0;
else
cost = 1;
// extension to cover multiple character transpositions
// that includes calculation of original Levenshtein distance when no transposition
tempBlockLen = Math.min( Math.ceil(len1/2), Math.ceil(len2/2), !block_lim ? 1 : block_lim );
while (tempBlockLen >= 1)
{
if ((s >= tempBlockLen * 2) &&
(t >= tempBlockLen * 2) &&
(temp1.substr(s-tempBlockLen*2, tempBlockLen) === temp2.substr(t-tempBlockLen, tempBlockLen)) &&
(temp1.substr(s-tempBlockLen, tempBlockLen) === temp2.substr(t-tempBlockLen*2, tempBlockLen)))
{
// Transposition found
mycol[s][t] = Math.min( mycol[s][t-1] + 1,
mycol[s-1][t] + 1,
mycol[s-tempBlockLen*2][t-tempBlockLen*2] + cost + tempBlockLen - 1 );
tempBlockLen = 0;
}
else if (tempBlockLen === 1)
{
// No Transposition
mycol[s][t] = Math.min( mycol[s][t-1] + 1,
mycol[s-1][t] + 1,
mycol[s-1][t-1] + cost );
}
tempBlockLen = tempBlockLen - 1;
}
}
}
}
return mycol[len1][len2];
}
}
最后我也没弄清楚我是从CSIRO改编的代码有什么问题。找到了一个 github repo,它在 C 中实现了带有 Ruby 扩展的函数,https://github.com/GlobalNamesArchitecture/damerau-levenshtein.
对其进行调整以获得功能实现。似乎工作正常,但不适合我的用例。 MDLD 可以交换文本块,但仅限于不需要多次连续交换来构造源字符串的情况。转而查看 N-Grams。
对于那些感兴趣的人,这是我的最终结果。性能方面,块限制为 5,它在大约 5 秒内比较了大约 1000、20-40 个字符串。
function method_internal_distance(s, t, block_size, max_distance)
{
s = s.toLocaleLowerCase();
t = t.toLocaleLowerCase();
var pure_levenshtein = false;
var stop_execution = false;
var sl = s.length;
var tl = t.length;
var i, i1, j, j1, k, half_sl, half_tl, cost;
var distance, del, ins, subs, transp, block;
var current_distance = 0;
var min = 0;
if (block_size == 0)
{
pure_levenshtein = true;
block_size = 1;
}
if (sl == 0)
return tl;
if (tl == 0)
return sl;
// case of lengths 1 must present or it will break further in the code
if( sl == 1 && tl ==1 && s != t)
return 1;
// Increment length value of both strings, to include 0 for matrix
sl++;
tl++;
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
d = [];
//populate 'vertical' row starting from the 2nd position (first one is filled already)
for(i=0; i < tl; i++)
{
d[i*sl] = i;
}
//fill up array with scores
for(i = 1; i<sl; i++)
{
d[i] = i;
if (stop_execution)
break;
current_distance = 10000;
for(j=1; j<tl; j++)
{
cost = 1;
if(s[i-1] == t[j-1])
cost = 0;
half_sl = Math.floor((sl-1)/2);
half_tl = Math.floor((tl-1)/2);
block = block_size < half_sl || half_sl == 0 ? block_size : half_sl;
block = block < half_tl || half_tl == 0 ? block : half_tl;
while (block >=1)
{
var swap1 = 1;
var swap2 = 1;
i1 = i - (block * 2);
j1 = j - (block * 2);
for (k = i1; k < (i1 + block); k++)
{
if(s[k] != t[k+block])
{
swap = 0;
break;
}
}
for (k = j1; k < (j1 + block); k++)
{
if (t[k] != s[k+block])
{
swap2 = 0;
break;
}
}
del = d[j*sl + i - 1] + 1;
ins = d[(j-1)*sl + i] + 1;
min = del;
if (ins < min)
min = ins;
if (!pure_levenshtein && i >= 2*block && j >= 2*block && swap1 == 1 && swap2 == 1)
{
transp = d[(j-block*2)*sl+i-block*2] + cost + block - 1;
if (transp < min)
min = transp;
block = 0;
}
else if (block == 1)
{
subs = d[(j-1)*sl + i - 1] + cost;
if (subs < min)
min = subs;
}
block--;
}
d[j*sl+i] = min;
if (current_distance > d[j*sl+i])
current_distance = d[j*sl+i];
}
if (current_distance > max_distance)
stop_execution = true;
}
distance = d[sl*tl-1];
if (stop_execution)
distance = current_distance;
return distance;
}