Javascript (MDLD) 版本改进的 Damerau-Levenshtein 距离算法

Javascript Version of (MDLD) Modified Damerau-Levenshtein Distance Algorithm

我想测试 MDLD 的性能,以便将一些浏览器内字符串比较集成到 Web 应用程序中。用例涉及比较字符串,如“300mm,Packed Wall”和 "Packed Wall - 300mm",因此我正在寻找模糊字符串匹配,它对标点符号和错别字有一定的容忍度,并允许块字符换位。

我无法在网上找到 Javascript 的实现。我在 CSIRO's Taxamatch Wiki 找到了为 PL/SQL 编写的版本。

这是我尝试将代码转换为 JS;基本函数的结果看起来相当准确,但是块转置计算并没有给出预期的结果。例如。 "Hi There" vs "There Hi" returns “6”,无论块限制设置为多少。

如果有人知道有效的实施方式,可以指点一下吗?或者,我的改编或源代码本身有什么问题?我所做的唯一主要更改是在源似乎使用整数除法的两个实例中使用 "Math.ceil()",这将始终占据发言权——这会导致输入出现奇怪的问题,导致 1 个字符串——但似乎并没有影响我测试过的其他案例的行为。

function mdld(str1, str2, block_lim)
{
    mycol = [];
    cost = 0;
    len1 = str1.length;
    len2 = str2.length;

    if( str1 === str2 )
        return 0;
    else if ( len1 === 0 || len2 === 0 )
        return Math.max(len1, len2);
    else if ( len1 === 1 && len2 === 1 && str1 !== str2 )
        return 1;
    else
    {
        // Temporary strings which will be pre-processed
        // Speeds up calcs & retains correct measurement
        temp1 = str1;
        temp2 = str2;

        // Trim any common initial characters
        while ( temp1.substr(0,1) === temp2.substr(0,1) )
        {
            temp1 = temp1.substr(1, temp1.length);
            temp2 = temp2.substr(1, temp2.length);
        }

        // Trim any trailing characters
        while ( temp1.substr(-1,1) === temp2.substr(-1,1) )
        {
            temp1 = temp1.substr(0,temp1.length-1);
            temp2 = temp2.substr(0,temp2.length-1);
        }

        len1 = temp1.length;
        len2 = temp2.length;

        // Calc Levenshtein Distance
        if (len1 === 0 || len2 === 0)
            return Math.max(len1, len2);
        else if (len1 === 1 && len2 === 1 && str1 !== str2)
            return 1;
        else
        {
            // Create columns
            var s, t;
            for(s = 0; s <= len1; s++)
                mycol[s] = [];

            // Enter values into leftmost column
            for(t = 0; t <= len2; t++)
                mycol[0][t] = t;

            // Populate remaining columns
            for(s = 1; s <= len1; s++)
            {
                mycol[s][0] = s;
                // Populate first row (each cell of one column)
                for(t = 1; t <= len2; t++)
                {
                    //Calculate cost
                    if (temp1.substr(s-1,1) === temp2.substr(t-1,1))
                        cost = 0;
                    else
                        cost = 1;

                    // extension to cover multiple character transpositions
                    // that includes calculation of original Levenshtein distance when no transposition 
                    tempBlockLen = Math.min( Math.ceil(len1/2), Math.ceil(len2/2), !block_lim ? 1 : block_lim );

                    while (tempBlockLen >= 1)
                    {
                        if ((s >= tempBlockLen * 2) && 
                            (t >= tempBlockLen * 2) &&
                            (temp1.substr(s-tempBlockLen*2, tempBlockLen) === temp2.substr(t-tempBlockLen, tempBlockLen)) &&
                            (temp1.substr(s-tempBlockLen, tempBlockLen) === temp2.substr(t-tempBlockLen*2, tempBlockLen)))
                        {
                            // Transposition found
                            mycol[s][t] = Math.min( mycol[s][t-1] + 1,
                                                    mycol[s-1][t] + 1,
                                                    mycol[s-tempBlockLen*2][t-tempBlockLen*2] + cost + tempBlockLen - 1 );
                            tempBlockLen = 0;
                        }
                        else if (tempBlockLen === 1)
                        {
                            // No Transposition
                            mycol[s][t] = Math.min( mycol[s][t-1] + 1,
                                                    mycol[s-1][t] + 1,
                                                    mycol[s-1][t-1] + cost );
                        }
                        tempBlockLen = tempBlockLen - 1;    
                    }
                }
            }
        }
        return mycol[len1][len2];
    }
}

最后我也没弄清楚我是从CSIRO改编的代码有什么问题。找到了一个 github repo,它在 C 中实现了带有 Ruby 扩展的函数,https://github.com/GlobalNamesArchitecture/damerau-levenshtein.

对其进行调整以获得功能实现。似乎工作正常,但不适合我的用例。 MDLD 可以交换文本块,但仅限于不需要多次连续交换来构造源字符串的情况。转而查看 N-Grams。

对于那些感兴趣的人,这是我的最终结果。性能方面,块限制为 5,它在大约 5 秒内比较了大约 1000、20-40 个字符串。

function method_internal_distance(s, t, block_size, max_distance)
{
    s = s.toLocaleLowerCase();
    t = t.toLocaleLowerCase();
    var pure_levenshtein = false;
    var stop_execution = false;
    var sl = s.length;
    var tl = t.length;

    var i, i1, j, j1, k, half_sl, half_tl, cost;
    var distance, del, ins, subs, transp, block;
    var current_distance = 0;
    var min = 0;

    if (block_size == 0)
    {
        pure_levenshtein = true;
        block_size = 1;
    }

    if (sl == 0) 
        return tl;
    if (tl == 0)
        return sl;

    // case of lengths 1 must present or it will break further in the code
    if( sl == 1 && tl ==1 && s != t)
        return 1;

    // Increment length value of both strings, to include 0 for matrix
    sl++;
    tl++;

    //one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
    d = [];

    //populate 'vertical' row starting from the 2nd position (first one is filled already)
    for(i=0; i < tl; i++)
    {
        d[i*sl] = i;
    }

    //fill up array with scores
    for(i = 1; i<sl; i++)
    {
        d[i] = i;
        if (stop_execution)
            break;

        current_distance = 10000;
        for(j=1; j<tl; j++)
        {
            cost = 1;
            if(s[i-1] == t[j-1])
                cost = 0;

            half_sl = Math.floor((sl-1)/2);
            half_tl = Math.floor((tl-1)/2);

            block = block_size < half_sl || half_sl == 0 ? block_size : half_sl;
            block = block < half_tl || half_tl == 0 ? block : half_tl;

            while (block >=1)
            {
                var swap1 = 1;
                var swap2 = 1;

                i1 = i - (block * 2);
                j1 = j - (block * 2);

                for (k = i1; k < (i1 + block); k++)
                {
                    if(s[k] != t[k+block])
                    {
                        swap = 0;
                        break;
                    }
                }

                for (k = j1; k < (j1 + block); k++)
                {
                    if (t[k] != s[k+block])
                    {
                        swap2 = 0;
                        break;
                    }
                }

                del = d[j*sl + i - 1] + 1;
                ins = d[(j-1)*sl + i] + 1;
                min = del;

                if (ins < min)
                    min = ins;

                if (!pure_levenshtein && i >= 2*block && j >= 2*block && swap1 == 1 && swap2 == 1)
                {
                    transp = d[(j-block*2)*sl+i-block*2] + cost + block - 1;
                    if (transp < min)
                        min = transp;
                    block = 0;
                }
                else if (block == 1)
                {
                    subs = d[(j-1)*sl + i - 1] + cost;
                    if (subs < min)
                        min = subs;
                }
                block--;
            }
            d[j*sl+i] = min;
            if (current_distance > d[j*sl+i])
                current_distance = d[j*sl+i];
        }
        if (current_distance > max_distance)
            stop_execution = true;
    }
    distance = d[sl*tl-1];
    if (stop_execution)
        distance = current_distance;

    return distance;
}