有没有办法在 javascript 中使用正则表达式来实现高效准确的搜索功能

Is there a way to use regex in javascript to implement an efficient and accurate search functionality

我一直在从事一个项目,该项目要求其用户搜索类似沃尔玛或亚马逊搜索功能的产品。但似乎每当我觉得我有一个可行的解决方案时,我都会面临另一个问题。这是我当前的代码片段。我将在下面解释代码的作用。

const keywords = [
  'oranges',
  'red bull',
  'red bull energy drink',
  'carnation breakfast essentials',
  'carnation instant breakfast',
  'organic oranges',
  'oargens',
  'nesquik powder',
  "welch's orange juice",
  'mandarin oranges',
  'bananas',
  'nesquik chocolate powder',
  'jimmy dean sausage',
  'organic bananas',
  'nesquik',
  'nesquik no sugar',
  "welch's white grape",
  'great value',
  'great value apple juice',
  'lemon',
  'lemon fruit',
  'avocados',
  'avocados, each',
  'apple juice',
];

class SearchApi {
  constructor(keywords, query) {
    this.keywords = keywords;
    this.query = query;
  }

  findWithRegex(keyword, query) {
    const pattern = query
      .split('')
      .map(q => {
        return `(?=.*${q})`;
      })
      .join('');

    const regex = new RegExp(`${pattern}`, 'g');

    return keyword.match(regex);
  }

  matchKeywords() {
    const str = this.query.trim().toLowerCase().substring(0, 3);
    const queryLength = this.query.trim().length;

      return this.keywords.filter(keyword => {
        const keywordSubstr = keyword.substring(0, 3);
        const equalInitials = keyword.substring(0, 1) === this.query.toLowerCase().substring(0, 1);

        return this.findWithRegex(keywordSubstr, str) && equalInitials && this.findWithRegex(keyword.substring(queryLength, queryLength - 3), this.query.trim().substring(queryLength, queryLength - 3));
    });
  }
}

const searchApi = new SearchApi(keywords, 'organic banan');
searchApi.matchKeywords();

代码解释

我在这里基本上做的是在进行查询时,我比较查询和关键字的前三个字符,并检查查询和关键字中的首字母是否相同,因为如果有人键入字母“o” 我只想显示以该字母开头的关键字。

它工作正常但不幸的是在测试时当我输入“organic banan”作为查询时我得到 ["organic oranges", "organic bananas"] 应该是 ["organic bananas"].

这是因为正则表达式函数在有机橙的最后三个字母中找到字符“a”和“n” .关于如何有效执行此操作的任何进一步帮助都将对我有所帮助。

Search-as-you-type 或 auto-complete 功能通常使用专门的数据结构和算法而不是正则表达式来实现(除非您的搜索功能完全是关于正则表达式搜索..)

您可能希望使用二进制搜索来搜索数组中的字符串,如下所示(我在下面选择此功能仅用于演示目的,不推荐使用。Source)。 您可能会发现许多适合您的环境和需求的软件包,例如 fast-string-search 使用 N-API 和 boyer-moore-magiclen 来加快速度。

在数据结构方面,经常建议使用前缀树(TRIE)来实现快速自动完成功能。 Here 是一个简单的实现,展示了 TRIE 的基本概念。

var movies = [
    "ACADEMY DINOSAUR",
    "ACE GOLDFINGER",
    "ADAPTATION HOLES",
    "AFFAIR PREJUDICE",
    "BENEATH RUSH",
    "BERETS AGENT",
    "BETRAYED REAR",
    "BEVERLY OUTLAW",
    "BIKINI BORROWERS",
    "YENTL IDAHO",
    "YOUNG LANGUAGE",
    "YOUTH KICK",
    "ZHIVAGO CORE",
    "ZOOLANDER FICTION",
    "ZORRO ARK"
];

var searchBinary = function (needle, haystack, case_insensitive) {
    if (needle == "") return [];
    var haystackLength = haystack.length;
    var letterNumber = needle.length;
    case_insensitive = (typeof (case_insensitive) === 'undefined' || case_insensitive) ? true : false;
    needle = (case_insensitive) ? needle.toLowerCase() : needle;

    /* start binary search, Get middle position */
    var getElementPosition = findElement()

    /* get interval and return result array */
    if (getElementPosition == -1) return [];
    return getRangeElement = findRangeElement()

    function findElement() {
        if (typeof (haystack) === 'undefined' || !haystackLength) return -1;

        var high = haystack.length - 1;
        var low = 0;

        while (low <= high) {
            mid = parseInt((low + high) / 2);
            var element = haystack[mid].substr(0, letterNumber);
            element = (case_insensitive) ? element.toLowerCase() : element;

            if (element > needle) {
                high = mid - 1;
            } else if (element < needle) {
                low = mid + 1;
            } else {

                return mid;
            }
        }
        return -1;
    }

    function findRangeElement() {    
        for (i = getElementPosition; i > 0; i--) {
            var element = (case_insensitive) ? haystack[i].substr(0, letterNumber).toLowerCase() : haystack[i].substr(0, letterNumber);
            if (element != needle) {
                var start = i + 1;
                break;
            } else {
                var start = 0;
            }
        }
        for (i = getElementPosition; i < haystackLength; i++) {
            var element = (case_insensitive) ? haystack[i].substr(0, letterNumber).toLowerCase() : haystack[i].substr(0, letterNumber);
            if (element != needle) {
                var end = i;
                break;
            } else {
                var end = haystackLength - 1;
            }
        }
        var result = [];
        for (i = start; i < end; i++) {
            result.push(haystack[i])
        }    
        return result;
    }    
};


testBinary = searchBinary("BIKINI", movies, false);
console.log('searchBinary("BIKINI", movies, false) = [' + testBinary + ']');

testBinary = searchBinary("b", movies, false);
console.log('searchBinary("b", movies, false) = [' + testBinary + ']');

testBinary = searchBinary("b", movies, true);
console.log('searchBinary("b", movies, true) = [' + testBinary + ']');