Javascript:在前缀树中准确找到 10 个以给定前缀开头的单词

Javascript: Find exactly 10 words in a prefix tree that start with a given prefix

我有一个特里树(也叫前缀树)。给定一个前缀,我想获得以该前缀开头的 10 个单词的列表。

这个问题的独特之处在于我只需要 10 个以给定前缀开头的单词——而不是全部。鉴于此,可以进行一些优化。

我知道下面的代码工作正常。 trie 中的每个节点都有一个 children 属性 和一个 this_is_the_end_of_a_word 属性。例如,当您插入 "hi" 时,这就是特里树的样子:

.

问题:给定一个前缀,我想得到一个以前缀开头的 10 个单词的列表。

我解决这个问题的方法是:沿着前缀树向下,沿着 prefix 的字符,直到到达与 prefix 的最后一个字符对应的节点。现在您应该在此节点上执行 DFS,跟踪列表中具有 this_is_the_end_of_a_word === true 的节点。但是当列表的长度等于 10 并且 return 列表时,您应该停止搜索。

我认为我的方法是合理的,但我在实施它时遇到了问题——特别是因为我正在尝试使用递归 DFS,所以我不确定如何传递 "global"在递归调用之间列出。我知道我应该使用闭包,但我是 javascript 的新手,我不确定如何去做。下面是我已经尝试过的示例。

我的 Trie class(我知道这段代码有效,这只是为了让您了解我是如何组织我的数据结构的。)

var Trie = function() {

    var that = Object.create(Trie.prototype);
    that.children = {}; //mapping: next character -> child nodes
    that.this_is_the_end_of_a_word = false;

    that.insertWord = function(word) {

        var current_node = that;

        for (var i = 0; i < word.length; i++) {
            var c = word[i]
                //if character is not in the trie already, add it
            if (!(c in current_node.children)) {
                current_node.children[c] = Trie();
            }
            //update current_node
            current_node = current_node.children[c];
        };

        //after adding all the chars of the word, 
        //you are at the end of a word
        current_node.this_is_the_end_of_a_word = true;
    }

    that.insertWords = function(words) {
        for (var i = 0; i < words.length; i++) {
            that.insertWord(words[i]);
        }
    }

    that.contains = function(word) {
        //start at the root
        var current_node = that;
        for (var i = 0; i < word.length; i++) {
            var c = word[i];

            //if the word's character isn't a child of the current_node, 
            //the word isn't in the trie
            if (!(c in current_node.children)) {
                return false;
            }
            //move down the trie, update current_node
            current_node = current_node.children[c];
        };
        return current_node.this_is_the_end_of_a_word;
    }

    Object.freeze(that);
    return that;
}

我的第一种方法(有很多错误)

num_words_to_go = 10; 
//this global is bad practice; 
//I want to put this as the argument to a closure 
//so it's passed between recursive calls

that.getWords = function(start_node, prefix) {
   console.log(0);
   var words = [];

   //if start node is a word, add it
   if (start_node.this_is_the_end_of_a_word) {
       words.push(start_node);
       num_words_to_go--;
   }

   if (num_words_to_go <= 0 || !start_node.children) {
       return words;
   }

   return start_node.children.forEach(
                              currentValue.getWords(
                                    currentValue, prefix + <character for this child>)); 

   /*I can't think of a nice way to write this without going through all of the children. 
   I know I don't need to, because I only need to find 10 words and get out. 
   This is why I was leaning towards the recursive DFS. 
   */

}

第二种方法:我还找到了一个我正在查看的 python 示例: http://v1v3kn.tumblr.com/post/18238156967/roll-your-own-autocomplete-solution-using-tries 我尝试将他的示例翻译成 JavaScript,但 all_suffixes.

仍然有问题
that.all_suffixes = function (prefix){
    results = [];
    if (that.this_is_the_end_of_a_word) results.push(prefix);
    if (!(that.children)) return results;
    if (results.length > 2) return results;
    var callback = function(currentValue, i, array){
        return currentValue.all_suffixes(prefix+array[i]);
    }
    arr = that.children.forEach(callback, that);
        //[child.all_suffixes(prefix + char) for (char, child) in self.children.items()]
    return concat(reduce(concat, arr), results);        
}

 that.autocomplete = function(prefix){
    current_node = that;
    for(var i = 0; i < prefix.length; i++){
        var c = prefix[i];
        //if there is nothing in the trie with this prefix
        if (!(c in current_node.children)){
            return [];
        }
        current_node = current_node.children[c];
    }
    return list(current_node.all_suffixes(prefix))
 }

基本上我采用了您的模型并将新方法 getWords(word[, count]) 应用于 Trie class。我更改了方法 contains,因为我也需要 getWords 中的功能。所以我创建了一个新方法 getNode,其中 returns 找到单词或部分的节点。

方法getWords首先查找词(部分),然后遍历数据结构。当找到一个词时,它会被推送到结果集中。如果结果集长度大于或等于所需长度,则迭代(因此 Array.prototype.some)终止,fork 的递归调用停止。

    that.getWords = function (word, count) {

        function fork(n, w) {

            function child(c) {
                return fork(n.children[c], w + c);
            }

            n.isWord && words.push(w);
            return words.length >= count || Object.keys(n.children).some(child);
        }

        var words = [],
            current_node = that.getNode(word);

        if (current_node) {
            fork(current_node, word);
            return words;
        }
    }

旁注:我将 this_is_the_end_of_a_word 更改为 isWord

输入

  1. 创建 Trie 的新实例。
  2. 插入一些测试用词。

输出

  1. 测试 trie 是否包含 'motor', returns false.
  2. 测试 trie 是否包含 'te', returns false.
  3. 测试 trie 是否包含 'ten', returns true.
  4. 获取所有个以'ind'开头的单词(可用8个,显示8个)。
  5. 获取以 'in' 开头的前 10 个单词(可用 16 个,显示 10 个)。
  6. 整个 trie。

var Trie = function () {

    var that = Object.create(Trie.prototype);
    that.children = {}; //mapping: next character -> child nodes
    that.isWord = false;

    that.insertWord = function (word) {
        var current_node = that;
        for (var i = 0; i < word.length; i++) {
            var c = word[i]
            //if character is not in the trie already, add it
            if (!(c in current_node.children)) {
                current_node.children[c] = Trie();
            }
            //update current_node
            current_node = current_node.children[c];
        };

        //after adding all the chars of the word,
        //you are at the end of a word
        current_node.isWord = true;
    }

    that.insertWords = function (words) {
        for (var i = 0; i < words.length; i++) {
            that.insertWord(words[i]);
        }
    }

    that.getNode = function (word) {
        //start at the root
        var current_node = that;
        for (var i = 0; i < word.length; i++) {
            var c = word[i];

            //if the word's character isn't a child of the current_node,
            //the word isn't in the trie
            if (!(c in current_node.children)) {
                return;
            }
            //move down the trie, update current_node
            current_node = current_node.children[c];
        };
        return current_node;
    }

    that.contains = function (word) {
        var current_node = that.getNode(word);
        if (current_node) {
            return current_node.isWord;
        }
        return false;
    }

    that.getWords = function (word, count) {

        function fork(n, w) {

            function child(c) {
                return fork(n.children[c], w + c);
            }

            n.isWord && words.push(w);
            return words.length >= count || Object.keys(n.children).some(child);
        }

        var words = [],
            current_node = that.getNode(word);

        if (current_node) {
            fork(current_node, word);
            return words;
        }
    }

    // freeze does lock the isWord property, which is not required here
    //Object.freeze(that);
    return that;
}

var trie = new Trie();
trie.insertWords([
    'car', 'cool', 'i', 'in', 'indeed', 'independence', 'india', 'indoor', 'induction',
    'industrial', 'industry', 'indwell', 'inferior', 'informal', 'inhale', 'inn',
    'inside', 'instance', 'intrepid', 'of', 'off', 'other', 'tea', 'ted', 'ten',
    'to', 'zoo', 'zoom'
]);
document.write(trie.contains('motor') + '<br>'); // false
document.write(trie.contains('te') + '<br>'); // false
document.write(trie.contains('ten') + '<br>'); // true
document.write('<pre>' + JSON.stringify(trie.getWords('ind'), 0, 4) + '</pre>');
document.write('<pre>' + JSON.stringify(trie.getWords('in', 10), 0, 4) + '</pre>');
document.write('<pre>' + JSON.stringify(trie, 0, 4) + '</pre>');