Javascript 词频小书签在输出中省略了一些字母

Javascript word frequency bookmarklet omits some letters in the output

我找不到 Chrome 的词频扩展,它列出了一个词在页面上的使用次数(我需要一个至少包含 100 个按使用频率排列的结果的列表),所以我采取了复制 javascript 小书签并稍微调整它以过滤掉常用词的方法。

然而,原始代码和修改后的代码都输出了一个列表,其中省略了一些单词的首字母,例如 "roperty" 而不是 "property","ubversion" 而不是 "subversion",等等。可能是什么原因造成的?

下面是一个link原代码:https://gist.github.com/RonnyO/3004194

这是我稍微调整后的代码:

javascript: (function () {
            var settings = {
                           listLength: 100,
                           ignore: ['the', 'be', 'to', 'of', 'and', 'in', 'that', 'have', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'fake', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us']
                    },
                    w, s;
            function getBodyText() {
                    var doc = document,
                            body = doc.body,
                            selection, range, bodyText;
            if (body.createTextRange) {
                            return body.createTextRange().text;
            } else if (getSelection) {
                            selection = getSelection();
                            range = doc.createRange();
                            range.selectNodeContents(body);
                            selection.addRange(range);
                            bodyText = selection.toString();
                            selection.removeAllRanges();
                            return bodyText;
            }
     }

     var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\,:;-`~!@#$%&_]+/g;
     var words = getBodyText().trim().replace(punctuation, ' ').replace(/\s+/g, ' ').split(' '),
               count = {},
               sorted = [];

    for (w in words) {if (words.hasOwnProperty(w) && settings.ignore.indexOf(words[w]) == -1) {
        var word = words[w];
        count[word] = count[word] ? count[word] + 1 : 1;
    }
}

for (w in count) if (count.hasOwnProperty(w)) {
    sorted.push([w, count[w]]);
}

s = sorted.sort(function (a, b) {
    return b[1] - a[1];
});

var output = '<title>word frequency</title><ul style="direction: ltr; text-align: left; font-family: sans-serif; line-height: 130%;">';
for (s in sorted.slice(0, settings.listLength)) {
    var c = sorted[s];
    output += '<li>' + c[1] + ': ' + c[0] + '</li>';
}
output += '</ul>';

with(open().document){
    write(output);
    close();
}
})();

抱歉缩进太糟糕了..

更改标点符号以转义连字符。

var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\,:;\-`~!@#$%&_]+/g;