Javascript 词频小书签在输出中省略了一些字母
Javascript word frequency bookmarklet omits some letters in the output
我找不到 Chrome 的词频扩展,它列出了一个词在页面上的使用次数(我需要一个至少包含 100 个按使用频率排列的结果的列表),所以我采取了复制 javascript 小书签并稍微调整它以过滤掉常用词的方法。
然而,原始代码和修改后的代码都输出了一个列表,其中省略了一些单词的首字母,例如 "roperty" 而不是 "property","ubversion" 而不是 "subversion",等等。可能是什么原因造成的?
下面是一个link原代码:https://gist.github.com/RonnyO/3004194
这是我稍微调整后的代码:
javascript: (function () {
var settings = {
listLength: 100,
ignore: ['the', 'be', 'to', 'of', 'and', 'in', 'that', 'have', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'fake', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us']
},
w, s;
function getBodyText() {
var doc = document,
body = doc.body,
selection, range, bodyText;
if (body.createTextRange) {
return body.createTextRange().text;
} else if (getSelection) {
selection = getSelection();
range = doc.createRange();
range.selectNodeContents(body);
selection.addRange(range);
bodyText = selection.toString();
selection.removeAllRanges();
return bodyText;
}
}
var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\,:;-`~!@#$%&_]+/g;
var words = getBodyText().trim().replace(punctuation, ' ').replace(/\s+/g, ' ').split(' '),
count = {},
sorted = [];
for (w in words) {if (words.hasOwnProperty(w) && settings.ignore.indexOf(words[w]) == -1) {
var word = words[w];
count[word] = count[word] ? count[word] + 1 : 1;
}
}
for (w in count) if (count.hasOwnProperty(w)) {
sorted.push([w, count[w]]);
}
s = sorted.sort(function (a, b) {
return b[1] - a[1];
});
var output = '<title>word frequency</title><ul style="direction: ltr; text-align: left; font-family: sans-serif; line-height: 130%;">';
for (s in sorted.slice(0, settings.listLength)) {
var c = sorted[s];
output += '<li>' + c[1] + ': ' + c[0] + '</li>';
}
output += '</ul>';
with(open().document){
write(output);
close();
}
})();
抱歉缩进太糟糕了..
更改标点符号以转义连字符。
var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\,:;\-`~!@#$%&_]+/g;
我找不到 Chrome 的词频扩展,它列出了一个词在页面上的使用次数(我需要一个至少包含 100 个按使用频率排列的结果的列表),所以我采取了复制 javascript 小书签并稍微调整它以过滤掉常用词的方法。
然而,原始代码和修改后的代码都输出了一个列表,其中省略了一些单词的首字母,例如 "roperty" 而不是 "property","ubversion" 而不是 "subversion",等等。可能是什么原因造成的?
下面是一个link原代码:https://gist.github.com/RonnyO/3004194
这是我稍微调整后的代码:
javascript: (function () {
var settings = {
listLength: 100,
ignore: ['the', 'be', 'to', 'of', 'and', 'in', 'that', 'have', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'fake', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us']
},
w, s;
function getBodyText() {
var doc = document,
body = doc.body,
selection, range, bodyText;
if (body.createTextRange) {
return body.createTextRange().text;
} else if (getSelection) {
selection = getSelection();
range = doc.createRange();
range.selectNodeContents(body);
selection.addRange(range);
bodyText = selection.toString();
selection.removeAllRanges();
return bodyText;
}
}
var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\,:;-`~!@#$%&_]+/g;
var words = getBodyText().trim().replace(punctuation, ' ').replace(/\s+/g, ' ').split(' '),
count = {},
sorted = [];
for (w in words) {if (words.hasOwnProperty(w) && settings.ignore.indexOf(words[w]) == -1) {
var word = words[w];
count[word] = count[word] ? count[word] + 1 : 1;
}
}
for (w in count) if (count.hasOwnProperty(w)) {
sorted.push([w, count[w]]);
}
s = sorted.sort(function (a, b) {
return b[1] - a[1];
});
var output = '<title>word frequency</title><ul style="direction: ltr; text-align: left; font-family: sans-serif; line-height: 130%;">';
for (s in sorted.slice(0, settings.listLength)) {
var c = sorted[s];
output += '<li>' + c[1] + ': ' + c[0] + '</li>';
}
output += '</ul>';
with(open().document){
write(output);
close();
}
})();
抱歉缩进太糟糕了..
更改标点符号以转义连字符。
var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\,:;\-`~!@#$%&_]+/g;