如何正确加粗来自 Twitter 的搜索词,JS 中奇怪的正则表达式
How to properly bold search terms from Twitter, strange regex case in JS
我正在使用 Twitter API 从 Twitter 检索推文并在我自己的客户端中显示它们。
但是,我在正确突出显示正确的搜索字词时遇到了一些困难。我想要如下效果:
我尝试在 JS 中执行此操作的方法是使用一个名为 highlightSearchTerms() 的函数,该函数将推文的文本和一组要加粗的关键字作为参数。它返回修正后的推文文本。我通过将关键字包裹在带有 class .search-term 的 <span> 中来加粗关键字。
我有很多问题,包括:
- 运行 简单替换不保留大小写
- href 标签中的关键字存在很多冲突
- 如果我尝试使用替换执行 for 循环,我不知道如何只修改不在 href 中的搜索词,而且我还没有用上面的跨度包装
我希望能够处理的示例推文:
Input:
This is a keyword. This is a <a href="http://search.twitter.com/q=%23keyword">
#keyword</a> with a hashtag. This is a link with kEyWoRd:
<a href="http://thiskeyword.com">http://thiskeyword.com</a>.
Expected Output:
This is a
<span class="search-term">keyword</span>
. This is a <a href="http://search.twitter.com/q=%23keyword"> #
<span class="search-term">keyword</span>
</a> with a hashtag. This is a link with
<span class="search-term">kEyWoRd</span>
:<a href="http://thiskeyword.com">http://this
<span class="search-term">keyword.com</span>
</a>.
我已经尝试了很多东西,但不幸的是我无法找到解决问题的正确方法。任何建议都将不胜感激。
这是我的代码,适用于某些情况,但最终没有达到我的要求。当关键字位于 link 的后半部分(例如 http://twitter.com/this_keyword)时,它无法处理。有时它也会奇怪地突出显示关键字前的 2 个字符。我估计最佳解决方案不会与我的代码太相似。
/**
 * Wraps occurrences of each keyword in `text` with
 * <span class='search-term'>...</span>, re-inserting the text exactly as it
 * was matched so the original casing is kept.
 *
 * Known limitations: keywords are concatenated into regular expressions
 * unescaped, and the prefix filter below is a character class rather than an
 * alternation of literal prefixes.
 *
 * @param {string} text - Tweet text to process.
 * @param {string[]} keywords - Search terms to highlight.
 * @returns {string} The text with matched keywords wrapped in spans.
 */
function _highlightSearchTerms(text, keywords){
    for (var i=0;i<keywords.length;i++) {
        // create regex to find all instances of the keyword, catch the links that potentially come before so we can filter them out in the next step
        // NOTE: the square brackets form a character class, so this consumes any run of those individual characters before the keyword, not the literal prefixes
        var searchString = new RegExp("[http://twitter.com/||q=%23]*"+keywords[i], "ig");
        // create an array of all the matched keyword terms in the tweet, we can't simply run a replace all as we need them to retain their initial case
        var keywordOccurencesInitial = text.match(searchString);
        // keep only the occurrences that do not carry a link/hashtag-query prefix
        var keywordOccurences = [];
        if (keywordOccurencesInitial != null) {
            for(var i3=0;i3<keywordOccurencesInitial.length;i3++){
                if (keywordOccurencesInitial[i3].indexOf("http://twitter.com/") > -1 || keywordOccurencesInitial[i3].indexOf("q=%23") > -1)
                    continue;
                else
                    keywordOccurences.push(keywordOccurencesInitial[i3]);
            }
        }
        // replace our matches with search term
        // the regex should ensure to NOT catch terms we've already wrapped in the span:
        // an optional prefix is captured in group 1 to emulate a negative lookbehind
        if (keywordOccurences != null) {
            for(var i2=0;i2<keywordOccurences.length;i2++){
                var searchString2 = new RegExp("(q=%23||http://twitter.com/||<span class='search-term'>)?"+keywordOccurences[i2].trim(), "g"); // don't replace what we've alrdy replaced
                text = text.replace(searchString2,
                    function($0,$1){
                        // $1 is set when an excluded prefix precedes the keyword: leave the whole match ($0) untouched
                        return $1?$0:"<span class='search-term'>"+keywordOccurences[i2].trim()+"</span>";
                    });
            }
        }
    }
    return text;
}
下面是您或许可以使用的方法:
// Highlights every keyword in the value of the element with id 'tekt' that is not inside an HTML tag.
var getv = document.getElementById('tekt').value;
var keywords = "keyword,big elephant"; // comma delimited keyword list
var rekeywords = "(" + keywords.replace(/\, ?/ig,"|") + ")"; // wraps keywords in ( and ), and changes , to a pipe (character for regex alternation)
// "\\b" must be written with an escaped backslash: inside a string literal "\b" is a backspace character, not a word boundary.
// The lookahead accepts a match only if the next angle bracket is "<" or no ">" follows at all, i.e. the match is not inside a tag.
var keyrex = new RegExp("(#?\\b" + rekeywords + "\\b)(?=[^>]*?<[^>]*>|(?![^>]*>))","igm");
alert(keyrex);
// $1 re-inserts the matched text, so the casing found in the input is preserved.
document.getElementById('tekt').value = document.getElementById('tekt').value.replace(keyrex,"<span class=\"search-term\">$1</span>");
下面是一种尝试处理单词变形的变体。如果关键字以 ed、es、s、ing 等结尾,它会将该后缀去掉,并且在查找单词末尾的单词边界时,也会匹配以这些常见后缀结尾的单词。它并不完美,例如 ride 的过去式是 rode。如果不想让自己面临大量误报,几乎不可能用正则表达式来解决这个问题。
// Variant that also matches common inflected forms (es, ing, ed, d, s, e) of each keyword.
var getv = document.getElementById('tekt').value;
var keywords = "keywords,big elephant";
// Strip a trailing common suffix from each keyword and append an optional suffix group instead.
// $2 puts the captured separator back so the keywords stay delimited; the separators are then turned into regex alternation.
var rekeywords = "(" + keywords.replace(/(es|ing|ed|d|s|e)?\b(\s*,\s*|$)/ig,"(es|ing|ed|d|s|e)?$2").replace(/\s*,\s*/g,"|") + ")";
// "\\b" must be written with an escaped backslash: inside a string literal "\b" is a backspace character, not a word boundary.
// The lookahead accepts a match only if the next angle bracket is "<" or no ">" follows at all, i.e. the match is not inside a tag.
var keyrex = new RegExp("(#?\\b" + rekeywords + "\\b)(?=[^>]*?<[^>]*>|(?![^>]*>))","igm");
console.log(keyrex);
// $1 re-inserts the matched text, so the casing found in the input is preserved.
document.getElementById('tekt').value = document.getElementById('tekt').value.replace(keyrex,"<span class=\"search-term\">$1</span>");
编辑
This is just about perfect. Do you know how to slightly modify it so the keyword in thiskeyword.com would also be highlighted?
更改此行
// "\\b" must be written with an escaped backslash: inside a string literal "\b" is a backspace character, not a word boundary.
var keyrex = new RegExp("(#?\\b" + rekeywords + "\\b)(?=[^>]*?<[^>]*>|(?![^>]*>))","igm");
改为(我所做的只是删除两个 \b):
// No word-boundary anchors around the keyword group, so a keyword is also matched when it occurs inside a longer word.
var keyrex = new RegExp("(#?" + rekeywords + ")(?=[^>]*?<[^>]*>|(?![^>]*>))","igm")
但请注意,您会遇到 smiles 之类的问题:如果用户搜索 mile,最终会变成 s<span class="search-term">mile</span>s,而正则表达式无法解决这个问题。正则表达式对单词的定义只是字母数字字符序列,并没有字典可查。
我正在使用 Twitter API 从 Twitter 检索推文并在我自己的客户端中显示它们。
但是,我在正确突出显示正确的搜索字词时遇到了一些困难。我想要如下效果:
我尝试在 JS 中执行此操作的方法是使用一个名为 highlightSearchTerms() 的函数,该函数将推文的文本和一组要加粗的关键字作为参数。它返回修正后的推文文本。我通过将关键字包裹在带有 class .search-term 的 <span> 中来加粗关键字。
我有很多问题,包括:
- 运行 简单替换不保留大小写
- href 标签中的关键字存在很多冲突
- 如果我尝试使用替换执行 for 循环,我不知道如何只修改不在 href 中的搜索词,而且我还没有用上面的跨度包装
我希望能够处理的示例推文:
Input:
This is a keyword. This is a <a href="http://search.twitter.com/q=%23keyword">
#keyword</a> with a hashtag. This is a link with kEyWoRd:
<a href="http://thiskeyword.com">http://thiskeyword.com</a>.
Expected Output:
This is a
<span class="search-term">keyword</span>
. This is a <a href="http://search.twitter.com/q=%23keyword"> #
<span class="search-term">keyword</span>
</a> with a hashtag. This is a link with
<span class="search-term">kEyWoRd</span>
:<a href="http://thiskeyword.com">http://this
<span class="search-term">keyword.com</span>
</a>.
我已经尝试了很多东西,但不幸的是我无法找到解决问题的正确方法。任何建议都将不胜感激。
这是我的代码,适用于某些情况,但最终没有达到我的要求。当关键字位于 link 的后半部分(例如 http://twitter.com/this_keyword)时,它无法处理。有时它也会奇怪地突出显示关键字前的 2 个字符。我估计最佳解决方案不会与我的代码太相似。
/**
 * Wraps occurrences of each keyword in `text` with
 * <span class='search-term'>...</span>, re-inserting the text exactly as it
 * was matched so the original casing is kept.
 *
 * Known limitations: keywords are concatenated into regular expressions
 * unescaped, and the prefix filter below is a character class rather than an
 * alternation of literal prefixes.
 *
 * @param {string} text - Tweet text to process.
 * @param {string[]} keywords - Search terms to highlight.
 * @returns {string} The text with matched keywords wrapped in spans.
 */
function _highlightSearchTerms(text, keywords){
    for (var i=0;i<keywords.length;i++) {
        // create regex to find all instances of the keyword, catch the links that potentially come before so we can filter them out in the next step
        // NOTE: the square brackets form a character class, so this consumes any run of those individual characters before the keyword, not the literal prefixes
        var searchString = new RegExp("[http://twitter.com/||q=%23]*"+keywords[i], "ig");
        // create an array of all the matched keyword terms in the tweet, we can't simply run a replace all as we need them to retain their initial case
        var keywordOccurencesInitial = text.match(searchString);
        // keep only the occurrences that do not carry a link/hashtag-query prefix
        var keywordOccurences = [];
        if (keywordOccurencesInitial != null) {
            for(var i3=0;i3<keywordOccurencesInitial.length;i3++){
                if (keywordOccurencesInitial[i3].indexOf("http://twitter.com/") > -1 || keywordOccurencesInitial[i3].indexOf("q=%23") > -1)
                    continue;
                else
                    keywordOccurences.push(keywordOccurencesInitial[i3]);
            }
        }
        // replace our matches with search term
        // the regex should ensure to NOT catch terms we've already wrapped in the span:
        // an optional prefix is captured in group 1 to emulate a negative lookbehind
        if (keywordOccurences != null) {
            for(var i2=0;i2<keywordOccurences.length;i2++){
                var searchString2 = new RegExp("(q=%23||http://twitter.com/||<span class='search-term'>)?"+keywordOccurences[i2].trim(), "g"); // don't replace what we've alrdy replaced
                text = text.replace(searchString2,
                    function($0,$1){
                        // $1 is set when an excluded prefix precedes the keyword: leave the whole match ($0) untouched
                        return $1?$0:"<span class='search-term'>"+keywordOccurences[i2].trim()+"</span>";
                    });
            }
        }
    }
    return text;
}
下面是您或许可以使用的方法:
// Highlights every keyword in the value of the element with id 'tekt' that is not inside an HTML tag.
var getv = document.getElementById('tekt').value;
var keywords = "keyword,big elephant"; // comma delimited keyword list
var rekeywords = "(" + keywords.replace(/\, ?/ig,"|") + ")"; // wraps keywords in ( and ), and changes , to a pipe (character for regex alternation)
// "\\b" must be written with an escaped backslash: inside a string literal "\b" is a backspace character, not a word boundary.
// The lookahead accepts a match only if the next angle bracket is "<" or no ">" follows at all, i.e. the match is not inside a tag.
var keyrex = new RegExp("(#?\\b" + rekeywords + "\\b)(?=[^>]*?<[^>]*>|(?![^>]*>))","igm");
alert(keyrex);
// $1 re-inserts the matched text, so the casing found in the input is preserved.
document.getElementById('tekt').value = document.getElementById('tekt').value.replace(keyrex,"<span class=\"search-term\">$1</span>");
下面是一种尝试处理单词变形的变体。如果关键字以 ed、es、s、ing 等结尾,它会将该后缀去掉,并且在查找单词末尾的单词边界时,也会匹配以这些常见后缀结尾的单词。它并不完美,例如 ride 的过去式是 rode。如果不想让自己面临大量误报,几乎不可能用正则表达式来解决这个问题。
// Variant that also matches common inflected forms (es, ing, ed, d, s, e) of each keyword.
var getv = document.getElementById('tekt').value;
var keywords = "keywords,big elephant";
// Strip a trailing common suffix from each keyword and append an optional suffix group instead.
// $2 puts the captured separator back so the keywords stay delimited; the separators are then turned into regex alternation.
var rekeywords = "(" + keywords.replace(/(es|ing|ed|d|s|e)?\b(\s*,\s*|$)/ig,"(es|ing|ed|d|s|e)?$2").replace(/\s*,\s*/g,"|") + ")";
// "\\b" must be written with an escaped backslash: inside a string literal "\b" is a backspace character, not a word boundary.
// The lookahead accepts a match only if the next angle bracket is "<" or no ">" follows at all, i.e. the match is not inside a tag.
var keyrex = new RegExp("(#?\\b" + rekeywords + "\\b)(?=[^>]*?<[^>]*>|(?![^>]*>))","igm");
console.log(keyrex);
// $1 re-inserts the matched text, so the casing found in the input is preserved.
document.getElementById('tekt').value = document.getElementById('tekt').value.replace(keyrex,"<span class=\"search-term\">$1</span>");
编辑
This is just about perfect. Do you know how to slightly modify it so the keyword in thiskeyword.com would also be highlighted?
更改此行
// "\\b" must be written with an escaped backslash: inside a string literal "\b" is a backspace character, not a word boundary.
var keyrex = new RegExp("(#?\\b" + rekeywords + "\\b)(?=[^>]*?<[^>]*>|(?![^>]*>))","igm");
改为(我所做的只是删除两个 \b):
// No word-boundary anchors around the keyword group, so a keyword is also matched when it occurs inside a longer word.
var keyrex = new RegExp("(#?" + rekeywords + ")(?=[^>]*?<[^>]*>|(?![^>]*>))","igm")
但请注意,您会遇到 smiles 之类的问题:如果用户搜索 mile,最终会变成 s<span class="search-term">mile</span>s,而正则表达式无法解决这个问题。正则表达式对单词的定义只是字母数字字符序列,并没有字典可查。