如何在 JavaScript 正则表达式中使用或不使用负向先行断言来匹配带有标记的文本
How to match text with a token expression, with or without negative lookahead, in JavaScript regex
假设有一个以逗号分隔的文本字符串,其中每段文本后面可能跟着(也可能没有)一个来自下面列表、同样以逗号分隔的标记
// Suffix tokens that may follow a company name as a separate comma-separated item.
var tokens = ["Inc.", "Ltd", "LLC"];
所以字符串就像
// Sample input: company names, some followed by a token from `tokens`.
var companies = 'Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure, LLC';
我想获取这个数组作为输出
// Desired result: every company name, joined with its token when it has one.
var companiesList = ['Apple Inc.', 'Microsoft Inc.', 'Buzzfeed', 'Treasure LLC'];
所以我首先构造了一个这样的 RegExp:
// Global, case-insensitive pattern: "<name>, <token>" with the name in group 1
// and the token in group 2. The pattern is built from a string, so the
// backslash must be doubled: a bare "\s" in a string literal is just "s".
var regex = new RegExp("([a-zA-Z&/? ]*),\\s+(" + token + ")", "gi");
得到匹配项之后,我再用如下正则表达式在每个匹配项中搜索:
// Non-global, case-insensitive variant, so String#match exposes the capture
// groups (name in group 1, token in group 2). The pattern is built from a
// string, so the backslash must be doubled: a bare "\s" is just "s".
var regex = new RegExp("([a-zA-Z&/? ]*),\\s+(" + item + ")", "i");
对于每个标记:
// For each token, find every "<name>, <token>" pair in `companies` and append
// it to `companiesList` as "<name> <token>".
tokens.forEach((item) => {
    // The token is spliced into a pattern, so its regex metacharacters
    // (e.g. the "." in "Inc.") are escaped to be matched literally.
    const escaped = item.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The pattern is built from a string, so the backslash is doubled:
    // a bare "\s" in a string literal is just the letter "s".
    const pattern = "([a-zA-Z&/? ]*),\\s+(" + escaped + ")";
    const regex = new RegExp(pattern, "gi");
    const matches = companies.match(regex) || [];
    console.log(item, regex.toString(), matches);

    // Non-global twin of `regex`: with it, String#match returns the capture
    // groups. Built once per token instead of once per match.
    const single = new RegExp(pattern, "i");
    matches.forEach((m) => {
        const match = m.match(single);
        if (match) {
            companiesList.push(match[1].trim() + ' ' + match[2].trim());
        }
    });
});
通过这种方式,我可以捕获令牌并连接匹配组 1 和 2。
// Demo: extract every "<name>, <token>" pair from `companies` and store it as
// "<name> <token>". Names that are not followed by a token (e.g. "Buzzfeed")
// are intentionally NOT captured by this approach.
var tokens = ['inc.', 'ltd', 'llc'],
    companies = "Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure, LLC",
    companiesList = [];

tokens.forEach((item) => {
    // The token is spliced into a pattern, so its regex metacharacters
    // (e.g. the "." in "inc.") are escaped to be matched literally.
    const escaped = item.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The pattern is built from a string, so the backslash is doubled:
    // a bare "\s" in a string literal is just the letter "s".
    const pattern = "([a-zA-Z&/? ]*),\\s+(" + escaped + ")";
    const regex = new RegExp(pattern, "gi");
    const matches = companies.match(regex) || [];
    console.log(item, regex.toString(), matches);

    // Non-global twin of `regex`: with it, String#match returns the capture
    // groups. Built once per token instead of once per match.
    const single = new RegExp(pattern, "i");
    matches.forEach((m) => {
        const match = m.match(single);
        if (match) {
            // Group 1 is the company name, group 2 the token as written in the input.
            companiesList.push(match[1].trim() + ' ' + match[2].trim());
        }
    });
});
console.log(companiesList);
问题是,这样会漏掉后面没有跟标记的逗号分隔文本,例如:Buzzfeed。
我的想法是在负向先行断言中使用非捕获组(关于正则表达式匹配中的非捕获组,参见此处)
// Attempt at a negative lookahead: anchored with ^...$, it can match only the
// whole input, and only if "ltd" (case-insensitive) appears nowhere in it.
// The leading group sits before ^, so it can only capture the empty string.
/([a-zA-Z]*)^(?:(?!ltd).)+$/gi
但是这样一来,只要输入字符串中存在该标记,我就得不到任何匹配:
"Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure LLC".match( /([a-zA-Z]*)^(?:(?!llc).)+$/gi )
而我只想匹配不带标记的文本,也就是说,我想得到与之前相反的结果:
["Buzzfeed"]
那么,如何取反/修改前面的代码,使其在两种情况下都能工作,最终得到合并后的数组:
// Combined result wanted: names with a token and names without one.
var companiesList = ['Apple Inc.', 'Microsoft Inc.', 'Buzzfeed', 'Treasure LLC'];
用 reduce 来做会不会容易很多?一边遍历,一边对照标记列表进行检查:
var tokens = ['Inc.', 'Ltd', 'LLC'];
var companies = "Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure, LLC";

// Walk the comma-separated parts in order. A part that is a known token is
// appended to the previous name; any other part starts a new entry.
var result = companies.split(',').reduce((names, part) => {
    const text = part.trim();
    const isToken = tokens.indexOf(text) !== -1;
    // A token with no preceding name (first item) is kept as its own entry
    // rather than being written to index -1 of the accumulator.
    if (isToken && names.length > 0) {
        names[names.length - 1] += ' ' + text;
    } else {
        names.push(text);
    }
    return names;
}, []);
console.log(result);
您可以使用正则表达式进行拆分。
var companies = "Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure, LLC";

// Split only at ", " separators that are not followed by a token, then turn
// the ", " that remains inside each entry into a single space.
console.log(
    companies
        .split(/,\s(?!Inc\.|Ltd|LLC)/i)
        .map((entry) => entry.replace(', ', ' '))
);
假设有一个以逗号分隔的文本字符串,其中每段文本后面可能跟着(也可能没有)一个来自下面列表、同样以逗号分隔的标记
// Suffix tokens that may follow a company name as a separate comma-separated item.
var tokens = ["Inc.", "Ltd", "LLC"];
所以字符串就像
// Sample input: company names, some followed by a token from `tokens`.
var companies = 'Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure, LLC';
我想获取这个数组作为输出
// Desired result: every company name, joined with its token when it has one.
var companiesList = ['Apple Inc.', 'Microsoft Inc.', 'Buzzfeed', 'Treasure LLC'];
所以我首先构造了一个这样的 RegExp:
// Global, case-insensitive pattern: "<name>, <token>" with the name in group 1
// and the token in group 2. The pattern is built from a string, so the
// backslash must be doubled: a bare "\s" in a string literal is just "s".
var regex = new RegExp("([a-zA-Z&/? ]*),\\s+(" + token + ")", "gi");
得到匹配项之后,我再用如下正则表达式在每个匹配项中搜索:
var regex=new RegExp("([a-zA-Z&/? ]*),\s+("+item+")", "i" )
对于每个标记:
// For each token, find every "<name>, <token>" pair in `companies` and append
// it to `companiesList` as "<name> <token>".
tokens.forEach((item) => {
    // The token is spliced into a pattern, so its regex metacharacters
    // (e.g. the "." in "Inc.") are escaped to be matched literally.
    const escaped = item.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The pattern is built from a string, so the backslash is doubled:
    // a bare "\s" in a string literal is just the letter "s".
    const pattern = "([a-zA-Z&/? ]*),\\s+(" + escaped + ")";
    const regex = new RegExp(pattern, "gi");
    const matches = companies.match(regex) || [];
    console.log(item, regex.toString(), matches);

    // Non-global twin of `regex`: with it, String#match returns the capture
    // groups. Built once per token instead of once per match.
    const single = new RegExp(pattern, "i");
    matches.forEach((m) => {
        const match = m.match(single);
        if (match) {
            companiesList.push(match[1].trim() + ' ' + match[2].trim());
        }
    });
});
通过这种方式,我可以捕获令牌并连接匹配组 1 和 2。
// Demo: extract every "<name>, <token>" pair from `companies` and store it as
// "<name> <token>". Names that are not followed by a token (e.g. "Buzzfeed")
// are intentionally NOT captured by this approach.
var tokens = ['inc.', 'ltd', 'llc'],
    companies = "Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure, LLC",
    companiesList = [];

tokens.forEach((item) => {
    // The token is spliced into a pattern, so its regex metacharacters
    // (e.g. the "." in "inc.") are escaped to be matched literally.
    const escaped = item.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The pattern is built from a string, so the backslash is doubled:
    // a bare "\s" in a string literal is just the letter "s".
    const pattern = "([a-zA-Z&/? ]*),\\s+(" + escaped + ")";
    const regex = new RegExp(pattern, "gi");
    const matches = companies.match(regex) || [];
    console.log(item, regex.toString(), matches);

    // Non-global twin of `regex`: with it, String#match returns the capture
    // groups. Built once per token instead of once per match.
    const single = new RegExp(pattern, "i");
    matches.forEach((m) => {
        const match = m.match(single);
        if (match) {
            // Group 1 is the company name, group 2 the token as written in the input.
            companiesList.push(match[1].trim() + ' ' + match[2].trim());
        }
    });
});
console.log(companiesList);
问题是,这样会漏掉后面没有跟标记的逗号分隔文本,例如:Buzzfeed。
我的想法是在负向先行断言中使用非捕获组(关于正则表达式匹配中的非捕获组,参见此处)
// Attempt at a negative lookahead: anchored with ^...$, it can match only the
// whole input, and only if "ltd" (case-insensitive) appears nowhere in it.
// The leading group sits before ^, so it can only capture the empty string.
/([a-zA-Z]*)^(?:(?!ltd).)+$/gi
但是这样一来,只要输入字符串中存在该标记,我就得不到任何匹配:
"Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure LLC".match( /([a-zA-Z]*)^(?:(?!llc).)+$/gi )
而我只想匹配不带标记的文本,也就是说,我想得到与之前相反的结果:
["Buzzfeed"]
那么,如何取反/修改前面的代码,使其在两种情况下都能工作,最终得到合并后的数组:
// Combined result wanted: names with a token and names without one.
var companiesList = ['Apple Inc.', 'Microsoft Inc.', 'Buzzfeed', 'Treasure LLC'];
用 reduce 来做会不会容易很多?一边遍历,一边对照标记列表进行检查:
var tokens = ['Inc.', 'Ltd', 'LLC'];
var companies = "Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure, LLC";

// Walk the comma-separated parts in order. A part that is a known token is
// appended to the previous name; any other part starts a new entry.
var result = companies.split(',').reduce((names, part) => {
    const text = part.trim();
    const isToken = tokens.indexOf(text) !== -1;
    // A token with no preceding name (first item) is kept as its own entry
    // rather than being written to index -1 of the accumulator.
    if (isToken && names.length > 0) {
        names[names.length - 1] += ' ' + text;
    } else {
        names.push(text);
    }
    return names;
}, []);
console.log(result);
您可以使用正则表达式进行拆分。
var companies = "Apple, Inc., Microsoft, Inc., Buzzfeed, Treasure, LLC";

// Split only at ", " separators that are not followed by a token, then turn
// the ", " that remains inside each entry into a single space.
console.log(
    companies
        .split(/,\s(?!Inc\.|Ltd|LLC)/i)
        .map((entry) => entry.replace(', ', ' '))
);