node/io 中的复杂正则表达式提取
Complex regex extraction in node/io
我正在尝试使用 twitter-text 这个库。
它包含大量在运行时计算生成的复杂正则表达式。我并不需要整个库,所以决定只使用其中的几个正则表达式。
所以我有一个脚本来提取它并保存到另一个 js 文件:
var _ = require('lodash');
var fs = require('fs');
var twitterText = require('twitter-text'); // 1.11.0 (latest)
// Header of the generated module; each array entry becomes one output line.
var content = [
  '/**',
  ' * @preserve https://github.com/twitter/twitter-text-js',
  ' */',
  'var regexps = { };'
];

// Exported property name -> key looked up in twitterText.regexen.
var wantedRegexps = {
  url: 'extractUrl', // <- this regexp is the problem
  hash: 'validHashtag',
  mention: 'validMentionOrList'
};

_.forEach(wantedRegexps, function (sourceKey, exportName) {
  var pattern = twitterText.regexen[sourceKey];
  if (pattern === undefined) {
    throw new Error('Failed to find regexp ' + sourceKey);
  }

  // The RegExp is stringified to its /body/flags form and emitted verbatim
  // as a regex literal in the generated file.
  var assignment = ['regexps.', exportName, ' = ', String(pattern), ';'].join('');
  content.push(assignment);
});

content.push('export default regexps;');

// NOTE(review): targetPath and callback are not defined in this snippet;
// presumably they are supplied by the surrounding script.
fs.writeFile(targetPath, content.join('\n'), {}, callback);
于是结果截图:
如您所见,url 正则表达式已损坏,我无法使用模块。
> require('./result');
.../result.js:5
regexps.url = /(((?:[^A-Za-z0-9@@$##-]|^))((https?:\/\/)?((?:(?:(?:[^\/\!
^
SyntaxError: Invalid regular expression: missing /
at exports.runInThisContext (vm.js:53:16)
at Module._compile (module.js:393:25)
at Object.Module._extensions..js (module.js:428:10)
at Module.load (module.js:335:32)
at Function.Module._load (module.js:290:12)
at Module.require (module.js:345:17)
at require (module.js:364:17)
at repl:1:1
at REPLServer.defaultEval (repl.js:124:27)
at bound (domain.js:254:14)
有什么想法吗?我已经在最新版的 Node 和 io.js 上都试过了。
下面这个脚本似乎可行(它使用了 js-string-escape 这个包):
var _ = require('lodash');
var fs = require('fs');
var twitterText = require('twitter-text'); // 1.11.0 (latest)
// Declared with `var` so the script does not leak an implicit global.
var jsStringEscape = require('js-string-escape');

// Lines of the generated module; joined with '\n' when written to disk.
var content = [
  '/**',
  ' * @preserve https://github.com/twitter/twitter-text-js',
  ' */',
  'var regexps = { };'
];

// Keys are the names exported by the generated module; values are the
// corresponding keys in twitterText.regexen.
_.forEach({
  url: 'extractUrl', // <- this regexp is the problem
  hash: 'validHashtag',
  mention: 'validMentionOrList'
}, function(twitterTextRegexpName, regexpName) {
  var regexp = twitterText.regexen[twitterTextRegexpName];
  if (undefined === regexp) {
    throw new Error('Failed to find regexp ' + twitterTextRegexpName);
  }

  // Escape the /body/flags text first so it can be embedded inside a
  // double-quoted string literal in the generated file. Escaping before
  // splitting also means `.` below never has to cross a raw line terminator.
  var regexpStr = jsStringEscape(regexp.toString());

  // Greedy `.*` runs to the LAST slash, so group 1 is the body and group 2
  // the flags. Anchored so a malformed string fails loudly instead of
  // producing a truncated pattern.
  var regexpRegexp = /^\/(.*)\/([^\/]*)$/;
  var result = regexpStr.match(regexpRegexp);
  if (result === null) {
    throw new Error('Failed to parse regexp ' + twitterTextRegexpName);
  }
  var regexpBody = result[1];
  var regexpModifiers = result[2];

  // Emit a RegExp constructor call rather than a regex literal.
  content.push(
    'regexps.' + regexpName + ' = new RegExp("' + regexpBody + '","' + regexpModifiers + '");'
  );
});

content.push('module.exports = regexps;');

// fs.writeFile requires a callback; without one, current Node versions throw
// and older ones silently drop write errors.
fs.writeFile("./out.js", content.join('\n'), function(err) {
  if (err) {
    throw err;
  }
});
输出:
node
> var r = require("./out.js");
undefined
> "www.google.com".match(r.url)
[ 'www.google.com' ]
> "something".match(r.url)
null
生成的 url 正则表达式通过了原始包中的这些测试:
Test for invalid characters in url(结果应为空):
var r = require("./out.js");

// Characters U+202A through U+202E, each inserted into the middle of the
// host name below.
var invalidChars = ['\u202A', '\u202B', '\u202C', '\u202D', '\u202E'];

// Print the match result for a URL containing each character.
invalidChars.forEach(function (invalidChar) {
  var candidate = "http://twitt" + invalidChar + "er.com";
  console.log(candidate.match(r.url));
});
// Result:
// null
// null
// null
// null
// null
> var r = require("./out.js");
> var message_with_hyphenated_url = "Message with hyphenated-url.com";
undefined
> var message_with_www_hyphenated_url = "Message with www.123-hyphenated-url.com";
undefined
> message_with_hyphenated_url.match(r.url)
[ ' hyphenated-url.com' ]
> message_with_www_hyphenated_url.match(r.url)
[ ' www.123-hyphenated-url.com' ]
// Load original regex:
> var twitterText = require('twitter-text');
undefined
> var rr = twitterText.regexen["extractUrl"];
undefined
> message_with_hyphenated_url.match(rr)
[ ' hyphenated-url.com' ]
> message_with_www_hyphenated_url.match(rr)
[ ' www.123-hyphenated-url.com' ]
// The same results as with original regex.
我正在尝试使用 twitter-text 这个库。它包含大量在运行时计算生成的复杂正则表达式。我并不需要整个库,所以决定只使用其中的几个正则表达式。
所以我有一个脚本来提取它并保存到另一个 js 文件:
var _ = require('lodash');
var fs = require('fs');
var twitterText = require('twitter-text'); // 1.11.0 (latest)
// Header of the generated module; each array entry becomes one output line.
var content = [
  '/**',
  ' * @preserve https://github.com/twitter/twitter-text-js',
  ' */',
  'var regexps = { };'
];

// Exported property name -> key looked up in twitterText.regexen.
var wantedRegexps = {
  url: 'extractUrl', // <- this regexp is the problem
  hash: 'validHashtag',
  mention: 'validMentionOrList'
};

_.forEach(wantedRegexps, function (sourceKey, exportName) {
  var pattern = twitterText.regexen[sourceKey];
  if (pattern === undefined) {
    throw new Error('Failed to find regexp ' + sourceKey);
  }

  // The RegExp is stringified to its /body/flags form and emitted verbatim
  // as a regex literal in the generated file.
  var assignment = ['regexps.', exportName, ' = ', String(pattern), ';'].join('');
  content.push(assignment);
});

content.push('export default regexps;');

// NOTE(review): targetPath and callback are not defined in this snippet;
// presumably they are supplied by the surrounding script.
fs.writeFile(targetPath, content.join('\n'), {}, callback);
于是结果截图:
如您所见,url 正则表达式已损坏,我无法使用模块。
> require('./result');
.../result.js:5
regexps.url = /(((?:[^A-Za-z0-9@@$##-]|^))((https?:\/\/)?((?:(?:(?:[^\/\!
^
SyntaxError: Invalid regular expression: missing /
at exports.runInThisContext (vm.js:53:16)
at Module._compile (module.js:393:25)
at Object.Module._extensions..js (module.js:428:10)
at Module.load (module.js:335:32)
at Function.Module._load (module.js:290:12)
at Module.require (module.js:345:17)
at require (module.js:364:17)
at repl:1:1
at REPLServer.defaultEval (repl.js:124:27)
at bound (domain.js:254:14)
有什么想法吗?我已经在最新版的 Node 和 io.js 上都试过了。
下面这个脚本似乎可行(它使用了 js-string-escape 这个包):
var _ = require('lodash');
var fs = require('fs');
var twitterText = require('twitter-text'); // 1.11.0 (latest)
// Declared with `var` so the script does not leak an implicit global.
var jsStringEscape = require('js-string-escape');

// Lines of the generated module; joined with '\n' when written to disk.
var content = [
  '/**',
  ' * @preserve https://github.com/twitter/twitter-text-js',
  ' */',
  'var regexps = { };'
];

// Keys are the names exported by the generated module; values are the
// corresponding keys in twitterText.regexen.
_.forEach({
  url: 'extractUrl', // <- this regexp is the problem
  hash: 'validHashtag',
  mention: 'validMentionOrList'
}, function(twitterTextRegexpName, regexpName) {
  var regexp = twitterText.regexen[twitterTextRegexpName];
  if (undefined === regexp) {
    throw new Error('Failed to find regexp ' + twitterTextRegexpName);
  }

  // Escape the /body/flags text first so it can be embedded inside a
  // double-quoted string literal in the generated file. Escaping before
  // splitting also means `.` below never has to cross a raw line terminator.
  var regexpStr = jsStringEscape(regexp.toString());

  // Greedy `.*` runs to the LAST slash, so group 1 is the body and group 2
  // the flags. Anchored so a malformed string fails loudly instead of
  // producing a truncated pattern.
  var regexpRegexp = /^\/(.*)\/([^\/]*)$/;
  var result = regexpStr.match(regexpRegexp);
  if (result === null) {
    throw new Error('Failed to parse regexp ' + twitterTextRegexpName);
  }
  var regexpBody = result[1];
  var regexpModifiers = result[2];

  // Emit a RegExp constructor call rather than a regex literal.
  content.push(
    'regexps.' + regexpName + ' = new RegExp("' + regexpBody + '","' + regexpModifiers + '");'
  );
});

content.push('module.exports = regexps;');

// fs.writeFile requires a callback; without one, current Node versions throw
// and older ones silently drop write errors.
fs.writeFile("./out.js", content.join('\n'), function(err) {
  if (err) {
    throw err;
  }
});
输出:
node
> var r = require("./out.js");
undefined
> "www.google.com".match(r.url)
[ 'www.google.com' ]
> "something".match(r.url)
null
生成的 url 正则表达式通过了原始包中的这些测试:
Test for invalid characters in url(结果应为空):
var r = require("./out.js");

// Characters U+202A through U+202E, each inserted into the middle of the
// host name below.
var invalidChars = ['\u202A', '\u202B', '\u202C', '\u202D', '\u202E'];

// Print the match result for a URL containing each character.
invalidChars.forEach(function (invalidChar) {
  var candidate = "http://twitt" + invalidChar + "er.com";
  console.log(candidate.match(r.url));
});
// Result:
// null
// null
// null
// null
// null
> var r = require("./out.js");
> var message_with_hyphenated_url = "Message with hyphenated-url.com";
undefined
> var message_with_www_hyphenated_url = "Message with www.123-hyphenated-url.com";
undefined
> message_with_hyphenated_url.match(r.url)
[ ' hyphenated-url.com' ]
> message_with_www_hyphenated_url.match(r.url)
[ ' www.123-hyphenated-url.com' ]
// Load original regex:
> var twitterText = require('twitter-text');
undefined
> var rr = twitterText.regexen["extractUrl"];
undefined
> message_with_hyphenated_url.match(rr)
[ ' hyphenated-url.com' ]
> message_with_www_hyphenated_url.match(rr)
[ ' www.123-hyphenated-url.com' ]
// The same results as with original regex.