JavaScript Unicode 正则表达式 - 字符类中的范围顺序颠倒(Range out of order in character class)

JavaScript Unicode Regex - Range out of order in character class

为什么下面的代码会报'Range out of order in character class'错误?

// Minimal reproduction for the reported
// "Range out of order in character class" error.

// Lower and upper bounds of the {min,max} quantifier used in the pattern below.
var min_wordsafe_length = 1;
var max_length = 20;
// Sample input; this snippet declares it but never applies the regex to it.
var string = 'some-slug-like-string-with-!@£!%-special-chars-';

// Body of a character class listing word-boundary characters, taken from
// Drupal's PREG_CLASS_UNICODE_WORD_BOUNDARY and still written with PCRE-style
// \x{...} code point escapes. The fragments are joined into a single string
// that is placed between "[" and "]" when the pattern is built.
// NOTE(review): the accompanying text says each backslash was doubled
// (\\x{...}) when porting; the single backslashes shown here may be an
// extraction artifact -- confirm against the original post.
var PREG_CLASS_UNICODE_WORD_BOUNDARY = [
  '\x{0}-\x{2F}\x{3A}-\x{40}\x{5B}-\x{60}\x{7B}-\x{A9}\x{AB}-\x{B1}\x{B4}',
  '\x{B6}-\x{B8}\x{BB}\x{BF}\x{D7}\x{F7}\x{2C2}-\x{2C5}\x{2D2}-\x{2DF}',
  '\x{2E5}-\x{2EB}\x{2ED}\x{2EF}-\x{2FF}\x{375}\x{37E}-\x{385}\x{387}\x{3F6}',
  '\x{482}\x{55A}-\x{55F}\x{589}-\x{58A}\x{5BE}\x{5C0}\x{5C3}\x{5C6}',
  '\x{5F3}-\x{60F}\x{61B}-\x{61F}\x{66A}-\x{66D}\x{6D4}\x{6DD}\x{6E9}',
  '\x{6FD}-\x{6FE}\x{700}-\x{70F}\x{7F6}-\x{7F9}\x{830}-\x{83E}',
  '\x{964}-\x{965}\x{970}\x{9F2}-\x{9F3}\x{9FA}-\x{9FB}\x{AF1}\x{B70}',
  '\x{BF3}-\x{BFA}\x{C7F}\x{CF1}-\x{CF2}\x{D79}\x{DF4}\x{E3F}\x{E4F}',
  '\x{E5A}-\x{E5B}\x{F01}-\x{F17}\x{F1A}-\x{F1F}\x{F34}\x{F36}\x{F38}',
  '\x{F3A}-\x{F3D}\x{F85}\x{FBE}-\x{FC5}\x{FC7}-\x{FD8}\x{104A}-\x{104F}',
  '\x{109E}-\x{109F}\x{10FB}\x{1360}-\x{1368}\x{1390}-\x{1399}\x{1400}',
  '\x{166D}-\x{166E}\x{1680}\x{169B}-\x{169C}\x{16EB}-\x{16ED}',
  '\x{1735}-\x{1736}\x{17B4}-\x{17B5}\x{17D4}-\x{17D6}\x{17D8}-\x{17DB}',
  '\x{1800}-\x{180A}\x{180E}\x{1940}-\x{1945}\x{19DE}-\x{19FF}',
  '\x{1A1E}-\x{1A1F}\x{1AA0}-\x{1AA6}\x{1AA8}-\x{1AAD}\x{1B5A}-\x{1B6A}',
  '\x{1B74}-\x{1B7C}\x{1C3B}-\x{1C3F}\x{1C7E}-\x{1C7F}\x{1CD3}\x{1FBD}',
  '\x{1FBF}-\x{1FC1}\x{1FCD}-\x{1FCF}\x{1FDD}-\x{1FDF}\x{1FED}-\x{1FEF}',
  '\x{1FFD}-\x{206F}\x{207A}-\x{207E}\x{208A}-\x{208E}\x{20A0}-\x{20B8}',
  '\x{2100}-\x{2101}\x{2103}-\x{2106}\x{2108}-\x{2109}\x{2114}',
  '\x{2116}-\x{2118}\x{211E}-\x{2123}\x{2125}\x{2127}\x{2129}\x{212E}',
  '\x{213A}-\x{213B}\x{2140}-\x{2144}\x{214A}-\x{214D}\x{214F}',
  '\x{2190}-\x{244A}\x{249C}-\x{24E9}\x{2500}-\x{2775}\x{2794}-\x{2B59}',
  '\x{2CE5}-\x{2CEA}\x{2CF9}-\x{2CFC}\x{2CFE}-\x{2CFF}\x{2E00}-\x{2E2E}',
  '\x{2E30}-\x{3004}\x{3008}-\x{3020}\x{3030}\x{3036}-\x{3037}',
  '\x{303D}-\x{303F}\x{309B}-\x{309C}\x{30A0}\x{30FB}\x{3190}-\x{3191}',
  '\x{3196}-\x{319F}\x{31C0}-\x{31E3}\x{3200}-\x{321E}\x{322A}-\x{3250}',
  '\x{3260}-\x{327F}\x{328A}-\x{32B0}\x{32C0}-\x{33FF}\x{4DC0}-\x{4DFF}',
  '\x{A490}-\x{A4C6}\x{A4FE}-\x{A4FF}\x{A60D}-\x{A60F}\x{A673}\x{A67E}',
  '\x{A6F2}-\x{A716}\x{A720}-\x{A721}\x{A789}-\x{A78A}\x{A828}-\x{A82B}',
  '\x{A836}-\x{A839}\x{A874}-\x{A877}\x{A8CE}-\x{A8CF}\x{A8F8}-\x{A8FA}',
  '\x{A92E}-\x{A92F}\x{A95F}\x{A9C1}-\x{A9CD}\x{A9DE}-\x{A9DF}',
  '\x{AA5C}-\x{AA5F}\x{AA77}-\x{AA79}\x{AADE}-\x{AADF}\x{ABEB}',
  '\x{E000}-\x{F8FF}\x{FB29}\x{FD3E}-\x{FD3F}\x{FDFC}-\x{FDFD}',
  '\x{FE10}-\x{FE19}\x{FE30}-\x{FE6B}\x{FEFF}-\x{FF0F}\x{FF1A}-\x{FF20}',
  '\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}\x{FFE0}-\x{FFFD}'].join('');

// Build the pattern: a captured run of min..max characters anchored at the
// start of the input, followed by one character from the class above.
// This is the only place the pattern is compiled, so the reported error
// is raised by this constructor call.
new RegExp("^(.{" + min_wordsafe_length + ","+ max_length +"})[" + PREG_CLASS_UNICODE_WORD_BOUNDARY + "]");

http://jsfiddle.net/52zz0drz/

错误似乎与 PREG_CLASS_UNICODE_WORD_BOUNDARY 有关,但我无法确定范围在哪里出现问题。

我正在尝试将以下行从 Drupal 的 truncate_utf8 函数移植到 JavaScript:

// Find the last word boundary, if there is one within $min_wordsafe_length
// to $max_length characters. preg_match() is always greedy, so it will
// find the longest string possible.
// The trailing /u modifier puts PCRE into UTF-8 mode, which the \x{...}
// code point escapes in PREG_CLASS_UNICODE_WORD_BOUNDARY depend on.
// $found receives preg_match()'s return value; on a match, $matches[1]
// holds the captured prefix that precedes the boundary character.
$found = preg_match('/^(.{' . $min_wordsafe_length . ',' . $max_length . '})[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . ']/u', $string, $matches);

我在 JavaScript 中对 Drupal 的 PREG_CLASS_UNICODE_WORD_BOUNDARY 所做的唯一更改,是把 \x 前的反斜杠写成两个(即 \\x),以防止反斜杠在字符串传给正则表达式之前丢失,并将 PHP 的字符串连接替换为数组的 join。我没有做任何会改变范围顺序的改动。

如何修复此代码?我需要 JavaScript 正则表达式来尽可能匹配 PHP 正则表达式的行为。

JavaScript 在正则表达式中使用 \uhhhh 语法(恰好 4 位十六进制数字)来指定一个 UTF-16 码元。对于 BMP(基本多文种平面)范围内的字符(D800–DFFF 这一代理区除外),字符的码位直接对应一个数值相同的 UTF-16 码元。这里的所有字符都属于这种情况,因此我们无需处理代理对及其各种特殊情况。

在这种情况下,只需将 PCRE(PHP 所用)的 \x{h...hh} 语法替换为 \uhhhh 语法,不足 4 位十六进制数字时在前面补 0(例如 \x{2F} 写成 \u002F):

// Body of a regex character class that matches Unicode word-boundary
// characters (BMP only). Each PCRE \x{h..h} escape is written as a
// zero-padded JavaScript \uhhhh escape instead.
//
// The string literal resolves the escapes, so the value holds the actual
// characters and "-" range separators. None of the range endpoints is
// "]", "\" or "^", so the value can be placed directly between "[" and "]"
// when building a RegExp.
var PREG_CLASS_UNICODE_WORD_BOUNDARY =
  '\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u00A9\u00AB-\u00B1\u00B4' +
  '\u00B6-\u00B8\u00BB\u00BF\u00D7\u00F7\u02C2-\u02C5\u02D2-\u02DF' +
  '\u02E5-\u02EB\u02ED\u02EF-\u02FF\u0375\u037E-\u0385\u0387\u03F6' +
  '\u0482\u055A-\u055F\u0589-\u058A\u05BE\u05C0\u05C3\u05C6' +
  '\u05F3-\u060F\u061B-\u061F\u066A-\u066D\u06D4\u06DD\u06E9' +
  '\u06FD-\u06FE\u0700-\u070F\u07F6-\u07F9\u0830-\u083E' +
  '\u0964-\u0965\u0970\u09F2-\u09F3\u09FA-\u09FB\u0AF1\u0B70' +
  '\u0BF3-\u0BFA\u0C7F\u0CF1-\u0CF2\u0D79\u0DF4\u0E3F\u0E4F' +
  '\u0E5A-\u0E5B\u0F01-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38' +
  '\u0F3A-\u0F3D\u0F85\u0FBE-\u0FC5\u0FC7-\u0FD8\u104A-\u104F' +
  '\u109E-\u109F\u10FB\u1360-\u1368\u1390-\u1399\u1400' +
  '\u166D-\u166E\u1680\u169B-\u169C\u16EB-\u16ED' +
  '\u1735-\u1736\u17B4-\u17B5\u17D4-\u17D6\u17D8-\u17DB' +
  '\u1800-\u180A\u180E\u1940-\u1945\u19DE-\u19FF' +
  '\u1A1E-\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B6A' +
  '\u1B74-\u1B7C\u1C3B-\u1C3F\u1C7E-\u1C7F\u1CD3\u1FBD' +
  '\u1FBF-\u1FC1\u1FCD-\u1FCF\u1FDD-\u1FDF\u1FED-\u1FEF' +
  '\u1FFD-\u206F\u207A-\u207E\u208A-\u208E\u20A0-\u20B8' +
  '\u2100-\u2101\u2103-\u2106\u2108-\u2109\u2114' +
  '\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E' +
  '\u213A-\u213B\u2140-\u2144\u214A-\u214D\u214F' +
  '\u2190-\u244A\u249C-\u24E9\u2500-\u2775\u2794-\u2B59' +
  '\u2CE5-\u2CEA\u2CF9-\u2CFC\u2CFE-\u2CFF\u2E00-\u2E2E' +
  '\u2E30-\u3004\u3008-\u3020\u3030\u3036-\u3037' +
  '\u303D-\u303F\u309B-\u309C\u30A0\u30FB\u3190-\u3191' +
  '\u3196-\u319F\u31C0-\u31E3\u3200-\u321E\u322A-\u3250' +
  '\u3260-\u327F\u328A-\u32B0\u32C0-\u33FF\u4DC0-\u4DFF' +
  '\uA490-\uA4C6\uA4FE-\uA4FF\uA60D-\uA60F\uA673\uA67E' +
  '\uA6F2-\uA716\uA720-\uA721\uA789-\uA78A\uA828-\uA82B' +
  '\uA836-\uA839\uA874-\uA877\uA8CE-\uA8CF\uA8F8-\uA8FA' +
  '\uA92E-\uA92F\uA95F\uA9C1-\uA9CD\uA9DE-\uA9DF' +
  '\uAA5C-\uAA5F\uAA77-\uAA79\uAADE-\uAADF\uABEB' +
  '\uE000-\uF8FF\uFB29\uFD3E-\uFD3F\uFDFC-\uFDFD' +
  '\uFE10-\uFE19\uFE30-\uFE6B\uFEFF-\uFF0F\uFF1A-\uFF20' +
  '\uFF3B-\uFF40\uFF5B-\uFF65\uFFE0-\uFFFD';