如何从字符串中提取符号、数字和单词并将它们分别存储到相应分类的数组中?
How to extract symbols, numbers and words from a string and store each into an accordingly categorized array?
如何从一个字符串中提取符号、数字、最多3个和至少4个字母的单词,并将它们分别存储到相应的分类数组中?
给定的字符串是:
// Sample sentence to categorize: it mixes words, the numbers 100 and 200, and a `+` symbol.
const string = 'There are usually 100 to 200 words + in a paragraph';
预期的响应是:
// Expected result of categorizing the sample sentence.
const numbers = ['200', '100'];
// Words with at least four letters. Entries are bare tokens without
// surrounding whitespace (the input contains `words`, not `words `).
const wordsMoreThanThreeLetters = ['There', 'words', 'paragraph', 'usually'];
const symbols = ['+'];
// Words with at most three letters.
const words = ['are', 'to', 'in', 'a'];
const string = 'There are usually 100 to 200 words + in a paragraph';

// Break the sentence into its individual characters (one array entry per
// UTF-16 code unit), which is all the original attempt achieves.
const response = string.split('');

console.log(response);
您可以为这些情况编写一个单独的函数:
const txt = 'There are usually 100 to 200 words in a paragraph';
console.log(txt);
console.log(ctrim(txt));

/**
 * Collects the whitespace-separated tokens of a text that are at most
 * `maxLength` characters long.
 *
 * Note that the check is purely length-based: short numbers such as `100`
 * are returned alongside short words.
 *
 * @param {string} txt - The text to scan.
 * @param {number} [maxLength=3] - Longest token length that is still kept.
 * @returns {string[]} The short tokens in their order of appearance.
 */
function ctrim(txt, maxLength = 3) {
  return txt
    // Split at whitespace *runs*; splitting at a single space would emit
    // empty tokens for repeated spaces.
    .split(/\s+/)
    // Leading/trailing whitespace still yields one empty token at either
    // end, which must not be reported as a "short word".
    .filter((token) => token.length > 0 && token.length <= maxLength);
}
一个有效的方法是先用 split 方法在任意空白字符序列处拆分字符串,
然后在 split 方法的结果数组上使用 reduce 方法。
reducer 函数的实现方式是根据 OP 的类别收集和聚合特定数组中的字符串项(令牌),并由辅助方法支持,例如数字和单词测试 ...
/**
 * Tests whether a value consists of digits (`\d`) only.
 *
 * @param {string} value
 * @returns {boolean}
 */
function isDigitsOnly(value) {
  return /^\d+$/.test(value);
}

/**
 * Tests whether a value consists of word characters (`\w`) only.
 *
 * @param {string} value
 * @returns {boolean}
 */
function isWord(value) {
  return /^\w+$/.test(value);
}

/**
 * Reducer which sorts a single token into one of the collector's lists:
 * `digits`, `shortWords` (at most 3 characters), `longWords` or `rest`.
 * A list is created the first time a token of its category shows up.
 *
 * @param {Object<string, string[]>} collector - Accumulator holding the lists.
 * @param {string} token - One whitespace-delimited token.
 * @returns {Object<string, string[]>} The same `collector` reference.
 */
function collectWordsDigitsAndRest(collector, token) {
  // `split(/\s+/)` emits empty strings for leading/trailing whitespace;
  // those are no tokens and must not end up in the `rest` list.
  if (token === '') {
    return collector;
  }

  let listName = 'rest';
  if (isDigitsOnly(token)) {
    listName = 'digits';
  } else if (isWord(token)) {
    listName = token.length <= 3 ? 'shortWords' : 'longWords';
  }

  (collector[listName] ??= []).push(token);
  return collector;
}
// Split the sample sentence at whitespace runs and let the reducer sort
// every token into its category list.
const categorizedTokens = 'There are usually 100 to 200 words + in a paragraph'
  .split(/\s+/)
  .reduce(collectWordsDigitsAndRest, {});

// Expose the lists under the requested names; a category which never
// received a token falls back to an empty array.
const { digits: numbers = [] } = categorizedTokens;
const { rest: symbols = [] } = categorizedTokens;
const { shortWords: words = [] } = categorizedTokens;
const { longWords: wordsMoreThanThreeLetters = [] } = categorizedTokens;

console.log({
  wordsMoreThanThreeLetters,
  words,
  numbers,
  symbols,
});
.as-console-wrapper { min-height: 100%!important; top: 0; }
当然也可以使用 matchAll,通过单个正则表达式(RegExp)一次性匹配所需的令牌;
该正则表达式使用命名捕获组,并使用 Unicode 属性转义来实现更好的国际化(i18n)覆盖。
正则表达式本身的外观和工作方式如下...
...派生自...
第一种方法的 reducer 函数必须适应第二种方法,以便相应地处理每个捕获的组...
/**
 * Reducer for regex match results: looks up which named capture group
 * (`shortWord`, `longWord`, `digit`, `rest` - checked in that order) holds
 * a value and appends that value to the corresponding collector list.
 * Matches without any captured group leave the collector untouched.
 *
 * @param {Object<string, string[]>} collector - Accumulator holding the lists.
 * @param {{ groups: Object<string, (string|undefined)> }} match - A match
 *   result carrying named capture groups.
 * @returns {Object<string, string[]>} The same `collector` reference.
 */
function collectWordsDigitsAndRest(collector, { groups }) {
  // Pairs of [capture group name, collector list name] in priority order.
  const groupToList = [
    ['shortWord', 'shortWords'],
    ['longWord', 'longWords'],
    ['digit', 'digits'],
    ['rest', 'rest'],
  ];
  const captured = groupToList.find(([groupName]) => groups[groupName]);

  if (captured) {
    const [groupName, listName] = captured;
    (collector[listName] ??= []).push(groups[groupName]);
  }
  return collector;
}
// Unicode Categories ... [https://www.regularexpressions.info/unicode.html#category]
// regex101.com ... [https://regex101.com/r/nCga5u/2]
//   (playground link from the original answer - presumably for the earlier
//    `\b`-based variant of this pattern; TODO confirm)
//
// Token pattern with one named capture group per category:
//   - digit     ... a run of Unicode numbers
//   - longWord  ... a run of at least four Unicode letters
//   - shortWord ... a run of one to three Unicode letters
//   - rest      ... any other run of non-whitespace characters
//
// The first three alternatives share one pair of lookarounds which demand
// that the run is not adjacent to another letter or number. `\b` is not
// used for that purpose because it is defined in terms of `\w` (ASCII word
// characters) and therefore cuts through words made of non-ASCII letters.
// `rest` excludes `\s` in addition to `\p{Z}`, since tab and line-break
// characters are not part of the Unicode separator category.
const regXWordDigitRestTokens =
  /(?<![\p{L}\p{N}])(?:(?<digit>\p{N}+)|(?<longWord>\p{L}{4,})|(?<shortWord>\p{L}{1,3}))(?![\p{L}\p{N}])|(?<rest>[^\s\p{Z}]+)/gmu;
// Collect all token matches of the sample sentence, then let the reducer
// sort each match into its category list.
const tokenMatches = [
  ...'There are usually 100 to 200 words ++ -- ** in a paragraph.'
    .matchAll(regXWordDigitRestTokens),
];
const categorizedMatches = tokenMatches.reduce(collectWordsDigitsAndRest, {});

// Expose the lists under the requested names; a category which never
// received a match falls back to an empty array.
const { digits: numbers = [] } = categorizedMatches;
const { rest: symbols = [] } = categorizedMatches;
const { shortWords: words = [] } = categorizedMatches;
const { longWords: wordsMoreThanThreeLetters = [] } = categorizedMatches;

console.log({
  wordsMoreThanThreeLetters,
  words,
  numbers,
  symbols,
});
.as-console-wrapper { min-height: 100%!important; top: 0; }
您正在尝试做的事情叫做词法分析(tokenization),通常使用正则表达式来完成:为每一种需要识别的标记编写一个正则表达式。每个标记都被空白字符包围。空白字符与单词之间的位置称为单词边界,可以用 \b 匹配。
下面的正则表达式使用了 Unicode 字符类。符号不是单词,所以它们没有单词边界。
- 三个或更少字母的单词:
\b\p{Letter}{1,3}\b
。
- 超过三个字母的单词:
\b\p{Letter}{4,}\b
。
- 数字:
\b\p{Number}+\b
- 符号:
\p{Symbol}+
为了解析不同的标记,将正则表达式放入命名的捕获组中很有用,例如:(?<anything>.*)
。这将匹配任何内容,并将匹配项存储在名为 anything 的捕获组中。
const input = 'There are usually 100 to 200 words + in a paragraph';

// One named capture group per token category, joined into a single
// alternation. `String.raw` keeps the backslashes intact: inside an
// ordinary string literal `\b` would turn into a backspace character and
// `\p` into a plain `p`, which makes the `u`-flagged pattern invalid.
// NOTE: `\b` is defined in terms of `\w`, so words written with non-ASCII
// letters are not delimited reliably by this pattern.
const rx = new RegExp(
  [
    String.raw`(?<wle3>\b\p{L}{1,3}\b)`, // words with up to three letters
    String.raw`(?<wgt3>\b\p{L}{4,}\b)`, // words with more than three letters
    String.raw`(?<n>\b\p{N}+\b)`, // numbers
    String.raw`(?<s>\p{S}+)`, // symbols
  ].join('|'),
  'gmu',
);

const words_le_3 = [];
const words_gt_3 = [];
const numbers = [];
const symbols = [];

// Exactly one group participates in each match; store its value in the
// list of the corresponding category.
for (const match of input.matchAll(rx)) {
  const g = match.groups;
  if (g.wle3 !== undefined) {
    words_le_3.push(g.wle3);
  } else if (g.wgt3 !== undefined) {
    words_gt_3.push(g.wgt3);
  } else if (g.n !== undefined) {
    numbers.push(g.n);
  } else if (g.s !== undefined) {
    symbols.push(g.s);
  }
}

console.log(`Words with up to three letters: ${words_le_3}`);
console.log(`Words with more than three letters: ${words_gt_3}`);
console.log(`Numbers: ${numbers}`);
console.log(`Symbols: ${symbols}`);
如果将匹配项存储在对象而不是四个顶级数组中,代码会更简单。在这种情况下,switch 语句可以替换为对组的循环和赋值。
如何从一个字符串中提取符号、数字、最多3个和至少4个字母的单词,并将它们分别存储到相应的分类数组中?
给定的字符串是:
// Sample sentence to categorize: it mixes words, the numbers 100 and 200, and a `+` symbol.
const string = 'There are usually 100 to 200 words + in a paragraph';
预期的响应是:
// Expected result of categorizing the sample sentence.
const numbers = ['200', '100'];
// Words with at least four letters. Entries are bare tokens without
// surrounding whitespace (the input contains `words`, not `words `).
const wordsMoreThanThreeLetters = ['There', 'words', 'paragraph', 'usually'];
const symbols = ['+'];
// Words with at most three letters.
const words = ['are', 'to', 'in', 'a'];
const string = 'There are usually 100 to 200 words + in a paragraph';

// Break the sentence into its individual characters (one array entry per
// UTF-16 code unit), which is all the original attempt achieves.
const response = string.split('');

console.log(response);
您可以为这些情况编写一个单独的函数:
const txt = 'There are usually 100 to 200 words in a paragraph';
console.log(txt);
console.log(ctrim(txt));

/**
 * Collects the whitespace-separated tokens of a text that are at most
 * `maxLength` characters long.
 *
 * Note that the check is purely length-based: short numbers such as `100`
 * are returned alongside short words.
 *
 * @param {string} txt - The text to scan.
 * @param {number} [maxLength=3] - Longest token length that is still kept.
 * @returns {string[]} The short tokens in their order of appearance.
 */
function ctrim(txt, maxLength = 3) {
  return txt
    // Split at whitespace *runs*; splitting at a single space would emit
    // empty tokens for repeated spaces.
    .split(/\s+/)
    // Leading/trailing whitespace still yields one empty token at either
    // end, which must not be reported as a "short word".
    .filter((token) => token.length > 0 && token.length <= maxLength);
}
一个有效的方法是先用 split 方法在任意空白字符序列处拆分字符串,
然后在 split 方法的结果数组上使用 reduce 方法。
reducer 函数的实现方式是根据 OP 的类别收集和聚合特定数组中的字符串项(令牌),并由辅助方法支持,例如数字和单词测试 ...
/**
 * Tests whether a value consists of digits (`\d`) only.
 *
 * @param {string} value
 * @returns {boolean}
 */
function isDigitsOnly(value) {
  return /^\d+$/.test(value);
}

/**
 * Tests whether a value consists of word characters (`\w`) only.
 *
 * @param {string} value
 * @returns {boolean}
 */
function isWord(value) {
  return /^\w+$/.test(value);
}

/**
 * Reducer which sorts a single token into one of the collector's lists:
 * `digits`, `shortWords` (at most 3 characters), `longWords` or `rest`.
 * A list is created the first time a token of its category shows up.
 *
 * @param {Object<string, string[]>} collector - Accumulator holding the lists.
 * @param {string} token - One whitespace-delimited token.
 * @returns {Object<string, string[]>} The same `collector` reference.
 */
function collectWordsDigitsAndRest(collector, token) {
  // `split(/\s+/)` emits empty strings for leading/trailing whitespace;
  // those are no tokens and must not end up in the `rest` list.
  if (token === '') {
    return collector;
  }

  let listName = 'rest';
  if (isDigitsOnly(token)) {
    listName = 'digits';
  } else if (isWord(token)) {
    listName = token.length <= 3 ? 'shortWords' : 'longWords';
  }

  (collector[listName] ??= []).push(token);
  return collector;
}
// Split the sample sentence at whitespace runs and let the reducer sort
// every token into its category list.
const categorizedTokens = 'There are usually 100 to 200 words + in a paragraph'
  .split(/\s+/)
  .reduce(collectWordsDigitsAndRest, {});

// Expose the lists under the requested names; a category which never
// received a token falls back to an empty array.
const { digits: numbers = [] } = categorizedTokens;
const { rest: symbols = [] } = categorizedTokens;
const { shortWords: words = [] } = categorizedTokens;
const { longWords: wordsMoreThanThreeLetters = [] } = categorizedTokens;

console.log({
  wordsMoreThanThreeLetters,
  words,
  numbers,
  symbols,
});
.as-console-wrapper { min-height: 100%!important; top: 0; }
当然也可以使用 matchAll,通过单个正则表达式(RegExp)一次性匹配所需的令牌;
该正则表达式使用命名捕获组,并使用 Unicode 属性转义来实现更好的国际化(i18n)覆盖。
正则表达式本身的外观和工作方式如下...
...派生自...
第一种方法的 reducer 函数必须适应第二种方法,以便相应地处理每个捕获的组...
/**
 * Reducer for regex match results: looks up which named capture group
 * (`shortWord`, `longWord`, `digit`, `rest` - checked in that order) holds
 * a value and appends that value to the corresponding collector list.
 * Matches without any captured group leave the collector untouched.
 *
 * @param {Object<string, string[]>} collector - Accumulator holding the lists.
 * @param {{ groups: Object<string, (string|undefined)> }} match - A match
 *   result carrying named capture groups.
 * @returns {Object<string, string[]>} The same `collector` reference.
 */
function collectWordsDigitsAndRest(collector, { groups }) {
  // Pairs of [capture group name, collector list name] in priority order.
  const groupToList = [
    ['shortWord', 'shortWords'],
    ['longWord', 'longWords'],
    ['digit', 'digits'],
    ['rest', 'rest'],
  ];
  const captured = groupToList.find(([groupName]) => groups[groupName]);

  if (captured) {
    const [groupName, listName] = captured;
    (collector[listName] ??= []).push(groups[groupName]);
  }
  return collector;
}
// Unicode Categories ... [https://www.regularexpressions.info/unicode.html#category]
// regex101.com ... [https://regex101.com/r/nCga5u/2]
//   (playground link from the original answer - presumably for the earlier
//    `\b`-based variant of this pattern; TODO confirm)
//
// Token pattern with one named capture group per category:
//   - digit     ... a run of Unicode numbers
//   - longWord  ... a run of at least four Unicode letters
//   - shortWord ... a run of one to three Unicode letters
//   - rest      ... any other run of non-whitespace characters
//
// The first three alternatives share one pair of lookarounds which demand
// that the run is not adjacent to another letter or number. `\b` is not
// used for that purpose because it is defined in terms of `\w` (ASCII word
// characters) and therefore cuts through words made of non-ASCII letters.
// `rest` excludes `\s` in addition to `\p{Z}`, since tab and line-break
// characters are not part of the Unicode separator category.
const regXWordDigitRestTokens =
  /(?<![\p{L}\p{N}])(?:(?<digit>\p{N}+)|(?<longWord>\p{L}{4,})|(?<shortWord>\p{L}{1,3}))(?![\p{L}\p{N}])|(?<rest>[^\s\p{Z}]+)/gmu;
// Collect all token matches of the sample sentence, then let the reducer
// sort each match into its category list.
const tokenMatches = [
  ...'There are usually 100 to 200 words ++ -- ** in a paragraph.'
    .matchAll(regXWordDigitRestTokens),
];
const categorizedMatches = tokenMatches.reduce(collectWordsDigitsAndRest, {});

// Expose the lists under the requested names; a category which never
// received a match falls back to an empty array.
const { digits: numbers = [] } = categorizedMatches;
const { rest: symbols = [] } = categorizedMatches;
const { shortWords: words = [] } = categorizedMatches;
const { longWords: wordsMoreThanThreeLetters = [] } = categorizedMatches;

console.log({
  wordsMoreThanThreeLetters,
  words,
  numbers,
  symbols,
});
.as-console-wrapper { min-height: 100%!important; top: 0; }
您正在尝试做的事情叫做词法分析(tokenization),通常使用正则表达式来完成:为每一种需要识别的标记编写一个正则表达式。每个标记都被空白字符包围。空白字符与单词之间的位置称为单词边界,可以用 \b 匹配。
下面的正则表达式使用了 Unicode 字符类。符号不是单词,所以它们没有单词边界。
- 三个或更少字母的单词:
\b\p{Letter}{1,3}\b
。 - 超过三个字母的单词:
\b\p{Letter}{4,}\b
。 - 数字:
\b\p{Number}+\b
- 符号:
\p{Symbol}+
为了解析不同的标记,将正则表达式放入命名的捕获组中很有用,例如:(?<anything>.*)
。这将匹配任何内容,并将匹配项存储在名为 anything 的捕获组中。
const input = 'There are usually 100 to 200 words + in a paragraph';

// One named capture group per token category, joined into a single
// alternation. `String.raw` keeps the backslashes intact: inside an
// ordinary string literal `\b` would turn into a backspace character and
// `\p` into a plain `p`, which makes the `u`-flagged pattern invalid.
// NOTE: `\b` is defined in terms of `\w`, so words written with non-ASCII
// letters are not delimited reliably by this pattern.
const rx = new RegExp(
  [
    String.raw`(?<wle3>\b\p{L}{1,3}\b)`, // words with up to three letters
    String.raw`(?<wgt3>\b\p{L}{4,}\b)`, // words with more than three letters
    String.raw`(?<n>\b\p{N}+\b)`, // numbers
    String.raw`(?<s>\p{S}+)`, // symbols
  ].join('|'),
  'gmu',
);

const words_le_3 = [];
const words_gt_3 = [];
const numbers = [];
const symbols = [];

// Exactly one group participates in each match; store its value in the
// list of the corresponding category.
for (const match of input.matchAll(rx)) {
  const g = match.groups;
  if (g.wle3 !== undefined) {
    words_le_3.push(g.wle3);
  } else if (g.wgt3 !== undefined) {
    words_gt_3.push(g.wgt3);
  } else if (g.n !== undefined) {
    numbers.push(g.n);
  } else if (g.s !== undefined) {
    symbols.push(g.s);
  }
}

console.log(`Words with up to three letters: ${words_le_3}`);
console.log(`Words with more than three letters: ${words_gt_3}`);
console.log(`Numbers: ${numbers}`);
console.log(`Symbols: ${symbols}`);
如果将匹配项存储在对象而不是四个顶级数组中,代码会更简单。在这种情况下,switch 语句可以替换为对组的循环和赋值。