如何使用 C 正则表达式提取子匹配项
How can I extract submatches using C regex
大约两年前,我在自己编写的大量文本处理算法中使用过 GNU 正则表达式库,当时它的行为与文档描述完全一致;但遗憾的是那个平台已经不存在了,我无法确定它当时的版本比下面列出的版本更旧还是更新。
代码如下:
// GNU libc version: 2.28
// gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
#include <stdio.h>
#include <regex.h>
/*
 * Reproduction case from the question: compiles the basic regular expression
 * \(.\)ave, runs it against "dave" and prints the offsets of the whole match
 * and of the first capture group.
 *
 * Returns 0 after the match attempt, or the non-zero regcomp() error code.
 */
int main() {
    regex_t preg;
    char str[] = "dave";
    // Backslashes are doubled so the regex library receives \(.\)ave;
    // a lone "\(" is not a valid C escape sequence.
    char regex[] = "\\(.\\)ave";
    // flag REG_EXTENDED with unescaped parens in the r.e. doesn't fix anything
    int ret, cflags = REG_ICASE;
    // the elements of unused pmatches used to be set to -1 by regexec, but no longer. a clue perhaps.
    regmatch_t pmatch[2] = {{-1,-1},{-1,-1}};
    ret = regcomp(&preg, regex, cflags);
    if (ret) {
        puts("regcomp fail");
        return ret;
    }
    else
        // preg.re_nsub contains the correct number of groups that regcomp recognized in the r.e. Tests succeeded for 0, 1, 2, and 3 groups.
        printf("regcomp ok; re_nsub=%zu\n", preg.re_nsub);
    // nmatch is 1 here, so regexec() reports only pmatch[0]; pmatch[1] keeps
    // the {-1,-1} it was initialized with (the behaviour being asked about).
    ret = regexec(&preg, str, 1, pmatch, 0);
    if(ret)
        puts("no match");
    else {
        printf("match offsets are %d %d\n", pmatch[0].rm_so, pmatch[0].rm_eo);
        printf("match[0]=%*s<\n", pmatch[0].rm_eo, &str[pmatch[0].rm_so]);
        printf("submatch offsets are %d %d\n", pmatch[1].rm_so, pmatch[1].rm_eo);
        if(pmatch[1].rm_so != -1)
            printf("match[1]=%*s<\n", pmatch[1].rm_eo, &str[pmatch[1].rm_so]);
    }
    return 0;
}
/*
output:
regcomp ok; re_nsub=1
match offsets are 0 4
match[0]=dave<
submatch offsets are -1 -1
*/
您没有得到第一个捕获组偏移量的原因是:您把 1 作为第三个参数 size_t __nmatch 传给了 regexec。
这个 1 应改为 2,因为每当正则表达式 \(.\)ave 匹配成功时都会产生两个组:第 0 组保存整个匹配,第 1 组保存第一个捕获组的值。
因此,您需要使用:
ret = regexec(&preg, str, 2, pmatch, 0);
// The third argument (nmatch) is now 2: pmatch[0] for the whole match, pmatch[1] for the first capture group.
此外,要打印第 1 组值,您可以使用
// The precision of %.*s is a character count, so pass the group's length
// (rm_eo - rm_so) rather than its end offset; the two only coincide when rm_so is 0.
if(pmatch[1].rm_so != -1) {
printf("match[1]=%.*s<\n", (int)(pmatch[1].rm_eo - pmatch[1].rm_so), &str[pmatch[1].rm_so]);
}
参见下面这个 C 示例:
#include <stdio.h>
#include <regex.h>
#include <string.h>
/*
 * Corrected demo: compiles the basic regular expression \(.\)ave, matches it
 * against "dave" and prints the whole match (group 0) and the first capture
 * group (group 1) together with their offsets.
 *
 * Returns 0 after the match attempt, or the non-zero regcomp() error code.
 */
int main() {
    regex_t preg;
    char str[] = "dave";
    // Backslashes are doubled so the regex library receives \(.\)ave;
    // a lone "\(" is not a valid C escape sequence.
    char regex[] = "\\(.\\)ave";
    int ret, cflags = REG_ICASE;
    // Pre-filled with -1 so an entry regexec() does not report stays recognisable.
    regmatch_t pmatch[2] = {{-1,-1},{-1,-1}};

    ret = regcomp(&preg, regex, cflags);
    if (ret) {
        puts("regcomp fail");
        return ret;
    }
    // re_nsub is the number of parenthesized groups regcomp() found in the pattern.
    printf("regcomp ok; re_nsub=%zu\n", preg.re_nsub);

    // nmatch is 2: one slot for group 0 (whole match) and one for group 1
    // (the first capturing group).
    ret = regexec(&preg, str, 2, pmatch, 0);
    if (ret) {
        puts("no match");
    } else {
        // regoff_t is cast to int so the arguments match %d and the '*' precision.
        printf("match offsets are %d %d\n", (int)pmatch[0].rm_so, (int)pmatch[0].rm_eo);
        // %.*s prints exactly (rm_eo - rm_so) characters starting at rm_so.
        printf("match[0]=%.*s<\n", (int)(pmatch[0].rm_eo - pmatch[0].rm_so), &str[pmatch[0].rm_so]);
        printf("submatch offsets are %d %d\n", (int)pmatch[1].rm_so, (int)pmatch[1].rm_eo);
        if (pmatch[1].rm_so != -1) {
            printf("match[1]=%.*s<\n", (int)(pmatch[1].rm_eo - pmatch[1].rm_so), &str[pmatch[1].rm_so]);
        }
    }

    // Release the storage regcomp() attached to the compiled pattern.
    regfree(&preg);
    return 0;
}
/*
regcomp ok; re_nsub=1
match offsets are 0 4
match[0]=dave<
submatch offsets are 0 1
match[1]=d<
*/
大约两年前,我在自己编写的大量文本处理算法中使用过 GNU 正则表达式库,当时它的行为与文档描述完全一致;但遗憾的是那个平台已经不存在了,我无法确定它当时的版本比下面列出的版本更旧还是更新。
代码如下:
// GNU libc version: 2.28
// gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
#include <stdio.h>
#include <regex.h>
/*
 * Reproduction case from the question: compiles the basic regular expression
 * \(.\)ave, runs it against "dave" and prints the offsets of the whole match
 * and of the first capture group.
 *
 * Returns 0 after the match attempt, or the non-zero regcomp() error code.
 */
int main() {
    regex_t preg;
    char str[] = "dave";
    // Backslashes are doubled so the regex library receives \(.\)ave;
    // a lone "\(" is not a valid C escape sequence.
    char regex[] = "\\(.\\)ave";
    // flag REG_EXTENDED with unescaped parens in the r.e. doesn't fix anything
    int ret, cflags = REG_ICASE;
    // the elements of unused pmatches used to be set to -1 by regexec, but no longer. a clue perhaps.
    regmatch_t pmatch[2] = {{-1,-1},{-1,-1}};
    ret = regcomp(&preg, regex, cflags);
    if (ret) {
        puts("regcomp fail");
        return ret;
    }
    else
        // preg.re_nsub contains the correct number of groups that regcomp recognized in the r.e. Tests succeeded for 0, 1, 2, and 3 groups.
        printf("regcomp ok; re_nsub=%zu\n", preg.re_nsub);
    // nmatch is 1 here, so regexec() reports only pmatch[0]; pmatch[1] keeps
    // the {-1,-1} it was initialized with (the behaviour being asked about).
    ret = regexec(&preg, str, 1, pmatch, 0);
    if(ret)
        puts("no match");
    else {
        printf("match offsets are %d %d\n", pmatch[0].rm_so, pmatch[0].rm_eo);
        printf("match[0]=%*s<\n", pmatch[0].rm_eo, &str[pmatch[0].rm_so]);
        printf("submatch offsets are %d %d\n", pmatch[1].rm_so, pmatch[1].rm_eo);
        if(pmatch[1].rm_so != -1)
            printf("match[1]=%*s<\n", pmatch[1].rm_eo, &str[pmatch[1].rm_so]);
    }
    return 0;
}
/*
output:
regcomp ok; re_nsub=1
match offsets are 0 4
match[0]=dave<
submatch offsets are -1 -1
*/
您没有得到第一个捕获组偏移量的原因是:您把 1 作为第三个参数 size_t __nmatch 传给了 regexec。
这个 1 应改为 2,因为每当正则表达式 \(.\)ave 匹配成功时都会产生两个组:第 0 组保存整个匹配,第 1 组保存第一个捕获组的值。
因此,您需要使用:
ret = regexec(&preg, str, 2, pmatch, 0);
// The third argument (nmatch) is now 2: pmatch[0] for the whole match, pmatch[1] for the first capture group.
此外,要打印第 1 组值,您可以使用
// The precision of %.*s is a character count, so pass the group's length
// (rm_eo - rm_so) rather than its end offset; the two only coincide when rm_so is 0.
if(pmatch[1].rm_so != -1) {
printf("match[1]=%.*s<\n", (int)(pmatch[1].rm_eo - pmatch[1].rm_so), &str[pmatch[1].rm_so]);
}
参见下面这个 C 示例:
#include <stdio.h>
#include <regex.h>
#include <string.h>
/*
 * Corrected demo: compiles the basic regular expression \(.\)ave, matches it
 * against "dave" and prints the whole match (group 0) and the first capture
 * group (group 1) together with their offsets.
 *
 * Returns 0 after the match attempt, or the non-zero regcomp() error code.
 */
int main() {
    regex_t preg;
    char str[] = "dave";
    // Backslashes are doubled so the regex library receives \(.\)ave;
    // a lone "\(" is not a valid C escape sequence.
    char regex[] = "\\(.\\)ave";
    int ret, cflags = REG_ICASE;
    // Pre-filled with -1 so an entry regexec() does not report stays recognisable.
    regmatch_t pmatch[2] = {{-1,-1},{-1,-1}};

    ret = regcomp(&preg, regex, cflags);
    if (ret) {
        puts("regcomp fail");
        return ret;
    }
    // re_nsub is the number of parenthesized groups regcomp() found in the pattern.
    printf("regcomp ok; re_nsub=%zu\n", preg.re_nsub);

    // nmatch is 2: one slot for group 0 (whole match) and one for group 1
    // (the first capturing group).
    ret = regexec(&preg, str, 2, pmatch, 0);
    if (ret) {
        puts("no match");
    } else {
        // regoff_t is cast to int so the arguments match %d and the '*' precision.
        printf("match offsets are %d %d\n", (int)pmatch[0].rm_so, (int)pmatch[0].rm_eo);
        // %.*s prints exactly (rm_eo - rm_so) characters starting at rm_so.
        printf("match[0]=%.*s<\n", (int)(pmatch[0].rm_eo - pmatch[0].rm_so), &str[pmatch[0].rm_so]);
        printf("submatch offsets are %d %d\n", (int)pmatch[1].rm_so, (int)pmatch[1].rm_eo);
        if (pmatch[1].rm_so != -1) {
            printf("match[1]=%.*s<\n", (int)(pmatch[1].rm_eo - pmatch[1].rm_so), &str[pmatch[1].rm_so]);
        }
    }

    // Release the storage regcomp() attached to the compiled pattern.
    regfree(&preg);
    return 0;
}
/*
regcomp ok; re_nsub=1
match offsets are 0 4
match[0]=dave<
submatch offsets are 0 1
match[1]=d<
*/