PCRE 库的 RegEx 问题
RegEx issue with PCRE library
在 Linux 下使用标准正则表达式库以及 ANSI C 中的 PCRE 库进行了尝试:
需要捕获括号中的内容,在同一个字符串中多次捕获,但我只能捕获到第一个或者它匹配整行(非贪婪匹配)。
src [] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\""
所以我想要的结果是得到 3 个子字符串:
- device 1
- device 2
- device 3
/*
 * Compile the pattern with PCRE, run it once against src, and print every
 * substring whose offsets pcre_exec() stored in ovector.
 *
 * pcre_exec() is called a single time, so at most one match of the pattern
 * (entry 0 plus its capture groups) is printed.
 *
 * Returns 0 when the pattern matched; 1 on a compile error, when nothing
 * matched, or on a matching error.
 */
int main(int argc, char *argv[]) {
    pcre *re;
    const char *error;
    int erroffset;
    int ovector[OVECCOUNT];
    int rc, i;
    /* Every double quote inside the literal is escaped; the pieces are joined
     * by adjacent-string-literal concatenation. */
    char src[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
                 " device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
                 " device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";
    char pattern[] = ".+device=\"(.+(?R))\".+";

    (void)argc;
    (void)argv;

    re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
    if (re == NULL) {
        printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
        return 1;
    }

    rc = pcre_exec(re, NULL, src, (int)strlen(src), 0, 0, ovector, OVECCOUNT);
    if (rc < 0) {
        if (rc == PCRE_ERROR_NOMATCH)
            printf("Sorry, no match ...\n");
        else
            printf("Matching error %d\n", rc);
        /* Release the compiled pattern with the library's own deallocator. */
        pcre_free(re);
        return 1;
    }

    printf("\nOK, has matched ...\n\n");
    /* ovector[2*i] and ovector[2*i+1] are the start and end offsets of entry i. */
    for (i = 0; i < rc; i++) {
        char *substring_start = src + ovector[2*i];
        int substring_length = ovector[2*i+1] - ovector[2*i];
        printf("%2d: %.*s\n", i, substring_length, substring_start);
    }

    pcre_free(re);
    return 0;
}
网络上的正则表达式'testers'可以设置全局标志,这似乎有效,但在 PCRE 中不可用。我能做什么?
理想情况下,我更愿意使用标准 regex.h 库,但如果需要,PCRE 也可以。
标准 POSIX 扩展正则表达式 (^|[\t\v\f\r ])device="([^"]*)" 工作得很好。此时,第零个匹配是整个匹配,第一个匹配是 device= 之前的空白字符(如果从行首开始则为空字符串),第二个匹配是设备名称的内容:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* Sample input: three records on one line, each made of key="value" pairs. */
const char data[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
                    " device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
                    " device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";

/*
 * POSIX extended regular expression.
 *   group 1: start of line, or the single whitespace character before "device="
 *   group 2: the text between the double quotes
 */
const char pattern[] = "(^|[\t\v\f\r ])device=\"([^\"]*)\"";

/*
 * Print every non-empty double-quoted device value found in data.
 *
 * The expression is applied repeatedly; each search resumes where the
 * previous whole match (match[0]) ended and the loop stops at the first
 * failed search.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the pattern does not compile.
 */
int main(void)
{
    regex_t expression;
    regmatch_t match[3];
    size_t pos = 0;
    int err;

    err = regcomp(&expression, pattern, REG_EXTENDED);
    if (err) {
        char errbuf[1024];
        (void)regerror(err, &expression, errbuf, sizeof errbuf);
        fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
        return EXIT_FAILURE;
    }

    /* Past the first search, data + pos is no longer the beginning of the
     * line, so REG_NOTBOL keeps '^' from matching there. */
    while (regexec(&expression, data + pos, 3, match, pos > 0 ? REG_NOTBOL : 0) == 0) {
        if (match[2].rm_so >= 0 && match[2].rm_eo > match[2].rm_so) {
            /* Offsets in match[] are relative to data + pos. */
            const size_t off = pos + (size_t)match[2].rm_so;
            const size_t len = (size_t)(match[2].rm_eo - match[2].rm_so);
            char part[len + 1];

            memcpy(part, data + off, len);
            part[len] = '\0';
            printf("Matched '%s'.\n", part);
        }
        pos += (size_t)match[0].rm_eo;
    }

    regfree(&expression);
    return EXIT_SUCCESS;
}
正如 Jonathan Leffler 在对该问题的评论中提到的,匹配是在一个循环中获得的,下一个查找从上一个匹配结束的地方开始。当没有更多匹配项时,循环结束。
如果你想支持多种引号样式,可以使用类似 ^device="([^"]*)"|^device='([^']*)'|^device=([^\t\v\f\r ]*)|[\t\v\f\r ]device="([^"]*)"|[\t\v\f\r ]device='([^']*)'|[\t\v\f\r ]device=([^\t\v\f\r ]*) 的模式,并让 match[] 数组至少有七个元素。这样,match[1] 到 match[6] 中会有一个条目的 .rm_so >= 0,它标识出所需的内容:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* Sample input: three records on one line, each made of key="value" pairs. */
const char data[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
                    " device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
                    " device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";

/*
 * POSIX extended regular expression with six alternatives, one capture
 * group each: double-quoted, single-quoted and unquoted values, first
 * anchored at the start of the line and then preceded by one whitespace
 * character.
 */
const char pattern[] = "^device=\"([^\"]*)\""
                       "|" "^device='([^']*)'"
                       "|" "^device=([^\t\v\f\r ]*)"
                       "|" "[\t\v\f\r ]device=\"([^\"]*)\""
                       "|" "[\t\v\f\r ]device='([^']*)'"
                       "|" "[\t\v\f\r ]device=([^\t\v\f\r ]*)";

/*
 * Print every non-empty device value found in data, whichever of the three
 * quoting styles it uses.
 *
 * The expression is applied repeatedly; each search resumes where the
 * previous whole match (match[0]) ended and the loop stops at the first
 * failed search.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the pattern does not compile.
 */
int main(void)
{
    enum { MATCH_SLOTS = 7 };   /* whole match + six capture groups */
    regex_t expression;
    regmatch_t match[MATCH_SLOTS];
    size_t pos;
    int err, k;

    err = regcomp(&expression, pattern, REG_EXTENDED);
    if (err) {
        char errbuf[1024];
        (void)regerror(err, &expression, errbuf, sizeof errbuf);
        fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
        return EXIT_FAILURE;
    }

    /* Past the first search, data + pos is no longer the beginning of the
     * line, so REG_NOTBOL keeps '^' from matching there. */
    for (pos = 0;
         regexec(&expression, data + pos, MATCH_SLOTS, match, pos > 0 ? REG_NOTBOL : 0) == 0;
         pos += (size_t)match[0].rm_eo) {
        /* Find the capture group that took part in this match. */
        for (k = 1; k < MATCH_SLOTS; k++) {
            if (match[k].rm_so >= 0)
                break;
        }
        if (k >= MATCH_SLOTS)
            continue;

        if (match[k].rm_eo > match[k].rm_so) {
            /* Offsets in match[] are relative to data + pos. */
            const size_t off = pos + (size_t)match[k].rm_so;
            const size_t len = (size_t)(match[k].rm_eo - match[k].rm_so);
            char part[len + 1];

            memcpy(part, data + off, len);
            part[len] = '\0';
            printf("Matched '%s'.\n", part);
        }
    }

    regfree(&expression);
    return EXIT_SUCCESS;
}
不过,当 data[] 为如下内容时,此变体同样能检测到所需的内容:
device="device 1" device_name="the first device" address="192.168.1.10"
device=device2 device_name=the_second_device address=192.168.1.12
device='device 3' device_name='the third device' address='192.168.1.13'
就个人而言,我会考虑使用模式 (^|[\t\v\f\r ])([A-Za-z0-9][-_A-Za-z0-9]*)=("[^"]*"|'[^']*'|[^\t\v\f\r ]*) 进行匹配,使第零个匹配对应每一个"名称=值"对,并分别捕获名称部分和值部分(值可能带单引号或双引号)。然后可以根据名称部分,将值部分(若带引号则去掉引号)复制到动态分配的缓冲区中。
在 Linux 下使用标准正则表达式库以及 ANSI C 中的 PCRE 库进行了尝试:
需要捕获括号中的内容,在同一个字符串中多次捕获,但我只能捕获到第一个或者它匹配整行(非贪婪匹配)。
src [] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\""
所以我想要的结果是得到 3 个子字符串:
- device 1
- device 2
- device 3
/*
 * Compile the pattern with PCRE, run it once against src, and print every
 * substring whose offsets pcre_exec() stored in ovector.
 *
 * pcre_exec() is called a single time, so at most one match of the pattern
 * (entry 0 plus its capture groups) is printed.
 *
 * Returns 0 when the pattern matched; 1 on a compile error, when nothing
 * matched, or on a matching error.
 */
int main(int argc, char *argv[]) {
    pcre *re;
    const char *error;
    int erroffset;
    int ovector[OVECCOUNT];
    int rc, i;
    /* Every double quote inside the literal is escaped; the pieces are joined
     * by adjacent-string-literal concatenation. */
    char src[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
                 " device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
                 " device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";
    char pattern[] = ".+device=\"(.+(?R))\".+";

    (void)argc;
    (void)argv;

    re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
    if (re == NULL) {
        printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
        return 1;
    }

    rc = pcre_exec(re, NULL, src, (int)strlen(src), 0, 0, ovector, OVECCOUNT);
    if (rc < 0) {
        if (rc == PCRE_ERROR_NOMATCH)
            printf("Sorry, no match ...\n");
        else
            printf("Matching error %d\n", rc);
        /* Release the compiled pattern with the library's own deallocator. */
        pcre_free(re);
        return 1;
    }

    printf("\nOK, has matched ...\n\n");
    /* ovector[2*i] and ovector[2*i+1] are the start and end offsets of entry i. */
    for (i = 0; i < rc; i++) {
        char *substring_start = src + ovector[2*i];
        int substring_length = ovector[2*i+1] - ovector[2*i];
        printf("%2d: %.*s\n", i, substring_length, substring_start);
    }

    pcre_free(re);
    return 0;
}
网络上的正则表达式'testers'可以设置全局标志,这似乎有效,但在 PCRE 中不可用。我能做什么?
理想情况下,我更愿意使用标准 regex.h 库,但如果需要,PCRE 也可以。
标准 POSIX 扩展正则表达式 (^|[\t\v\f\r ])device="([^"]*)" 工作得很好。此时,第零个匹配是整个匹配,第一个匹配是 device= 之前的空白字符(如果从行首开始则为空字符串),第二个匹配是设备名称的内容:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* Sample input: three records on one line, each made of key="value" pairs. */
const char data[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
                    " device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
                    " device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";

/*
 * POSIX extended regular expression.
 *   group 1: start of line, or the single whitespace character before "device="
 *   group 2: the text between the double quotes
 */
const char pattern[] = "(^|[\t\v\f\r ])device=\"([^\"]*)\"";

/*
 * Print every non-empty double-quoted device value found in data.
 *
 * The expression is applied repeatedly; each search resumes where the
 * previous whole match (match[0]) ended and the loop stops at the first
 * failed search.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the pattern does not compile.
 */
int main(void)
{
    regex_t expression;
    regmatch_t match[3];
    size_t pos = 0;
    int err;

    err = regcomp(&expression, pattern, REG_EXTENDED);
    if (err) {
        char errbuf[1024];
        (void)regerror(err, &expression, errbuf, sizeof errbuf);
        fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
        return EXIT_FAILURE;
    }

    /* Past the first search, data + pos is no longer the beginning of the
     * line, so REG_NOTBOL keeps '^' from matching there. */
    while (regexec(&expression, data + pos, 3, match, pos > 0 ? REG_NOTBOL : 0) == 0) {
        if (match[2].rm_so >= 0 && match[2].rm_eo > match[2].rm_so) {
            /* Offsets in match[] are relative to data + pos. */
            const size_t off = pos + (size_t)match[2].rm_so;
            const size_t len = (size_t)(match[2].rm_eo - match[2].rm_so);
            char part[len + 1];

            memcpy(part, data + off, len);
            part[len] = '\0';
            printf("Matched '%s'.\n", part);
        }
        pos += (size_t)match[0].rm_eo;
    }

    regfree(&expression);
    return EXIT_SUCCESS;
}
正如 Jonathan Leffler 在对该问题的评论中提到的,匹配是在一个循环中获得的,下一个查找从上一个匹配结束的地方开始。当没有更多匹配项时,循环结束。
如果你想支持多种引号样式,可以使用类似 ^device="([^"]*)"|^device='([^']*)'|^device=([^\t\v\f\r ]*)|[\t\v\f\r ]device="([^"]*)"|[\t\v\f\r ]device='([^']*)'|[\t\v\f\r ]device=([^\t\v\f\r ]*) 的模式,并让 match[] 数组至少有七个元素。这样,match[1] 到 match[6] 中会有一个条目的 .rm_so >= 0,它标识出所需的内容:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* Sample input: three records on one line, each made of key="value" pairs. */
const char data[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
                    " device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
                    " device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";

/*
 * POSIX extended regular expression with six alternatives, one capture
 * group each: double-quoted, single-quoted and unquoted values, first
 * anchored at the start of the line and then preceded by one whitespace
 * character.
 */
const char pattern[] = "^device=\"([^\"]*)\""
                       "|" "^device='([^']*)'"
                       "|" "^device=([^\t\v\f\r ]*)"
                       "|" "[\t\v\f\r ]device=\"([^\"]*)\""
                       "|" "[\t\v\f\r ]device='([^']*)'"
                       "|" "[\t\v\f\r ]device=([^\t\v\f\r ]*)";

/*
 * Print every non-empty device value found in data, whichever of the three
 * quoting styles it uses.
 *
 * The expression is applied repeatedly; each search resumes where the
 * previous whole match (match[0]) ended and the loop stops at the first
 * failed search.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the pattern does not compile.
 */
int main(void)
{
    enum { MATCH_SLOTS = 7 };   /* whole match + six capture groups */
    regex_t expression;
    regmatch_t match[MATCH_SLOTS];
    size_t pos;
    int err, k;

    err = regcomp(&expression, pattern, REG_EXTENDED);
    if (err) {
        char errbuf[1024];
        (void)regerror(err, &expression, errbuf, sizeof errbuf);
        fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
        return EXIT_FAILURE;
    }

    /* Past the first search, data + pos is no longer the beginning of the
     * line, so REG_NOTBOL keeps '^' from matching there. */
    for (pos = 0;
         regexec(&expression, data + pos, MATCH_SLOTS, match, pos > 0 ? REG_NOTBOL : 0) == 0;
         pos += (size_t)match[0].rm_eo) {
        /* Find the capture group that took part in this match. */
        for (k = 1; k < MATCH_SLOTS; k++) {
            if (match[k].rm_so >= 0)
                break;
        }
        if (k >= MATCH_SLOTS)
            continue;

        if (match[k].rm_eo > match[k].rm_so) {
            /* Offsets in match[] are relative to data + pos. */
            const size_t off = pos + (size_t)match[k].rm_so;
            const size_t len = (size_t)(match[k].rm_eo - match[k].rm_so);
            char part[len + 1];

            memcpy(part, data + off, len);
            part[len] = '\0';
            printf("Matched '%s'.\n", part);
        }
    }

    regfree(&expression);
    return EXIT_SUCCESS;
}
不过,当 data[] 为如下内容时,此变体同样能检测到所需的内容:
device="device 1" device_name="the first device" address="192.168.1.10"
device=device2 device_name=the_second_device address=192.168.1.12
device='device 3' device_name='the third device' address='192.168.1.13'
就个人而言,我会考虑使用模式 (^|[\t\v\f\r ])([A-Za-z0-9][-_A-Za-z0-9]*)=("[^"]*"|'[^']*'|[^\t\v\f\r ]*) 进行匹配,使第零个匹配对应每一个"名称=值"对,并分别捕获名称部分和值部分(值可能带单引号或双引号)。然后可以根据名称部分,将值部分(若带引号则去掉引号)复制到动态分配的缓冲区中。