PCRE 库的 RegEx 问题

RegEx issue with PCRE library

在 Linux 下使用标准正则表达式库以及 ANSI C 中的 PCRE 库进行了尝试:

需要捕获括号中的内容,在同一个字符串中多次捕获,但我只能捕获到第一个或者它匹配整行(非贪婪匹配)。

src    [] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";

所以我想要的结果是得到 3 个子字符串:

/*
 * Print the value of every device="..." attribute found in a sample string.
 *
 * PCRE has no "global" flag: each pcre_exec() call reports a single match.
 * To collect all matches, the call is repeated with the start offset set to
 * the end of the previous match until PCRE_ERROR_NOMATCH is returned.
 *
 * Returns 0 when at least one match was printed; 1 on a pattern compilation
 * error, a matching error, or when nothing matched.
 */
int main(int argc, char *argv[]) {
  pcre            *re;
  const char      *error;
  int             erroffset;
  int             ovector[OVECCOUNT];
  int             rc, i;
  int             offset = 0;
  int             found = 0;

  char            src    [] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";
  /* (?<!\S) only lets "device=" match at the start of the subject or right
   * after whitespace, so "device_name=" and e.g. "mydevice=" are skipped.
   * [^"]* cannot run past the closing quote, unlike a greedy ".+". */
  char            pattern   [] = "(?<!\\S)device=\"([^\"]*)\"";
  int             src_length = (int)strlen(src);

  (void)argc;
  (void)argv;

  re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
  if (re == NULL) {
    printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
    return 1;
  }

  while (offset <= src_length) {
    /* The whole subject is always passed and only the start offset moves,
     * so the lookbehind can still see the character before a candidate. */
    rc = pcre_exec(re, NULL, src, src_length, offset, 0, ovector, OVECCOUNT);
    if (rc == PCRE_ERROR_NOMATCH) {
      break;
    }
    if (rc < 0) {
      printf("Matching error %d\n", rc);
      pcre_free(re);
      return 1;
    }
    if (rc == 0) {
      /* ovector was too small for every group; the first OVECCOUNT/3
       * pairs are still valid. */
      rc = OVECCOUNT / 3;
    }

    if (found == 0) {
      printf("\nOK, has matched ...\n\n");
    }
    found++;

    /* Index 0 is the whole match, index 1 the text between the quotes. */
    for (i = 0; i < rc; i++) {
      char *substring_start = src + ovector[2*i];
      int substring_length = ovector[2*i+1] - ovector[2*i];
      printf("%2d: %.*s\n", i, substring_length, substring_start);
    }

    /* Continue after this match; step one extra byte on an empty match so
     * the loop always makes progress. */
    offset = ovector[1];
    if (ovector[1] == ovector[0]) {
      offset++;
    }
  }

  /* The compiled pattern comes from PCRE's allocator, so it is released
   * with pcre_free(), not free(). */
  pcre_free(re);

  if (found == 0) {
    printf("Sorry, no match ...\n");
    return 1;
  }

  return 0;
}

网络上的正则表达式'testers'可以设置全局标志,这似乎有效,但在 PCRE 中不可用。我能做什么?

理想情况下,我更愿意使用标准 regex.h 库,但如果需要,PCRE 也可以。

标准POSIX扩展模式(^|[\t\v\f\r ])device="([^"]*)"工作得很好。然后,第零个匹配是整个匹配,第一个匹配是device=之前的空白字符,如果从行首开始则为空字符串,第二个匹配是设备名称的内容:

#define  _POSIX_C_SOURCE  200809L
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

/* Sample input: three device records, each made of device, device_name and
 * address attributes with double-quoted values, separated by single spaces. */
const char data[] =
    "device=\"device 1\""
    " device_name=\"the first device\""
    " address=\"192.168.1.10\""
    " device=\"device 2\""
    " device_name=\"the second device\""
    " address=\"192.168.1.12\""
    " device=\"device 3\""
    " device_name=\"the third device\""
    " address=\"192.168.1.13\"";

/* POSIX extended regular expression:
 *   group 1: start of string, or one whitespace character before "device="
 *   group 2: the text between the double quotes */
const char pattern[] =
    "(^|[\t\v\f\r ])"
    "device="
    "\"([^\"]*)\"";

/*
 * Print the contents of every device="..." attribute in data[].
 *
 * regexec() reports one match per call, so the search is repeated on the
 * remainder of the string, starting where the previous match ended, until
 * no further match is found.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the pattern does not compile.
 */
int main(void)
{
    regex_t     expression;
    regmatch_t  match[3];
    int         err, i;

    err = regcomp(&expression, pattern, REG_EXTENDED);
    if (err) {
        char  errbuf[1024];
        (void)regerror(err, &expression, errbuf, sizeof errbuf);
        fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
        return EXIT_FAILURE;
    }

    /* match[0] is the whole match, match[1] the leading whitespace (or the
     * empty string at the start), match[2] the quoted value.
     * After the first match, data + i is no longer the real start of the
     * input, so REG_NOTBOL stops '^' from matching there.  The pattern
     * cannot match an empty string, so i always advances. */
    for (i = 0;
         regexec(&expression, data + i, 3, match, i ? REG_NOTBOL : 0) == 0;
         i += match[0].rm_eo) {
        /* Empty values (device="") are deliberately not reported. */
        if (match[2].rm_so >= 0 && match[2].rm_eo > match[2].rm_so) {
            const size_t  off = i + match[2].rm_so;
            const size_t  len = match[2].rm_eo - match[2].rm_so;
            /* Print straight from data[]; no temporary NUL-terminated copy
             * is needed. */
            printf("Matched '%.*s'.\n", (int)len, data + off);
        }
    }

    regfree(&expression);
    return EXIT_SUCCESS;
}

正如 Jonathan Leffler 在对该问题的评论中提到的,匹配是在一个循环中获得的,下一个查找从上一个匹配结束的地方开始。当没有更多匹配项时,循环结束。

如果你想支持多种引用样式,你可以使用类似 ^device="([^"]*)"|^device='([^']*)'|^device=([^\t\v\f\r ]*)|[\t\v\f\r ]device="([^"]*)"|[\t\v\f\r ]device='([^']*)'|[\t\v\f\r ]device=([^\t\v\f\r ]*) 的模式,此时 match[] 数组中至少需要七个元素。然后,match[1] 到 match[6] 中会有一个条目的 .rm_so >= 0,它标识出所需的内容:

#define  _POSIX_C_SOURCE  200809L
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

/* Sample input: three device records, each made of device, device_name and
 * address attributes with double-quoted values, separated by single spaces. */
const char data[] =
    "device=\"device 1\""
    " device_name=\"the first device\""
    " address=\"192.168.1.10\""
    " device=\"device 2\""
    " device_name=\"the second device\""
    " address=\"192.168.1.12\""
    " device=\"device 3\""
    " device_name=\"the third device\""
    " address=\"192.168.1.13\"";

/* POSIX extended regular expression with six alternatives, one capture
 * group each: device= at the start of the string (groups 1-3) or after one
 * whitespace character (groups 4-6), with a double-quoted, single-quoted,
 * or unquoted value respectively. */
const char pattern[] =
    "^device=\"([^\"]*)\"|"
    "^device='([^']*)'|"
    "^device=([^\t\v\f\r ]*)|"
    "[\t\v\f\r ]device=\"([^\"]*)\"|"
    "[\t\v\f\r ]device='([^']*)'|"
    "[\t\v\f\r ]device=([^\t\v\f\r ]*)";

/*
 * Print the value of every device=... attribute in data[], whether the
 * value is double-quoted, single-quoted, or unquoted.
 *
 * regexec() reports one match per call, so the search is repeated on the
 * remainder of the string, starting where the previous match ended, until
 * no further match is found.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the pattern does not compile.
 */
int main(void)
{
    enum { NMATCH = 7 };    /* whole match + one group per alternative */
    regex_t     expression;
    regmatch_t  match[NMATCH];
    int         err, i, k;

    err = regcomp(&expression, pattern, REG_EXTENDED);
    if (err) {
        char  errbuf[1024];
        (void)regerror(err, &expression, errbuf, sizeof errbuf);
        fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
        return EXIT_FAILURE;
    }

    /* After the first match, data + i is no longer the real start of the
     * input, so REG_NOTBOL stops the '^' alternatives from matching there.
     * Every alternative consumes at least "device=", so i always advances. */
    for (i = 0;
         regexec(&expression, data + i, NMATCH, match, i ? REG_NOTBOL : 0) == 0;
         i += match[0].rm_eo) {
        /* Exactly one alternative took part in the match; find its group. */
        for (k = 1; k < NMATCH; k++) {
            if (match[k].rm_so >= 0) {
                break;
            }
        }
        if (k >= NMATCH) {
            continue;
        }

        /* Empty values are deliberately not reported. */
        if (match[k].rm_eo > match[k].rm_so) {
            const size_t  off = i + match[k].rm_so;
            const size_t  len = match[k].rm_eo - match[k].rm_so;
            /* Print straight from data[]; no temporary NUL-terminated copy
             * is needed. */
            printf("Matched '%.*s'.\n", (int)len, data + off);
        }
    }

    regfree(&expression);
    return EXIT_SUCCESS;
}

但是,当 data[] 为以下内容时,此变体同样能检测到所需的内容:
device="device 1" device_name="the first device" address="192.168.1.10"
device=device2 device_name=the_second_device address=192.168.1.12
device='device 3' device_name='the third device' address='192.168.1.13'

就个人而言,我会考虑在模式 (^|[\t\v\f\r ])([A-Za-z0-9][-_A-Za-z0-9]*)=("[^"]*"|'[^']*'|[^\t\v\f\r ]*) 上进行匹配,以便第零个匹配匹配每一对,首先匹配名称部分,然后匹配值部分(可能是单引号或双引号)。根据名称部分,您可以将值部分(如果引用则省略引号)复制到动态分配的缓冲区。