提取字符串中正则表达式的所有匹配组

Extract all the matched groups of a regular expression in a string

有没有办法根据正则表达式,在一次调用中提取字符串里所有匹配的子组?

我有一个这样的日期:

Thu, 07 Apr 2022 15:03:32 GMT

我创建了以下正则表达式来提取该日期的所有部分:

(* Compiled Str pattern for dates shaped like "Thu, 07 Apr 2022 15:03:32 GMT".
   Groups 1 to 7 capture, in order: weekday name, day, month name, year,
   hour, minute and second. The final dot-star accepts anything after the
   seconds. *)
let re =
  {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  |> Str.regexp

为了提取每个部分,我这样使用它:


(* [parse_date date] prints the seven fields of a date shaped like
   "Thu, 07 Apr 2022 15:03:32 GMT" on stderr, prefixed with "RE DATE:".

   Each field is obtained by replacing the matched text with a template
   that refers to a single capture group (backslash followed by the group
   number), so the regular expression is run once per field.

   If the pattern does not occur in [date], Str.replace_first returns its
   input unchanged, so every field is then the whole of [date]. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  (* The templates must name the group to keep; an empty template would
     replace the match with nothing and leave every field empty. *)
  let wday = Str.replace_first re {|\1|} date in
  let day = Str.replace_first re {|\2|} date in
  let mon = Str.replace_first re {|\3|} date in
  let year = Str.replace_first re {|\4|} date in
  let hour = Str.replace_first re {|\5|} date in
  let min = Str.replace_first re {|\6|} date in
  let sec = Str.replace_first re {|\7|} date in
  Format.eprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min
    sec

如果各个部分能存储在一个数组中,我就可以像这样轻松地使用它:


(* Wished-for version of [parse_date]: run the regular expression once,
   receive all capture groups as an array, and read the seven date fields
   from indices 1 to 7 before printing them on stderr.

   NOTE: [Str.match_groups] is hypothetical. As the inline comment says,
   the function does not exist, so this snippet is illustrative only and
   does not compile. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  let parts = Str.match_groups re date in (* this function doesn't exist *)
  let wday = parts.(1) in
  let day = parts.(2) in
  let mon = parts.(3) in
  let year = parts.(4) in
  let hour = parts.(5) in
  let min = parts.(6) in
  let sec = parts.(7) in
  Format.eprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min
    sec

但这似乎不存在。还有其他方法吗?还是我的解决方案是唯一可用的?

为了避免这变成一个 XY 问题,说明一下:我的实际目标是提取日期的每个部分,所以如果除了 Str 之外还有其他解决方案,我也很乐意使用。

您可以先用 Str.string_match 进行匹配,再使用 Str.matched_group 返回特定捕获组匹配到的内容:

(* [parse_date date] matches [date] against the date pattern, anchored at
   position 0, and returns a one-line summary of the seven captured fields:
   "RE DATE: " followed by weekday, day, month, year, hour, minute and
   second separated by single spaces. When the pattern does not match, the
   result is "RE DATE: Not matched".

   Neither result carries a trailing newline, so the caller decides how to
   terminate the line. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  if Str.string_match re date 0 then
    (* Str.matched_group reads the result of the most recent successful
       match and must be given the same string that was matched. *)
    let group n = Str.matched_group n date in
    let wday = group 1 in
    let day = group 2 in
    let mon = group 3 in
    let year = group 4 in
    let hour = group 5 in
    let min = group 6 in
    let sec = group 7 in
    (* Plain Printf.sprintf: the Format break hint used previously put a
       newline inside the returned string, unlike the other branch. *)
    Printf.sprintf "RE DATE: %s %s %s %s %s %s %s" wday day mon year hour min
      sec
  else "RE DATE: Not matched"

(* Demo: parse a sample date and print the summary on stdout.
   Bound with [let ()] so the compiler checks the expression really is
   unit; a wildcard binding would also accept a partial application. *)
let () = parse_date "Thu, 07 Apr 2022 15:03:32 GMT" |> print_endline

不过,Str 库非常原始。我建议改用其他正则表达式库,例如 PCRE-OCaml。它确实提供了获取匹配组数组的方法:

(* [parse_date2 date] is the pcre-ocaml counterpart of [parse_date].
   It runs the pattern once, fetches all substrings as an array and reads
   the seven date fields from indices 1 to 7. It returns "RE DATE: "
   followed by the fields separated by single spaces, or
   "RE DATE: Not matched" when Pcre.exec raises Not_found.

   Neither result carries a trailing newline. *)
let parse_date2 date =
  let rex =
    Pcre.regexp
      {|([a-zA-Z]+), ([0-9]+) ([a-zA-Z]+) ([0-9]+) ([0-9]+):([0-9]+):([0-9]+).*|}
  in
  (* Only the match itself is guarded, so a Not_found coming from anywhere
     else is not mistaken for a failed match. *)
  match Pcre.exec ~rex date with
  | exception Not_found -> "RE DATE: Not matched"
  | substrings ->
    let parts = Pcre.get_substrings substrings in
    let wday = parts.(1) in
    let day = parts.(2) in
    let mon = parts.(3) in
    let year = parts.(4) in
    let hour = parts.(5) in
    let min = parts.(6) in
    let sec = parts.(7) in
    (* Plain Printf.sprintf: the Format break hint used previously put a
       newline inside the returned string, unlike the other branch. *)
    Printf.sprintf "RE DATE: %s %s %s %s %s %s %s" wday day mon year hour min
      sec

(* Demo: parse a sample date with the Pcre version and print the summary
   on stdout. Bound with [let ()] so the compiler checks the expression
   really is unit; a wildcard binding would also accept a partial
   application. *)
let () = parse_date2 "Thu, 07 Apr 2022 15:03:32 GMT" |> print_endline

对于具有固定数量的字段和分隔符的简单格式,Scanf 可能就足够了:

(* [date s] reads a date shaped like "Thu, 07 Apr 2022 15:03:32 GMT" with
   Scanf and returns its fields as the tuple
   (day name, day, month name, year, hour, minute, second, timezone).
   Day, year, hour, minute and second are ints; the other fields are
   strings. Input that does not fit the format makes Scanf.sscanf raise
   (for instance Scanf.Scan_failure). *)
let date s =
  (* Receives the scanned fields in format order and packs them as-is. *)
  let pack wday mday mon yr hh mm ss tz = (wday, mday, mon, yr, hh, mm, ss, tz) in
  Scanf.sscanf s "%s@, %02d %s %d %d:%d:%d %s" pack

(* Sample use of [date]; for this input the fields should come back as
   ("Thu", 7, "Apr", 2022, 15, 3, 32, "GMT"). *)
let x = "Thu, 07 Apr 2022 15:03:32 GMT" |> date