提取字符串中正则表达式的所有匹配组
Extract all the matched groups of a regular expression in a string
有没有办法根据正则表达式,在一次调用中提取字符串的所有匹配子组?
我有一个这样的日期字符串:
Thu, 07 Apr 2022 15:03:32 GMT
我创建了以下正则表达式来提取该日期的所有部分:
(** Compiled [Str] pattern for dates shaped like
    ["Thu, 07 Apr 2022 15:03:32 GMT"].  It defines seven capture groups, in
    order: an alphabetic word, a number, another alphabetic word, a number,
    and three colon-separated numbers.  The trailing [.*] consumes whatever
    follows the last number. *)
let re =
  {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  |> Str.regexp
为了提取每个部分,我这样使用它:
(** [parse_date date] extracts the seven fields of a date shaped like
    ["Thu, 07 Apr 2022 15:03:32 GMT"] and prints them on standard error,
    separated by spaces and prefixed with ["RE DATE: "].

    Each field is obtained by replacing the match with a template that
    refers back to one capture group ([\1] through [\7]); the regular
    expression is therefore applied once per field. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  (* The pattern ends in [.*], so when the input matches from its first
     character the match spans the entire string and only the text of the
     referenced group remains after the replacement.  An empty template
     would erase the match and leave every field empty. *)
  let wday = Str.replace_first re {|\1|} date in
  let day = Str.replace_first re {|\2|} date in
  let mon = Str.replace_first re {|\3|} date in
  let year = Str.replace_first re {|\4|} date in
  let hour = Str.replace_first re {|\5|} date in
  let min = Str.replace_first re {|\6|} date in
  let sec = Str.replace_first re {|\7|} date in
  Format.eprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min
    sec
如果各个部分能存放在一个数组中,我就可以像这样轻松地使用它们:
(* Wished-for variant of [parse_date]: fetch all capture groups with a single
   call and read the fields out of the resulting array.  [Str.match_groups]
   is hypothetical (see the inline note below), so this snippet does not
   compile; it only illustrates the API the author would like to have. *)
let parse_date date =
let re =
Str.regexp
{|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
in
let parts = Str.match_groups re date in (* this function doesn't exist *)
(* Fields are read from index 1 upwards, one per capture group; index 0 is
   presumably meant to hold the whole match -- an assumption, since the
   function is imaginary. *)
let wday = parts.(1) in
let day = parts.(2) in
let mon = parts.(3) in
let year = parts.(4) in
let hour = parts.(5) in
let min = parts.(6) in
let sec = parts.(7) in
(* Prints the seven fields on standard error, separated by spaces. *)
Format.eprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min
sec
但这似乎不存在。还有其他方法吗?还是我的解决方案是唯一可用的?
为了避免这成为一个 XY 问题,说明一下:我的实际目标是提取日期的每个部分。因此,如果除了 Str 之外还有其他解决方案,我也很乐意采用。
您可以使用 Str.matched_group 来返回特定捕获组匹配到的内容:
(** [parse_date date] matches [date], starting at offset 0, against the
    layout of ["Thu, 07 Apr 2022 15:03:32 GMT"].

    @return ["RE DATE: "] followed by the seven captured fields separated by
    single spaces, or ["RE DATE: Not matched"] when the pattern does not
    match.  Neither result ends with a newline, so the caller decides how to
    terminate the line. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  if Str.string_match re date 0 then
    (* [Str.matched_group] reads the groups recorded by the [string_match]
       call above and must be given the same subject string. *)
    let wday = Str.matched_group 1 date in
    let day = Str.matched_group 2 date in
    let mon = Str.matched_group 3 date in
    let year = Str.matched_group 4 date in
    let hour = Str.matched_group 5 date in
    let min = Str.matched_group 6 date in
    let sec = Str.matched_group 7 date in
    (* No "@." here: inside a [sprintf] it would embed a newline in the
       returned string, which [print_endline] would then double. *)
    Printf.sprintf "RE DATE: %s %s %s %s %s %s %s" wday day mon year hour min
      sec
  else "RE DATE: Not matched"

(* Demo: print the parsed fields of a sample date on standard output. *)
let () = parse_date "Thu, 07 Apr 2022 15:03:32 GMT" |> print_endline
不过,Str 库的功能非常基础。我建议改用其他正则表达式库,例如 PCRE-OCaml。它确实提供了获取匹配组数组的方法:
(** [parse_date2 date] is the PCRE counterpart of [parse_date]: it matches
    [date] against the layout of ["Thu, 07 Apr 2022 15:03:32 GMT"] and
    fetches all capture groups at once as an array.

    @return ["RE DATE: "] followed by the seven captured fields separated by
    single spaces, or ["RE DATE: Not matched"] when [Pcre.exec] raises
    [Not_found].  Neither result ends with a newline. *)
let parse_date2 date =
  let rex =
    Pcre.regexp
      {|([a-zA-Z]+), ([0-9]+) ([a-zA-Z]+) ([0-9]+) ([0-9]+):([0-9]+):([0-9]+).*|}
  in
  match Pcre.exec ~rex date |> Pcre.get_substrings with
  | parts ->
    (* Fields are read from index 1 upwards, one per capture group. *)
    let wday = parts.(1) in
    let day = parts.(2) in
    let mon = parts.(3) in
    let year = parts.(4) in
    let hour = parts.(5) in
    let min = parts.(6) in
    let sec = parts.(7) in
    (* No "@." here: inside a [sprintf] it would embed a newline in the
       returned string, which [print_endline] would then double. *)
    Printf.sprintf "RE DATE: %s %s %s %s %s %s %s" wday day mon year hour min
      sec
  | exception Not_found -> "RE DATE: Not matched"

(* Demo: print the parsed fields of a sample date on standard output. *)
let () = parse_date2 "Thu, 07 Apr 2022 15:03:32 GMT" |> print_endline
对于具有固定数量的字段和分隔符的简单格式,Scanf 可能就足够了:
(** [date s] scans [s], expected to look like
    ["Thu, 07 Apr 2022 15:03:32 GMT"], and returns the eight scanned fields
    as a tuple [(day_name, day, month, year, h, m, s, timezone)].  The day
    name, month and timezone are strings; the other five fields are
    integers. *)
let date s =
  (* Receives the scanned fields in format order and packs them unchanged. *)
  let pack weekday dom month_name yr hh mm ss tz =
    (weekday, dom, month_name, yr, hh, mm, ss, tz)
  in
  Scanf.sscanf s "%s@, %02d %s %d %d:%d:%d %s" pack

(* Sample result for a fixed input. *)
let x = date "Thu, 07 Apr 2022 15:03:32 GMT"
有没有办法根据正则表达式,在一次调用中提取字符串的所有匹配子组?
我有一个这样的日期字符串:
Thu, 07 Apr 2022 15:03:32 GMT
我创建了以下正则表达式来提取该日期的所有部分:
(** Compiled [Str] pattern for dates shaped like
    ["Thu, 07 Apr 2022 15:03:32 GMT"].  It defines seven capture groups, in
    order: an alphabetic word, a number, another alphabetic word, a number,
    and three colon-separated numbers.  The trailing [.*] consumes whatever
    follows the last number. *)
let re =
  {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  |> Str.regexp
为了提取每个部分,我这样使用它:
(** [parse_date date] extracts the seven fields of a date shaped like
    ["Thu, 07 Apr 2022 15:03:32 GMT"] and prints them on standard error,
    separated by spaces and prefixed with ["RE DATE: "].

    Each field is obtained by replacing the match with a template that
    refers back to one capture group ([\1] through [\7]); the regular
    expression is therefore applied once per field. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  (* The pattern ends in [.*], so when the input matches from its first
     character the match spans the entire string and only the text of the
     referenced group remains after the replacement.  An empty template
     would erase the match and leave every field empty. *)
  let wday = Str.replace_first re {|\1|} date in
  let day = Str.replace_first re {|\2|} date in
  let mon = Str.replace_first re {|\3|} date in
  let year = Str.replace_first re {|\4|} date in
  let hour = Str.replace_first re {|\5|} date in
  let min = Str.replace_first re {|\6|} date in
  let sec = Str.replace_first re {|\7|} date in
  Format.eprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min
    sec
如果各个部分能存放在一个数组中,我就可以像这样轻松地使用它们:
(* Wished-for variant of [parse_date]: fetch all capture groups with a single
   call and read the fields out of the resulting array.  [Str.match_groups]
   is hypothetical (see the inline note below), so this snippet does not
   compile; it only illustrates the API the author would like to have. *)
let parse_date date =
let re =
Str.regexp
{|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
in
let parts = Str.match_groups re date in (* this function doesn't exist *)
(* Fields are read from index 1 upwards, one per capture group; index 0 is
   presumably meant to hold the whole match -- an assumption, since the
   function is imaginary. *)
let wday = parts.(1) in
let day = parts.(2) in
let mon = parts.(3) in
let year = parts.(4) in
let hour = parts.(5) in
let min = parts.(6) in
let sec = parts.(7) in
(* Prints the seven fields on standard error, separated by spaces. *)
Format.eprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min
sec
但这似乎不存在。还有其他方法吗?还是我的解决方案是唯一可用的?
为了避免这成为一个 XY 问题,说明一下:我的实际目标是提取日期的每个部分。因此,如果除了 Str 之外还有其他解决方案,我也很乐意采用。
您可以使用 Str.matched_group 来返回特定捕获组匹配到的内容:
(** [parse_date date] matches [date], starting at offset 0, against the
    layout of ["Thu, 07 Apr 2022 15:03:32 GMT"].

    @return ["RE DATE: "] followed by the seven captured fields separated by
    single spaces, or ["RE DATE: Not matched"] when the pattern does not
    match.  Neither result ends with a newline, so the caller decides how to
    terminate the line. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  if Str.string_match re date 0 then
    (* [Str.matched_group] reads the groups recorded by the [string_match]
       call above and must be given the same subject string. *)
    let wday = Str.matched_group 1 date in
    let day = Str.matched_group 2 date in
    let mon = Str.matched_group 3 date in
    let year = Str.matched_group 4 date in
    let hour = Str.matched_group 5 date in
    let min = Str.matched_group 6 date in
    let sec = Str.matched_group 7 date in
    (* No "@." here: inside a [sprintf] it would embed a newline in the
       returned string, which [print_endline] would then double. *)
    Printf.sprintf "RE DATE: %s %s %s %s %s %s %s" wday day mon year hour min
      sec
  else "RE DATE: Not matched"

(* Demo: print the parsed fields of a sample date on standard output. *)
let () = parse_date "Thu, 07 Apr 2022 15:03:32 GMT" |> print_endline
不过,Str 库的功能非常基础。我建议改用其他正则表达式库,例如 PCRE-OCaml。它确实提供了获取匹配组数组的方法:
(** [parse_date2 date] is the PCRE counterpart of [parse_date]: it matches
    [date] against the layout of ["Thu, 07 Apr 2022 15:03:32 GMT"] and
    fetches all capture groups at once as an array.

    @return ["RE DATE: "] followed by the seven captured fields separated by
    single spaces, or ["RE DATE: Not matched"] when [Pcre.exec] raises
    [Not_found].  Neither result ends with a newline. *)
let parse_date2 date =
  let rex =
    Pcre.regexp
      {|([a-zA-Z]+), ([0-9]+) ([a-zA-Z]+) ([0-9]+) ([0-9]+):([0-9]+):([0-9]+).*|}
  in
  match Pcre.exec ~rex date |> Pcre.get_substrings with
  | parts ->
    (* Fields are read from index 1 upwards, one per capture group. *)
    let wday = parts.(1) in
    let day = parts.(2) in
    let mon = parts.(3) in
    let year = parts.(4) in
    let hour = parts.(5) in
    let min = parts.(6) in
    let sec = parts.(7) in
    (* No "@." here: inside a [sprintf] it would embed a newline in the
       returned string, which [print_endline] would then double. *)
    Printf.sprintf "RE DATE: %s %s %s %s %s %s %s" wday day mon year hour min
      sec
  | exception Not_found -> "RE DATE: Not matched"

(* Demo: print the parsed fields of a sample date on standard output. *)
let () = parse_date2 "Thu, 07 Apr 2022 15:03:32 GMT" |> print_endline
对于具有固定数量的字段和分隔符的简单格式,Scanf 可能就足够了:
(** [date s] scans [s], expected to look like
    ["Thu, 07 Apr 2022 15:03:32 GMT"], and returns the eight scanned fields
    as a tuple [(day_name, day, month, year, h, m, s, timezone)].  The day
    name, month and timezone are strings; the other five fields are
    integers. *)
let date s =
  (* Receives the scanned fields in format order and packs them unchanged. *)
  let pack weekday dom month_name yr hh mm ss tz =
    (weekday, dom, month_name, yr, hh, mm, ss, tz)
  in
  Scanf.sscanf s "%s@, %02d %s %d %d:%d:%d %s" pack

(* Sample result for a fixed input. *)
let x = date "Thu, 07 Apr 2022 15:03:32 GMT"