解析长字符串以检索 channel_id
Parse long string to retrieve channel_id
我从 Telegram 中提取了很多数据,但无法单独分离出 channel_id。现在我有一个很长的字符串,其中除了许多其他信息外还包含 channel_id。问题是:如何删除 channel_id 以外的所有内容,即只保留 “channel_id=XXXXXXXXXX)” 中 “channel_id=” 之后的数字?
我的 data.frame 子集:
# Sample input: a single character column `channel_id` with five raw
# "MessageFwdHeader(...)" strings. The numeric id to extract appears inside
# "PeerChannel(channel_id=<digits>)" in each string.
# The object carries the classes "data.table" and "data.frame".
df <- structure(list(channel_id = c("MessageFwdHeader(date=datetime.datetime(2021, 5, 13, 20, 50, 47, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1292436059), from_name=None, channel_post=1404, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 5, 4, 9, 24, 16, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1480423705), from_name=None, channel_post=224, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 25, 14, 9, 38, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1489900933), from_name=None, channel_post=627, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 12, 22, 10, 3, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1455689590), from_name=None, channel_post=1457, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 9, 12, 52, 5, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1348575245), from_name=None, channel_post=None, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)"
)), row.names = c(NA, -5L), class = c("data.table", "data.frame"))
想要的结果
# Expected output: the same five rows, with the `channel_id` column reduced
# to just the numeric id, kept as character strings.
channel_id <- structure(list(channel_id = c("1292436059",
"1480423705",
"1489900933",
"1455689590",
"1348575245"
)), row.names = c(NA, -5L), class = c("data.table", "data.frame"))
您可以尝试 regexpr:先用 (?<=\(channel_id=) 向后查找(lookbehind)“(channel_id=”,然后用 \d+ 匹配数字,再用 (?=\)) 向前查找(lookahead)“)”,最后使用 regmatches 提取匹配项。
# Extract the run of digits located between "(channel_id=" and ")".
# Lookbehind/lookahead are PCRE features, hence perl = TRUE. Every regex
# backslash is doubled because the R string literal consumes one level of
# escaping ("\(" or "\d" on its own is an invalid escape and fails to parse).
# Note: regmatches() returns only the matched pieces, so elements without a
# match are dropped from the result.
regmatches(
  df$channel_id,
  regexpr("(?<=\\(channel_id=)\\d+(?=\\))", df$channel_id, perl = TRUE)
)
#[1] "1292436059" "1480423705" "1489900933" "1455689590" "1348575245"
或者组合两次 sub 调用:
# Two-step removal: the inner sub() deletes everything up to and including
# "(channel_id=", the outer sub() deletes everything from the first
# remaining ")" to the end, leaving only the digits.
# The parentheses are regex metacharacters, so they are escaped as "\\(" and
# "\\)" (doubled backslash inside an R string literal).
sub("\\).*", "", sub(".*\\(channel_id=", "", df$channel_id))
#[1] "1292436059" "1480423705" "1489900933" "1455689590" "1348575245"
我们还可以使用 stringr 包的 str_extract:
library(stringr)
library(dplyr)
# Replace the raw header column with just the digits that follow
# "channel_id=". transmute() keeps only the newly built column.
# "\\d" needs a doubled backslash inside an R string literal; "=" is not a
# regex metacharacter and needs no escape.
df %>%
  transmute(channel_id = str_extract(channel_id, "(?<=channel_id=)\\d+"))
channel_id
1: 1292436059
2: 1480423705
3: 1489900933
4: 1455689590
5: 1348575245
我从 Telegram 中提取了很多数据,但无法单独分离出 channel_id。现在我有一个很长的字符串,其中除了许多其他信息外还包含 channel_id。问题是:如何删除 channel_id 以外的所有内容,即只保留 “channel_id=XXXXXXXXXX)” 中 “channel_id=” 之后的数字?
我的 data.frame 子集:
# Sample input: a single character column `channel_id` with five raw
# "MessageFwdHeader(...)" strings. The numeric id to extract appears inside
# "PeerChannel(channel_id=<digits>)" in each string.
# The object carries the classes "data.table" and "data.frame".
df <- structure(list(channel_id = c("MessageFwdHeader(date=datetime.datetime(2021, 5, 13, 20, 50, 47, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1292436059), from_name=None, channel_post=1404, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 5, 4, 9, 24, 16, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1480423705), from_name=None, channel_post=224, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 25, 14, 9, 38, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1489900933), from_name=None, channel_post=627, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 12, 22, 10, 3, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1455689590), from_name=None, channel_post=1457, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 9, 12, 52, 5, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1348575245), from_name=None, channel_post=None, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)"
)), row.names = c(NA, -5L), class = c("data.table", "data.frame"))
想要的结果
# Expected output: the same five rows, with the `channel_id` column reduced
# to just the numeric id, kept as character strings.
channel_id <- structure(list(channel_id = c("1292436059",
"1480423705",
"1489900933",
"1455689590",
"1348575245"
)), row.names = c(NA, -5L), class = c("data.table", "data.frame"))
您可以尝试 regexpr:先用 (?<=\(channel_id=) 向后查找(lookbehind)“(channel_id=”,然后用 \d+ 匹配数字,再用 (?=\)) 向前查找(lookahead)“)”,最后使用 regmatches 提取匹配项。
# Extract the run of digits located between "(channel_id=" and ")".
# Lookbehind/lookahead are PCRE features, hence perl = TRUE. Every regex
# backslash is doubled because the R string literal consumes one level of
# escaping ("\(" or "\d" on its own is an invalid escape and fails to parse).
# Note: regmatches() returns only the matched pieces, so elements without a
# match are dropped from the result.
regmatches(
  df$channel_id,
  regexpr("(?<=\\(channel_id=)\\d+(?=\\))", df$channel_id, perl = TRUE)
)
#[1] "1292436059" "1480423705" "1489900933" "1455689590" "1348575245"
或者组合两次 sub 调用:
# Two-step removal: the inner sub() deletes everything up to and including
# "(channel_id=", the outer sub() deletes everything from the first
# remaining ")" to the end, leaving only the digits.
# The parentheses are regex metacharacters, so they are escaped as "\\(" and
# "\\)" (doubled backslash inside an R string literal).
sub("\\).*", "", sub(".*\\(channel_id=", "", df$channel_id))
#[1] "1292436059" "1480423705" "1489900933" "1455689590" "1348575245"
我们还可以使用 stringr 包的 str_extract:
library(stringr)
library(dplyr)
# Replace the raw header column with just the digits that follow
# "channel_id=". transmute() keeps only the newly built column.
# "\\d" needs a doubled backslash inside an R string literal; "=" is not a
# regex metacharacter and needs no escape.
df %>%
  transmute(channel_id = str_extract(channel_id, "(?<=channel_id=)\\d+"))
channel_id
1: 1292436059
2: 1480423705
3: 1489900933
4: 1455689590
5: 1348575245