从文本 R 中提取多个正则表达式
Multiple regex extract from a text R
我有以下df
# Example input: one row per id, with the free text to be parsed.
df <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "Label issues as ISS101 and ISS 201 on label 23 with x203 17",
    "issue as ISS5051 with label 01 as l018",
    "there is nothing here"
  )
)
我想从 df
中提取并创建以下数据框
id iss label ext1 ext2
1 ISS101 23 x203 17
1 ISS201 23 x203 17
2 ISS5051 01 l018 NA
3 NA NA NA NA
iss 的长度可以变化,如示例所示。"ISS" 与后面的数字之间可能有空格,也可能没有,示例中两种情况都有。
label、ext1 和 ext2 的长度是固定的。
我已经使用 stringr 和 dplyr 尝试了各种正则表达式,但没有一个接近解决方案,因此不值得在此列出。期待您的帮助,如果您需要更多详细信息,请告诉我。
您可以像这样使用 dplyr
和 stringr
...
library(dplyr)
library(stringr)
library(tidyr) # provides unnest()

# Build one row per ISS code found in `text`, carrying along the label and the
# two extension fields extracted from the same string.
# Regex escapes are written with doubled backslashes ("\\s", "\\d", "\\1"):
# R string literals reject a single "\s" / "\d" as an unrecognized escape.
df2 <- df %>%
  mutate(
    # remove whitespace between "ISS" and its digits, then extract every ISSnnn
    iss = str_extract_all(
      str_replace_all(text, "ISS\\s+(\\d+)", "ISS\\1"),
      "ISS\\d+"
    ),
    # digits following the word "label"
    label = str_match(text, "label\\s+(\\d+)")[, 2],
    # first lower-case letter + digits token after the label number
    ext1 = str_match(text, "label\\s+\\d+.*?([a-z]\\d+)")[, 2],
    # digits at the very end of the string
    ext2 = str_match(text, "\\s(\\d+)$")[, 2]
  ) %>%
  # one row per extracted ISS code; texts with no ISS match yield no rows
  # (see tidyr's `keep_empty` argument if those rows should be kept as NA)
  unnest(iss) %>%
  select(iss, label, ext1, ext2)

df2
iss label ext1 ext2
1 ISS101 23 x203 17
2 ISS201 23 x203 17
3 ISS5051 01 l018 <NA>
这可能是一个开始:
# For each text, extract every field with its own pattern, bind the matches
# column-wise, then stack the per-text results with plyr::rbind.fill, which
# fills columns missing from a given text with NA.
# Regex escapes use doubled backslashes ("\\s", "\\d"), as R string literals
# require; a single "\s" does not parse.
# NOTE(review): the ext2 lookbehind alternation reads as `x` OR `l\d{3}`, so
# after a bare "x" it matches the digits of ext1 itself (the printed result
# shows 203 rather than 17). Pattern kept as posted; revisit if ext2 matters.
do.call(
  plyr::rbind.fill,
  lapply(df$text, function(x) {
    as.data.frame(cbind(
      iss = unlist(stringr::str_extract_all(x, "(ISS\\s?\\d{3,4})")),
      label = unlist(stringr::str_extract_all(x, "(?<=label)\\s?(\\d{1,2})")),
      ext1 = unlist(stringr::str_extract_all(x, "((x|l)\\d{3})")),
      ext2 = unlist(stringr::str_extract_all(x, "(?<=x|l\\d{3})\\s?\\d{1,3}"))
    ))
  })
)
iss label ext1 ext2
1 ISS101 23 x203 203
2 ISS 201 23 x203 203
3 ISS5051 01 l018 <NA>
根据您的描述,我已尽力而为。在没有看到更多数据的情况下,我不能保证这将是可推广的,但它会为您提供的 df 生成所需的输出,因此它应该是一个好的开始。
library(dplyr)
library(stringr)
library(tidyr) # provides gather()

# create data frame
df <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "Label issues as ISS101 and ISS 201 on label 23 with x203 17",
    "issue as ISS5051 with label 01 as l018",
    "there is nothing here"
  )
)

# parse text into fields
# Regex escapes use doubled backslashes ("\\d", "\\s", "\\D"), as R string
# literals require; a single "\d" does not parse.
df <- df %>%
  mutate(
    # first ISS code written without a space, plus the following character
    iss = str_extract(text, "ISS\\d+\\D"),
    # first ISS code written with a space between "ISS" and the digits
    iss_space = str_extract(text, "ISS\\s\\d+\\D"),
    # two-step label extraction: grab from "label" onwards, then the digits
    label = str_extract(text, "label.+\\D"),
    label = str_extract(label, "\\d+\\D"),
    # whitespace + one non-digit + three digits
    ext1 = str_extract(text, "\\s\\D\\d{3}"),
    # two-step ext2 extraction: ext1 followed by two digits, then those digits
    ext2 = str_extract(text, "\\s\\D\\d{3}\\s\\d{2}"),
    ext2 = str_extract(ext2, "\\s\\d{2}")
  )

# clean up into correct format
# gather() is superseded by pivot_longer(); it is kept here so the result
# stays a plain data.frame, as in the printed output.
df <- df %>%
  gather(iss, iss_space, key = "type", value = "iss") %>%
  select(-text, -type) %>%
  distinct() %>%
  # drop the extra all-NA iss row for ids that already have a row
  filter(!(duplicated(id) & is.na(iss))) %>%
  arrange(id) %>%
  select(id, iss, label, ext1, ext2) %>%
  # normalise "ISS 201 " / "ISS101 " to "ISS201" / "ISS101"
  mutate(iss = str_replace_all(iss, " ", ""))

df
id iss label ext1 ext2
1 1 ISS101 23 x203 17
2 1 ISS201 23 x203 17
3 2 ISS5051 01 l018 <NA>
4 3 <NA> <NA> <NA> <NA>
我有以下df
# Example input: one row per id, with the free text to be parsed.
df <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "Label issues as ISS101 and ISS 201 on label 23 with x203 17",
    "issue as ISS5051 with label 01 as l018",
    "there is nothing here"
  )
)
我想从 df
中提取并创建以下数据框:
id iss label ext1 ext2
1 ISS101 23 x203 17
1 ISS201 23 x203 17
2 ISS5051 01 l018 NA
3 NA NA NA NA
iss 的长度可以变化,如示例所示。"ISS" 与后面的数字之间可能有空格,也可能没有,示例中两种情况都有。label、ext1 和 ext2 的长度是固定的。我已经使用 stringr 和 dplyr 尝试了各种正则表达式,但没有一个接近解决方案,因此不值得在此列出。期待您的帮助,如果您需要更多详细信息,请告诉我。
您可以像这样使用 dplyr
和 stringr
...
library(dplyr)
library(stringr)
library(tidyr) # provides unnest()

# Build one row per ISS code found in `text`, carrying along the label and the
# two extension fields extracted from the same string.
# Regex escapes are written with doubled backslashes ("\\s", "\\d", "\\1"):
# R string literals reject a single "\s" / "\d" as an unrecognized escape.
df2 <- df %>%
  mutate(
    # remove whitespace between "ISS" and its digits, then extract every ISSnnn
    iss = str_extract_all(
      str_replace_all(text, "ISS\\s+(\\d+)", "ISS\\1"),
      "ISS\\d+"
    ),
    # digits following the word "label"
    label = str_match(text, "label\\s+(\\d+)")[, 2],
    # first lower-case letter + digits token after the label number
    ext1 = str_match(text, "label\\s+\\d+.*?([a-z]\\d+)")[, 2],
    # digits at the very end of the string
    ext2 = str_match(text, "\\s(\\d+)$")[, 2]
  ) %>%
  # one row per extracted ISS code; texts with no ISS match yield no rows
  # (see tidyr's `keep_empty` argument if those rows should be kept as NA)
  unnest(iss) %>%
  select(iss, label, ext1, ext2)

df2
iss label ext1 ext2
1 ISS101 23 x203 17
2 ISS201 23 x203 17
3 ISS5051 01 l018 <NA>
这可能是一个开始:
# For each text, extract every field with its own pattern, bind the matches
# column-wise, then stack the per-text results with plyr::rbind.fill, which
# fills columns missing from a given text with NA.
# Regex escapes use doubled backslashes ("\\s", "\\d"), as R string literals
# require; a single "\s" does not parse.
# NOTE(review): the ext2 lookbehind alternation reads as `x` OR `l\d{3}`, so
# after a bare "x" it matches the digits of ext1 itself (the printed result
# shows 203 rather than 17). Pattern kept as posted; revisit if ext2 matters.
do.call(
  plyr::rbind.fill,
  lapply(df$text, function(x) {
    as.data.frame(cbind(
      iss = unlist(stringr::str_extract_all(x, "(ISS\\s?\\d{3,4})")),
      label = unlist(stringr::str_extract_all(x, "(?<=label)\\s?(\\d{1,2})")),
      ext1 = unlist(stringr::str_extract_all(x, "((x|l)\\d{3})")),
      ext2 = unlist(stringr::str_extract_all(x, "(?<=x|l\\d{3})\\s?\\d{1,3}"))
    ))
  })
)
iss label ext1 ext2
1 ISS101 23 x203 203
2 ISS 201 23 x203 203
3 ISS5051 01 l018 <NA>
根据您的描述,我已尽力而为。在没有看到更多数据的情况下,我不能保证这将是可推广的,但它会为您提供的 df 生成所需的输出,因此它应该是一个好的开始。
library(dplyr)
library(stringr)
library(tidyr) # provides gather()

# create data frame
df <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "Label issues as ISS101 and ISS 201 on label 23 with x203 17",
    "issue as ISS5051 with label 01 as l018",
    "there is nothing here"
  )
)

# parse text into fields
# Regex escapes use doubled backslashes ("\\d", "\\s", "\\D"), as R string
# literals require; a single "\d" does not parse.
df <- df %>%
  mutate(
    # first ISS code written without a space, plus the following character
    iss = str_extract(text, "ISS\\d+\\D"),
    # first ISS code written with a space between "ISS" and the digits
    iss_space = str_extract(text, "ISS\\s\\d+\\D"),
    # two-step label extraction: grab from "label" onwards, then the digits
    label = str_extract(text, "label.+\\D"),
    label = str_extract(label, "\\d+\\D"),
    # whitespace + one non-digit + three digits
    ext1 = str_extract(text, "\\s\\D\\d{3}"),
    # two-step ext2 extraction: ext1 followed by two digits, then those digits
    ext2 = str_extract(text, "\\s\\D\\d{3}\\s\\d{2}"),
    ext2 = str_extract(ext2, "\\s\\d{2}")
  )

# clean up into correct format
# gather() is superseded by pivot_longer(); it is kept here so the result
# stays a plain data.frame, as in the printed output.
df <- df %>%
  gather(iss, iss_space, key = "type", value = "iss") %>%
  select(-text, -type) %>%
  distinct() %>%
  # drop the extra all-NA iss row for ids that already have a row
  filter(!(duplicated(id) & is.na(iss))) %>%
  arrange(id) %>%
  select(id, iss, label, ext1, ext2) %>%
  # normalise "ISS 201 " / "ISS101 " to "ISS201" / "ISS101"
  mutate(iss = str_replace_all(iss, " ", ""))

df
id iss label ext1 ext2
1 1 ISS101 23 x203 17
2 1 ISS201 23 x203 17
3 2 ISS5051 01 l018 <NA>
4 3 <NA> <NA> <NA> <NA>