为文本提取匹配的关键字
Extract matched keyword for the text
寻求有关从文本中提取关键字的帮助。我有两个数据框。第一个数据框有描述列,另一个数据框只有一列关键字。
我想在 dataframe1 的描述字段中搜索 dataframe2 中的关键字,并用匹配到的关键字在 dataframe1 中创建一个新列。如果匹配到多个关键字,新列中应包含所有匹配的关键字,并以逗号分隔,如下所示。
Dataframe2
Keywords
New
FUND
EVENT
Author
book
Dataframe1
ID NAME Month DESCRIPTION Keywords
12 x1 Jan funding recived fund
23 x2 Feb author of the book author, book
14 x3 Mar new year event new, event
此外,即使关键字只是描述中某个完整单词的一部分,我也需要提取出该关键字。例如,描述中出现 Funding 时,新列中应得到关键字 fund。
我们可以使用 fuzzyjoin 包中的 regex_left_join 进行连接,然后按行分组 (group_by),再把每组匹配到的关键字拼接 (paste) 成一个字符串。
library(fuzzyjoin)
library(dplyr)

# Match every DESCRIPTION against each keyword, treating the keyword as a
# case-insensitive regular expression, then collapse the matches per row
# into one comma-separated, lower-cased string.
df1 %>%
  regex_left_join(df2, by = c("DESCRIPTION" = "Keywords"),
                  ignore_case = TRUE) %>%
  group_by(ID, NAME, Month, DESCRIPTION) %>%
  # The left join yields NA for a description without any match; drop it so
  # the result is an empty string instead of the literal text "NA".
  summarise(
    Keywords = toString(unique(tolower(Keywords[!is.na(Keywords)])))
  ) %>%
  # summarise() only removes the last grouping level; ungroup fully so that
  # later verbs do not silently keep operating per group.
  ungroup()
# A tibble: 3 x 5
# ID NAME Month DESCRIPTION Keywords
# <int> <chr> <chr> <chr> <chr>
#1 12 x1 Jan funding recived fund
#2 14 x3 Mar new year event new, event
#3 23 x2 Feb author of the book author, book
数据
# Sample records; DESCRIPTION is the free-text column that gets searched.
df1 <- data.frame(
  ID = c(12L, 23L, 14L),
  NAME = c("x1", "x2", "x3"),
  Month = c("Jan", "Feb", "Mar"),
  DESCRIPTION = c("funding recived", "author of the book", "new year event"),
  stringsAsFactors = FALSE
)

# Keywords to look for in df1$DESCRIPTION (mixed case on purpose).
df2 <- data.frame(
  Keywords = c("New", "FUND", "EVENT", "Author", "book"),
  stringsAsFactors = FALSE
)
另一个解决方案是使用 stringr::str_detect 检查每个 DESCRIPTION 中是否包含 Keywords 中的关键字。
library(stringr)

# For each description, collect every keyword from df2 that occurs in it and
# join the hits with commas. Keywords are reported exactly as spelled in df2.
df1$Keywords <- vapply(
  df1$DESCRIPTION,
  function(description) {
    # Match case-insensitively on both sides, so that e.g. "Funding" in a
    # description still finds the keyword "FUND".
    hits <- str_detect(description, regex(df2$Keywords, ignore_case = TRUE))
    # which() drops NA results (from an NA description), giving "" rather
    # than a string of "NA" entries.
    paste(df2$Keywords[which(hits)], collapse = ",")
  },
  character(1),
  # Without this the descriptions would be attached as names to the column.
  USE.NAMES = FALSE
)
df1
# ID NAME Month DESCRIPTION Keywords
# 1 12 x1 Jan funding recived FUND
# 2 23 x2 Feb author of the book Author,book
# 3 14 x3 Mar new year event New,EVENT
数据:
# Sample records, parsed from inline text; the single-quoted DESCRIPTION
# values are read as one field each.
df1 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "ID NAME Month DESCRIPTION
12 x1 Jan 'funding recived'
23 x2 Feb 'author of the book'
14 x3 Mar 'new year event'"
)

# Keywords to look for in df1$DESCRIPTION, one per line.
df2 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "Keywords
New
FUND
EVENT
Author
book"
)
寻求有关从文本中提取关键字的帮助。我有两个数据框。第一个数据框有描述列,另一个数据框只有一列关键字。
我想在 dataframe1 的描述字段中搜索 dataframe2 中的关键字,并用匹配到的关键字在 dataframe1 中创建一个新列。如果匹配到多个关键字,新列中应包含所有匹配的关键字,并以逗号分隔,如下所示。
Dataframe2
Keywords
New
FUND
EVENT
Author
book
Dataframe1
ID NAME Month DESCRIPTION Keywords
12 x1 Jan funding recived fund
23 x2 Feb author of the book author, book
14 x3 Mar new year event new, event
此外,即使关键字只是描述中某个完整单词的一部分,我也需要提取出该关键字。例如,描述中出现 Funding 时,新列中应得到关键字 fund。
我们可以使用 fuzzyjoin 包中的 regex_left_join 进行连接,然后按行分组 (group_by),再把每组匹配到的关键字拼接 (paste) 成一个字符串。
library(fuzzyjoin)
library(dplyr)

# Match every DESCRIPTION against each keyword, treating the keyword as a
# case-insensitive regular expression, then collapse the matches per row
# into one comma-separated, lower-cased string.
df1 %>%
  regex_left_join(df2, by = c("DESCRIPTION" = "Keywords"),
                  ignore_case = TRUE) %>%
  group_by(ID, NAME, Month, DESCRIPTION) %>%
  # The left join yields NA for a description without any match; drop it so
  # the result is an empty string instead of the literal text "NA".
  summarise(
    Keywords = toString(unique(tolower(Keywords[!is.na(Keywords)])))
  ) %>%
  # summarise() only removes the last grouping level; ungroup fully so that
  # later verbs do not silently keep operating per group.
  ungroup()
# A tibble: 3 x 5
# ID NAME Month DESCRIPTION Keywords
# <int> <chr> <chr> <chr> <chr>
#1 12 x1 Jan funding recived fund
#2 14 x3 Mar new year event new, event
#3 23 x2 Feb author of the book author, book
数据
# Sample records; DESCRIPTION is the free-text column that gets searched.
df1 <- data.frame(
  ID = c(12L, 23L, 14L),
  NAME = c("x1", "x2", "x3"),
  Month = c("Jan", "Feb", "Mar"),
  DESCRIPTION = c("funding recived", "author of the book", "new year event"),
  stringsAsFactors = FALSE
)

# Keywords to look for in df1$DESCRIPTION (mixed case on purpose).
df2 <- data.frame(
  Keywords = c("New", "FUND", "EVENT", "Author", "book"),
  stringsAsFactors = FALSE
)
另一个解决方案是使用 stringr::str_detect 检查每个 DESCRIPTION 中是否包含 Keywords 中的关键字。
library(stringr)

# For each description, collect every keyword from df2 that occurs in it and
# join the hits with commas. Keywords are reported exactly as spelled in df2.
df1$Keywords <- vapply(
  df1$DESCRIPTION,
  function(description) {
    # Match case-insensitively on both sides, so that e.g. "Funding" in a
    # description still finds the keyword "FUND".
    hits <- str_detect(description, regex(df2$Keywords, ignore_case = TRUE))
    # which() drops NA results (from an NA description), giving "" rather
    # than a string of "NA" entries.
    paste(df2$Keywords[which(hits)], collapse = ",")
  },
  character(1),
  # Without this the descriptions would be attached as names to the column.
  USE.NAMES = FALSE
)
df1
# ID NAME Month DESCRIPTION Keywords
# 1 12 x1 Jan funding recived FUND
# 2 23 x2 Feb author of the book Author,book
# 3 14 x3 Mar new year event New,EVENT
数据:
# Sample records, parsed from inline text; the single-quoted DESCRIPTION
# values are read as one field each.
df1 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "ID NAME Month DESCRIPTION
12 x1 Jan 'funding recived'
23 x2 Feb 'author of the book'
14 x3 Mar 'new year event'"
)

# Keywords to look for in df1$DESCRIPTION, one per line.
df2 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "Keywords
New
FUND
EVENT
Author
book"
)