根据R中的关键字创建类别
Create category based on keyword in R
我有一个包含两列的数据框:第一列是关键字,第二列是相关类别。
keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")
lookup_table <- data.frame(keywords, categories)
我希望每次有新标签时,我检查是否有对应的类别,如果有,附上类别。
因此对于下面的示例,值 'category1' 附加到新列的第一行:
new_labels <- c("keyword1 qefjhqek", "hfaef", "fihiz")
非常感谢帮助!
这里只是用str_extract
获取相关文字加入参考table.
keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")
lookup_table <- data.frame(keywords, categories)
new_labels <- c("keyword1 qefjhqek", "hfaef", "fihiz")
library(data.table)
library(tidyverse)
ref_tbl <-
# data.table(
# For the AntoniosK's sugguestion, recommend dplyr-like function.
tibble(
keywords = keywords
,categories = categories
)
# as.data.table(
# For the AntoniosK's sugguestion, recommend dplyr-like function.
as_tibble(
new_labels
) %>%
mutate(ref_key = str_extract(new_labels
# ,'keyword[:digit:]'
,(
keywords %>%
str_flatten('|')
# regular expression
)
)) %>%
left_join(
ref_tbl
,by=c('ref_key'='keywords')
)
#> # A tibble: 3 x 3
#> value ref_key categories
#> <chr> <chr> <chr>
#> 1 keyword1 qefjhqek keyword1 category1
#> 2 hfaef <NA> <NA>
#> 3 fihiz <NA> <NA>
由 reprex package (v0.2.1)
于 2018-11-10 创建
根据@AntoniosK 的问题,我比较了data.table
和tibble
。事实是有一个重要的迹象支持 tibble
优于 data.table
。
tibble
只有 2990 毫秒 -> 1st
:
data.table
和 as.data.table
3240 毫秒 -> 第二
:
data.table
只有 3840 毫秒 -> 第 3
:
这是一个 tidyverse
解决方案,可以在您的新标签和关键字之间创建所有组合,找到匹配项并提取关键字,然后从查找中加入类别 table:
keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")
lookup_table <- data.frame(keywords, categories)
new_labels = c("keyword1 qefjhqek", "hfaef", "fihiz")
library(tidyverse)
expand.grid(new_labels=new_labels, keywords=lookup_table$keywords) %>% # create all combinations
mutate_all(as.character) %>% # update to character (if needed)
mutate(v = map2_lgl(new_labels, keywords, ~grepl(.y, .x))) %>% # check if there's a match
group_by(new_labels) %>% # for each new label
summarise(keywords = ifelse(sum(v) > 0, keywords[v==TRUE], NA)) %>% # get the keyword if there is one
left_join(lookup_table, by="keywords") %>% # join categoris
select(-keywords) # remove keywords
# # A tibble: 3 x 2
# new_labels categories
# <chr> <fct>
# 1 fihiz NA
# 2 hfaef NA
# 3 keyword1 qefjhqek category1
还有一个受@Jianxiang 的回答启发的替代版本,它使用 str_extract
获取相关关键字,而不是创建所有组合:
data.frame(new_labels) %>%
mutate(keywords = str_extract(new_labels, str_flatten(lookup_table$keywords, "|"))) %>%
left_join(lookup_table, by="keywords") %>%
select(-keywords)
# new_labels categories
# 1 keyword1 qefjhqek category1
# 2 hfaef <NA>
# 3 fihiz <NA>
library(tidyverse)
tibble(
label = new_labels,
keywords = unlist(
map2(new_labels, paste(lookup_table$keywords, collapse = "|"), str_extract)
)) %>%
left_join(lookup_table) %>%
select(- keywords)
# A tibble: 3 x 2
# label categories
# <chr> <fct>
# 1 keyword1 qefjhqek category1
# 2 hfaef <NA>
# 3 fihiz <NA>
我有一个包含两列的数据框:第一列是关键字,第二列是相关类别。
keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")
lookup_table <- data.frame(keywords, categories)
我希望每次有新标签时,我检查是否有对应的类别,如果有,附上类别。
因此对于下面的示例,值 'category1' 附加到新列的第一行:
new_labels <- c("keyword1 qefjhqek", "hfaef", "fihiz")
非常感谢帮助!
这里只是用str_extract
获取相关文字加入参考table.
keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")
lookup_table <- data.frame(keywords, categories)
new_labels <- c("keyword1 qefjhqek", "hfaef", "fihiz")
library(data.table)
library(tidyverse)
ref_tbl <-
# data.table(
# For the AntoniosK's sugguestion, recommend dplyr-like function.
tibble(
keywords = keywords
,categories = categories
)
# as.data.table(
# For the AntoniosK's sugguestion, recommend dplyr-like function.
as_tibble(
new_labels
) %>%
mutate(ref_key = str_extract(new_labels
# ,'keyword[:digit:]'
,(
keywords %>%
str_flatten('|')
# regular expression
)
)) %>%
left_join(
ref_tbl
,by=c('ref_key'='keywords')
)
#> # A tibble: 3 x 3
#> value ref_key categories
#> <chr> <chr> <chr>
#> 1 keyword1 qefjhqek keyword1 category1
#> 2 hfaef <NA> <NA>
#> 3 fihiz <NA> <NA>
由 reprex package (v0.2.1)
于 2018-11-10 创建根据@AntoniosK 的问题,我比较了data.table
和tibble
。事实是有一个重要的迹象支持 tibble
优于 data.table
。
tibble
只有 2990 毫秒 -> 1st :data.table
和as.data.table
3240 毫秒 -> 第二 :data.table
只有 3840 毫秒 -> 第 3 :
这是一个 tidyverse
解决方案,可以在您的新标签和关键字之间创建所有组合,找到匹配项并提取关键字,然后从查找中加入类别 table:
keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")
lookup_table <- data.frame(keywords, categories)
new_labels = c("keyword1 qefjhqek", "hfaef", "fihiz")
library(tidyverse)
expand.grid(new_labels=new_labels, keywords=lookup_table$keywords) %>% # create all combinations
mutate_all(as.character) %>% # update to character (if needed)
mutate(v = map2_lgl(new_labels, keywords, ~grepl(.y, .x))) %>% # check if there's a match
group_by(new_labels) %>% # for each new label
summarise(keywords = ifelse(sum(v) > 0, keywords[v==TRUE], NA)) %>% # get the keyword if there is one
left_join(lookup_table, by="keywords") %>% # join categoris
select(-keywords) # remove keywords
# # A tibble: 3 x 2
# new_labels categories
# <chr> <fct>
# 1 fihiz NA
# 2 hfaef NA
# 3 keyword1 qefjhqek category1
还有一个受@Jianxiang 的回答启发的替代版本,它使用 str_extract
获取相关关键字,而不是创建所有组合:
data.frame(new_labels) %>%
mutate(keywords = str_extract(new_labels, str_flatten(lookup_table$keywords, "|"))) %>%
left_join(lookup_table, by="keywords") %>%
select(-keywords)
# new_labels categories
# 1 keyword1 qefjhqek category1
# 2 hfaef <NA>
# 3 fihiz <NA>
library(tidyverse)
tibble(
label = new_labels,
keywords = unlist(
map2(new_labels, paste(lookup_table$keywords, collapse = "|"), str_extract)
)) %>%
left_join(lookup_table) %>%
select(- keywords)
# A tibble: 3 x 2
# label categories
# <chr> <fct>
# 1 keyword1 qefjhqek category1
# 2 hfaef <NA>
# 3 fihiz <NA>