根据R中的关键字创建类别

Create category based on keyword in R

我有一个包含两列的数据框:第一列是关键字,第二列是相关类别。

keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")

lookup_table <- data.frame(keywords, categories)

我希望每次有新标签时,我检查是否有对应的类别,如果有,附上类别。

因此对于下面的示例,值 'category1' 附加到新列的第一行:

new_labels <- c("keyword1 qefjhqek", "hfaef", "fihiz")

非常感谢帮助!

这里只是用str_extract获取相关文字加入参考table.

keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")

lookup_table <- data.frame(keywords, categories)
new_labels <- c("keyword1 qefjhqek", "hfaef", "fihiz")

library(data.table)
library(tidyverse)
ref_tbl <- 
# data.table(
# For the AntoniosK's sugguestion, recommend dplyr-like function.
tibble(
    keywords = keywords
    ,categories = categories
)

# as.data.table(
# For the AntoniosK's sugguestion, recommend dplyr-like function.
as_tibble(
    new_labels
    ) %>% 
    mutate(ref_key = str_extract(new_labels
                                 # ,'keyword[:digit:]'
                                 ,(
                                   keywords %>% 
                                     str_flatten('|')
                                   # regular expression
                                 )
                                 )) %>% 
    left_join(
         ref_tbl
         ,by=c('ref_key'='keywords')
    )
#> # A tibble: 3 x 3
#>   value             ref_key  categories
#>   <chr>             <chr>    <chr>     
#> 1 keyword1 qefjhqek keyword1 category1 
#> 2 hfaef             <NA>     <NA>      
#> 3 fihiz             <NA>     <NA>

reprex package (v0.2.1)

于 2018-11-10 创建

根据@AntoniosK 的问题,我比较了data.tabletibble。事实是有一个重要的迹象支持 tibble 优于 data.table

  1. tibble 只有 2990 毫秒 -> 1st :
  2. data.tableas.data.table 3240 毫秒 -> 第二 :
  3. data.table 只有 3840 毫秒 -> 第 3 :

这是一个 tidyverse 解决方案,可以在您的新标签和关键字之间创建所有组合,找到匹配项并提取关键字,然后从查找中加入类别 table:

keywords <- c("keyword1", "keyword2", "keyword3")
categories <- c("category1", "category2", "category3")
lookup_table <- data.frame(keywords, categories)

new_labels = c("keyword1 qefjhqek", "hfaef", "fihiz")

library(tidyverse)

expand.grid(new_labels=new_labels, keywords=lookup_table$keywords) %>%  # create all combinations
  mutate_all(as.character) %>%                                          # update to character (if needed)
  mutate(v = map2_lgl(new_labels, keywords, ~grepl(.y, .x))) %>%        # check if there's a match
  group_by(new_labels) %>%                                              # for each new label
  summarise(keywords = ifelse(sum(v) > 0, keywords[v==TRUE], NA)) %>%   # get the keyword if there is one
  left_join(lookup_table, by="keywords") %>%                            # join categoris
  select(-keywords)                                                     # remove keywords

# # A tibble: 3 x 2
#   new_labels        categories
#   <chr>             <fct>     
# 1 fihiz             NA        
# 2 hfaef             NA        
# 3 keyword1 qefjhqek category1 

还有一个受@Jianxiang 的回答启发的替代版本,它使用 str_extract 获取相关关键字,而不是创建所有组合:

data.frame(new_labels) %>%
  mutate(keywords = str_extract(new_labels, str_flatten(lookup_table$keywords, "|"))) %>%
  left_join(lookup_table, by="keywords") %>%        
  select(-keywords)

#           new_labels categories
# 1 keyword1 qefjhqek  category1
# 2             hfaef       <NA>
# 3             fihiz       <NA>
library(tidyverse)

tibble(
    label = new_labels,
    keywords = unlist(
      map2(new_labels, paste(lookup_table$keywords, collapse = "|"), str_extract)
    )) %>% 
    left_join(lookup_table) %>%
    select(- keywords)

# A tibble: 3 x 2
#   label             categories
#   <chr>             <fct>   
# 1 keyword1 qefjhqek category1
# 2 hfaef             <NA>    
# 3 fihiz             <NA>