R 中进行关键字匹配的高效方法?
Efficient way of keyword matching in R?
我正在尝试匹配两个大型书目数据集(1.8M obs 和 3.9M obs)之间的关键字,它们来自记录中的各个字段:标题、作者、出版日期、出版商。
对于第一个数据集 (1.8M) 中的每个条目,我想将其字符串中的每个关键字与另一个数据集 (3.9M) 中每个条目的关键字进行匹配,并返回匹配数最多的行。
我想出的方法,使用 tidyverse 的 separate() 和 gather() 函数,以及一些基本的 dplyr,似乎可行,但无法扩展到整个数据集。
是否有更有效(或完全更好)的方法来做到这一点?
以下是示例数据(每个数据集包含三条关键字字符串)和代码:
library(dplyr)
library(tidyverse)

# Sample data: one row per bibliographic record. `keywords` holds all of the
# record's keywords as a single space-separated string.
df1 <- data.frame(
  "df1.index" = c(1:3),
  "keywords" = c("2013 history interpretation oxford the tractatus univ wittgensteins",
                 "2014 baxi law of oxford pratiksha public secrets univ",
                 "2014 darwin flinching from looking on oxford scientific shell-shock"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  "df2.index" = c(1:3),
  "keywords" = c("2014 darwin flinching from looking on oxford scientific theatricality",
                 "2013 interpretation oxford tractatushistory univ wittgensteins",
                 "2014 baxi in india law of oxford pratiksha public rape secrets trials univ"),
  stringsAsFactors = FALSE
)

# Reshape to long format: one row per (record, distinct keyword).
# `separate_rows()` has no fixed column limit, so records with more than ten
# keywords are kept whole (splitting into key1..key10 silently dropped the
# rest, e.g. "secrets trials univ" from the third df2 record).
# Splitting on runs of whitespace makes a separate trim step unnecessary;
# empty tokens from leading/trailing whitespace and NA keywords are removed.
# `distinct()` collapses keywords repeated within one record so that they are
# not counted more than once by the join below.
df1_gather <- df1 %>%
  mutate(key.match = as.character(keywords)) %>%
  separate_rows(key.match, sep = "\\s+") %>%
  filter(!is.na(key.match), key.match != "") %>%
  distinct()

# Only the index and the keyword are needed from the second data set.
df2_gather <- df2 %>%
  mutate(key.match = as.character(keywords)) %>%
  separate_rows(key.match, sep = "\\s+") %>%
  filter(!is.na(key.match), key.match != "") %>%
  distinct(df2.index, key.match)

# Pair up records that share a keyword. An inner join with an explicit key
# replaces "left join, then drop unmatched rows". Then count the shared
# keywords per (df1, df2) pair and keep, for each df1 record, the df2
# record(s) with the highest count (ties are all kept).
df_join <- df1_gather %>%
  inner_join(df2_gather, by = "key.match") %>%
  count(df1.index, df2.index) %>%
  group_by(df1.index) %>%
  filter(n == max(n)) %>%
  ungroup()

# Add the original keyword strings back so the matches can be reviewed.
df_join$df1.keywords <- df1$keywords[match(df_join$df1.index, df1$df1.index)]
df_join$df2.keywords <- df2$keywords[match(df_join$df2.index, df2$df2.index)]
也许可以采用下面这种方法,直接按每个关键字进行计数。希望对您有所帮助:
library(tidytext)

# Separate: one row per (record index, word). `unnest_tokens()` needs a
# character column, hence the explicit `as.character()`.
# NOTE(review): with its default settings `unnest_tokens()` lower-cases the
# text and tokenises on word boundaries, so a hyphenated keyword such as
# "shell-shock" is expected to become two tokens -- confirm this is wanted.
l1 <- df1 %>%
  mutate(keywords = as.character(keywords)) %>%
  unnest_tokens(word, keywords)
l2 <- df2 %>%
  mutate(keywords = as.character(keywords)) %>%
  unnest_tokens(word, keywords)

# Join in both directions on the token. The key is spelled out so the join
# cannot silently change if the two frames ever share another column name.
l3 <- left_join(l1, l2, by = "word")
l4 <- left_join(l2, l1, by = "word")

# Compute number of occurrences: shared-word counts per (df1, df2) pair.
# `exclude = NULL` keeps an <NA> level that counts the words with no match
# on the other side.
table(l3$df1.index, l3$df2.index, exclude = NULL)
table(l4$df1.index, l4$df2.index, exclude = NULL)
输出:
       1 2 3 <NA>
  1    1 5 2    3
  2    2 2 9    0
  3    8 1 2    2

       1 2 3
  1    1 5 2
  2    2 2 9
  3    8 1 2
  <NA> 1 1 4
我正在尝试匹配两个大型书目数据集(1.8M obs 和 3.9M obs)之间的关键字,它们来自记录中的各个字段:标题、作者、出版日期、出版商。
对于第一个数据集 (1.8M) 中的每个条目,我想将其字符串中的每个关键字与另一个数据集 (3.9M) 中每个条目的关键字进行匹配,并返回匹配数最多的行。
我想出的方法,使用 tidyverse 的 separate() 和 gather() 函数,以及一些基本的 dplyr,似乎可行,但无法扩展到整个数据集。
是否有更有效(或完全更好)的方法来做到这一点?
以下是示例数据(每个数据集包含三条关键字字符串)和代码:
library(dplyr)
library(tidyverse)

# Sample data: one row per bibliographic record. `keywords` holds all of the
# record's keywords as a single space-separated string.
df1 <- data.frame(
  "df1.index" = c(1:3),
  "keywords" = c("2013 history interpretation oxford the tractatus univ wittgensteins",
                 "2014 baxi law of oxford pratiksha public secrets univ",
                 "2014 darwin flinching from looking on oxford scientific shell-shock"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  "df2.index" = c(1:3),
  "keywords" = c("2014 darwin flinching from looking on oxford scientific theatricality",
                 "2013 interpretation oxford tractatushistory univ wittgensteins",
                 "2014 baxi in india law of oxford pratiksha public rape secrets trials univ"),
  stringsAsFactors = FALSE
)

# Reshape to long format: one row per (record, distinct keyword).
# `separate_rows()` has no fixed column limit, so records with more than ten
# keywords are kept whole (splitting into key1..key10 silently dropped the
# rest, e.g. "secrets trials univ" from the third df2 record).
# Splitting on runs of whitespace makes a separate trim step unnecessary;
# empty tokens from leading/trailing whitespace and NA keywords are removed.
# `distinct()` collapses keywords repeated within one record so that they are
# not counted more than once by the join below.
df1_gather <- df1 %>%
  mutate(key.match = as.character(keywords)) %>%
  separate_rows(key.match, sep = "\\s+") %>%
  filter(!is.na(key.match), key.match != "") %>%
  distinct()

# Only the index and the keyword are needed from the second data set.
df2_gather <- df2 %>%
  mutate(key.match = as.character(keywords)) %>%
  separate_rows(key.match, sep = "\\s+") %>%
  filter(!is.na(key.match), key.match != "") %>%
  distinct(df2.index, key.match)

# Pair up records that share a keyword. An inner join with an explicit key
# replaces "left join, then drop unmatched rows". Then count the shared
# keywords per (df1, df2) pair and keep, for each df1 record, the df2
# record(s) with the highest count (ties are all kept).
df_join <- df1_gather %>%
  inner_join(df2_gather, by = "key.match") %>%
  count(df1.index, df2.index) %>%
  group_by(df1.index) %>%
  filter(n == max(n)) %>%
  ungroup()

# Add the original keyword strings back so the matches can be reviewed.
df_join$df1.keywords <- df1$keywords[match(df_join$df1.index, df1$df1.index)]
df_join$df2.keywords <- df2$keywords[match(df_join$df2.index, df2$df2.index)]
也许可以采用下面这种方法,直接按每个关键字进行计数。希望对您有所帮助:
library(tidytext)

# Separate: one row per (record index, word). `unnest_tokens()` needs a
# character column, hence the explicit `as.character()`.
# NOTE(review): with its default settings `unnest_tokens()` lower-cases the
# text and tokenises on word boundaries, so a hyphenated keyword such as
# "shell-shock" is expected to become two tokens -- confirm this is wanted.
l1 <- df1 %>%
  mutate(keywords = as.character(keywords)) %>%
  unnest_tokens(word, keywords)
l2 <- df2 %>%
  mutate(keywords = as.character(keywords)) %>%
  unnest_tokens(word, keywords)

# Join in both directions on the token. The key is spelled out so the join
# cannot silently change if the two frames ever share another column name.
l3 <- left_join(l1, l2, by = "word")
l4 <- left_join(l2, l1, by = "word")

# Compute number of occurrences: shared-word counts per (df1, df2) pair.
# `exclude = NULL` keeps an <NA> level that counts the words with no match
# on the other side.
table(l3$df1.index, l3$df2.index, exclude = NULL)
table(l4$df1.index, l4$df2.index, exclude = NULL)
输出:
       1 2 3 <NA>
  1    1 5 2    3
  2    2 2 9    0
  3    8 1 2    2

       1 2 3
  1    1 5 2
  2    2 2 9
  3    8 1 2
  <NA> 1 1 4