为下一个单词预测实现 n-gram
Implementing n-grams for next word prediction
我正在尝试使用三元组来预测下一个单词。
我已经能够上传语料库并根据频率识别最常见的三元组(trigram)。我在 R 中使用了 "ngrams"、"RWeka" 和 "tm" 包。我按照这个问题寻求指导:
What algorithm I need to find n-grams?
# Read the raw corpus; UTF-8 encoding so non-ASCII text is preserved
# consistently across platforms.
text1<-readLines("MyText.txt", encoding = "UTF-8")
# Wrap each line as a document in a tm corpus.
corpus <- Corpus(VectorSource(text1))
# NOTE(review): despite its name, min = 3 / max = 3 makes this a
# TRIgram tokenizer, not a bigram one — consider renaming to
# TrigramTokenizer to avoid confusion.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Term-document matrix whose "terms" are the trigrams produced above.
tdm <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
如果用户输入一组单词,我将如何生成下一个单词?例如,如果用户键入 "can of",我将如何检索三个最有可能的词(例如啤酒、苏打水、油漆等)?
这是一种入门方法:
#' Predict the most likely next words for a query prefix.
#'
#' @param queryHistoryTab Named numeric vector: names are previously seen
#'   phrases, values are their observed frequencies.
#' @param query The word prefix typed by the user, e.g. "can of".
#' @param n Maximum number of candidate next words to return.
#' @return Character vector of up to `n` next words, most frequent first.
#'
#' Reimplemented in base R (original used tau::textcnt via `require()`,
#' which is a loading anti-pattern and a third-party dependency). Also
#' fixes a prefix bug: the original substr() comparison matched any
#' string whose first nchar(query) characters equal the query, so
#' "can of" also matched "can offer ..." — we now require a word
#' boundary after the query.
f <- function(queryHistoryTab, query, n = 2) {
  query <- tolower(query)
  # We count (k)-grams where k = query words + 1, so the final gram
  # word is the prediction target.
  k <- length(strsplit(query, "\\s+")[[1]]) + 1
  # Repeat each phrase by its frequency so the counts are weighted.
  texts <- rep(tolower(names(queryHistoryTab)), queryHistoryTab)
  # Tokenize on non-word characters; this drops punctuation (e.g. the
  # comma in "soda,"), matching textcnt's string tokenization.
  grams <- unlist(lapply(strsplit(texts, "[^[:alnum:]']+"), function(w) {
    w <- w[nzchar(w)]
    if (length(w) < k) {
      return(character(0))
    }
    vapply(
      seq_len(length(w) - k + 1),
      function(i) paste(w[i:(i + k - 1)], collapse = " "),
      character(1)
    )
  }))
  counts <- table(grams)
  # Require a trailing space: only grams whose first k-1 WORDS equal
  # the query are candidates.
  idx <- startsWith(names(counts), paste0(query, " "))
  top <- head(names(sort(counts[idx], decreasing = TRUE)), n)
  # Strip "<query> " to leave only the predicted word.
  substr(top, nchar(query) + 2, nchar(top))
}
# Example: top 2 completions of "Can of". Matching is case-insensitive
# and each phrase's contribution is weighted by its count, so "soda"
# (2 + 2) outranks "beer" (3).
f(c("Can of beer" = 3, "can of Soda" = 2, "A can of water" = 1, "Buy me a can of soda, please" = 2), "Can of")
# [1] "soda" "beer"
我刚刚试过了!希望以下带注释的代码对您有所帮助,但我想看看 RNN 如何处理三元组!可能由于三元组数据的稀疏性,NaiveBayes 的效果并不好。
Gram_12实际上是三元组中前两个单词的二元组。将此视为第一步,而不是您努力的最终模型。
# Predict the next word for the query string `qry` using a trigram
# table (tri.df) with bigram back-off (bi.df), both built elsewhere.
#   tri.df: data frame with Gram_12 (first two words) and Gram_3 (third).
#   bi.df:  data frame with Gram_1 (first word) and Gram_2 (second).
# The result is left in `predict`: up to 5 candidate words ordered by
# frequency, or a fallback message when nothing matches.
#
# Changes from the original: the duplicated index1 computation and the
# two identical bigram-fallback branches are collapsed into one helper;
# grepl() now uses fixed = TRUE so user-typed words (e.g. "can.") are
# not interpreted as regular expressions; redundant mid-script
# library(stringr) calls removed.
library(stringr)
library(qdap)

# Return up to 5 values of `target` whose `key` row contains `pattern`,
# ranked by frequency; NULL when no row matches.
top_candidates <- function(key, target, pattern) {
  # fixed = TRUE: treat the user's words literally, not as a regex.
  hit <- grepl(pattern, key, fixed = TRUE)
  if (!any(hit)) {
    return(NULL)
  }
  # Frequency of each unique completion among the matching rows.
  freq <- data.frame(table(target[hit]))
  # Top 5 by frequency; drop zero-count factor levels.
  freq <- head(freq[order(-freq$Freq), ], 5)
  as.character(freq[freq$Freq > 0, ]$Var1)
}

if (word_count(qry) >= 2) {
  # Try the trigram table with the last two words of the query.
  lastwd <- word(qry, -2:-1)
  test <- paste(lastwd[1], lastwd[2])
  predict <- top_candidates(tri.df$Gram_12, tri.df$Gram_3, test)
  if (is.null(predict)) {
    # Back off to the bigram table using only the last word.
    predict <- top_candidates(bi.df$Gram_1, bi.df$Gram_2, word(qry, -1))
  }
} else {
  # Single-word query: bigram table only.
  predict <- top_candidates(bi.df$Gram_1, bi.df$Gram_2, word(qry, -1))
}
if (is.null(predict)) {
  predict <- "Need more training to predict"
}
我正在尝试使用三元组来预测下一个单词。
我已经能够上传语料库并根据频率识别最常见的三元组(trigram)。我在 R 中使用了 "ngrams"、"RWeka" 和 "tm" 包。我按照这个问题寻求指导:
What algorithm I need to find n-grams?
# Read the raw corpus; UTF-8 encoding so non-ASCII text is preserved
# consistently across platforms.
text1<-readLines("MyText.txt", encoding = "UTF-8")
# Wrap each line as a document in a tm corpus.
corpus <- Corpus(VectorSource(text1))
# NOTE(review): despite its name, min = 3 / max = 3 makes this a
# TRIgram tokenizer, not a bigram one — consider renaming to
# TrigramTokenizer to avoid confusion.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Term-document matrix whose "terms" are the trigrams produced above.
tdm <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
如果用户输入一组单词,我将如何生成下一个单词?例如,如果用户键入 "can of",我将如何检索三个最有可能的词(例如啤酒、苏打水、油漆等)?
这是一种入门方法:
#' Predict the most likely next words for a query prefix.
#'
#' @param queryHistoryTab Named numeric vector: names are previously seen
#'   phrases, values are their observed frequencies.
#' @param query The word prefix typed by the user, e.g. "can of".
#' @param n Maximum number of candidate next words to return.
#' @return Character vector of up to `n` next words, most frequent first.
#'
#' Reimplemented in base R (original used tau::textcnt via `require()`,
#' which is a loading anti-pattern and a third-party dependency). Also
#' fixes a prefix bug: the original substr() comparison matched any
#' string whose first nchar(query) characters equal the query, so
#' "can of" also matched "can offer ..." — we now require a word
#' boundary after the query.
f <- function(queryHistoryTab, query, n = 2) {
  query <- tolower(query)
  # We count (k)-grams where k = query words + 1, so the final gram
  # word is the prediction target.
  k <- length(strsplit(query, "\\s+")[[1]]) + 1
  # Repeat each phrase by its frequency so the counts are weighted.
  texts <- rep(tolower(names(queryHistoryTab)), queryHistoryTab)
  # Tokenize on non-word characters; this drops punctuation (e.g. the
  # comma in "soda,"), matching textcnt's string tokenization.
  grams <- unlist(lapply(strsplit(texts, "[^[:alnum:]']+"), function(w) {
    w <- w[nzchar(w)]
    if (length(w) < k) {
      return(character(0))
    }
    vapply(
      seq_len(length(w) - k + 1),
      function(i) paste(w[i:(i + k - 1)], collapse = " "),
      character(1)
    )
  }))
  counts <- table(grams)
  # Require a trailing space: only grams whose first k-1 WORDS equal
  # the query are candidates.
  idx <- startsWith(names(counts), paste0(query, " "))
  top <- head(names(sort(counts[idx], decreasing = TRUE)), n)
  # Strip "<query> " to leave only the predicted word.
  substr(top, nchar(query) + 2, nchar(top))
}
# Example: top 2 completions of "Can of". Matching is case-insensitive
# and each phrase's contribution is weighted by its count, so "soda"
# (2 + 2) outranks "beer" (3).
f(c("Can of beer" = 3, "can of Soda" = 2, "A can of water" = 1, "Buy me a can of soda, please" = 2), "Can of")
# [1] "soda" "beer"
我刚刚试过了!希望以下带注释的代码对您有所帮助,但我想看看 RNN 如何处理三元组!可能由于三元组数据的稀疏性,NaiveBayes 的效果并不好。 Gram_12 实际上是三元组中前两个单词构成的二元组。请将此视为第一步,而不是您最终的模型。
# Predict the next word for the query string `qry` using a trigram
# table (tri.df) with bigram back-off (bi.df), both built elsewhere.
#   tri.df: data frame with Gram_12 (first two words) and Gram_3 (third).
#   bi.df:  data frame with Gram_1 (first word) and Gram_2 (second).
# The result is left in `predict`: up to 5 candidate words ordered by
# frequency, or a fallback message when nothing matches.
#
# Changes from the original: the duplicated index1 computation and the
# two identical bigram-fallback branches are collapsed into one helper;
# grepl() now uses fixed = TRUE so user-typed words (e.g. "can.") are
# not interpreted as regular expressions; redundant mid-script
# library(stringr) calls removed.
library(stringr)
library(qdap)

# Return up to 5 values of `target` whose `key` row contains `pattern`,
# ranked by frequency; NULL when no row matches.
top_candidates <- function(key, target, pattern) {
  # fixed = TRUE: treat the user's words literally, not as a regex.
  hit <- grepl(pattern, key, fixed = TRUE)
  if (!any(hit)) {
    return(NULL)
  }
  # Frequency of each unique completion among the matching rows.
  freq <- data.frame(table(target[hit]))
  # Top 5 by frequency; drop zero-count factor levels.
  freq <- head(freq[order(-freq$Freq), ], 5)
  as.character(freq[freq$Freq > 0, ]$Var1)
}

if (word_count(qry) >= 2) {
  # Try the trigram table with the last two words of the query.
  lastwd <- word(qry, -2:-1)
  test <- paste(lastwd[1], lastwd[2])
  predict <- top_candidates(tri.df$Gram_12, tri.df$Gram_3, test)
  if (is.null(predict)) {
    # Back off to the bigram table using only the last word.
    predict <- top_candidates(bi.df$Gram_1, bi.df$Gram_2, word(qry, -1))
  }
} else {
  # Single-word query: bigram table only.
  predict <- top_candidates(bi.df$Gram_1, bi.df$Gram_2, word(qry, -1))
}
if (is.null(predict)) {
  predict <- "Need more training to predict"
}