data.table 查找三元组概率的错误和警告

data.table error and warnings for finding trigram probability

我正在尝试使用与 https://thiloshon.wordpress.com/2018/03/11/build-your-own-word-sentence-prediction-application-part-02/ 中相同的代码来进行词级预测。输入文本数据也在提到的 link 中,我使用 en_US.news.txt 文件作为我唯一的输入文件。

library(quanteda)
library(data.table)

#read the .txt file
df=readLines('en_US.news.txt')

#take a sample of the df
sampleHolderNews <- sample(length(df), length(df) * 0.1)
US_News_Sample <- df[sampleHolderNews]

#build the corpus of the data 
corp <- corpus(US_News_Sample)

#Preprocessing

master_Tokens <- tokens(x = tolower(corp),remove_punct = 
TRUE,remove_numbers = TRUE,remove_hyphens = TRUE,remove_symbols = TRUE)
stemed_words <- tokens_wordstem(master_Tokens, language = "english")

#tokenization#
bi_gram <- tokens_ngrams(stemed_words, n = 2)
tri_gram <- tokens_ngrams(stemed_words, n = 3)

uni_DFM <- dfm(stemed_words)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)

uni_DFM <- dfm_trim(uni_DFM, 3)
bi_DFM <- dfm_trim(bi_DFM, 3)
tri_DFM <- dfm_trim(tri_DFM, 3)

sums_U <- colSums(uni_DFM)
sums_B <- colSums(bi_DFM)
sums_T <- colSums(tri_DFM)

# Create data tables with individual words as columns
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)

bi_words <- data.table(
word_1 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 2),
count = sums_B)

tri_words <- data.table(
word_1 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 2),
word_3 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 3),
count = sums_T)

#indexing#
setkey(uni_words, word_1)
setkey(bi_words, word_1, word_2)
setkey(tri_words, word_1, word_2, word_3)

######## Finding Bi-Gram Probability #################

discount_value <- 0.75
# Finding number of bi-gram words
numOfBiGrams <- nrow(bi_words[.(word_1, word_2)])
# Dividing number of times word 2 occurs as second part of bigram, by total number of bigrams.  
# Finding probability for a word given the number of times it was second word of a bigram
ckn <- bi_words[, .(Prob = ((.N) / numOfBiGrams)), by = word_2]
setkey(ckn, word_2)
# Assigning the probabilities as second word of bigram, to unigrams
uni_words[, Prob := ckn[word_1, Prob]]
uni_words <- uni_words[!is.na(uni_words$Prob)]
# Finding number of times word 1 occurred as word 1 of bi-grams
n1wi <- bi_words[, .(N = .N), by = word_1]
setkey(n1wi, word_1)
# Assigning total times word 1 occured to bigram cn1
bi_words[, Cn1 := uni_words[word_1, count]]
# Kneser Kney Algorithm
bi_words[, Prob := ((count - discount_value) / Cn1 + discount_value / Cn1 * 
n1wi[word_1, N] * uni_words[word_2, Prob])]

######## End of Finding Bi-Gram Probability #################

######## Finding Tri-Gram Probability #################

# Finding count of word1-word2 combination in bigram 
tri_words[, Cn2 := bi_words[.(word_1, word_2), .N]]
n1w12 <- tri_words[, .N, by = .(word_1, word_2)]
setkey(n1w12, word_1, word_2)

# Kneser Kney Algorithm
tri_words[, Prob := ((count - discount_value) / Cn2 + discount_value / Cn2 * 
n1w12[.(word_1, word_2), .N] * bi_words[.(word_1, word_2), Prob])]

这里我得到了三元组的 Kneser 算法的以下错误:

 Error in `[.data.table`(tri_words, , `:=`(Prob, ((count - discount_value)/Cn2 +  : 
 Supplied 13867 items to be assigned to 3932 items of column 'Prob'. If you wish to 'recycle' 
 the RHS please use rep() to make this intent clear to readers of your code.
 In addition: Warning messages:
 1: In discount_value/Cn2 * n1w12[list(word_1, word_2), .N] * bi_words[list(word_1,  :
 longer object length is not a multiple of shorter object length
 2: In (count - discount_value)/Cn2 + discount_value/Cn2 * n1w12[list(word_1,  :
 longer object length is not a multiple of shorter object length

我可以找到一些与数据 table 错误相关的类似问题,但我不明白我应该如何解决代码中的这个错误。

问题在于您试图乘以最后一行中的数量。这个表达式:

(count - discount_value) / Cn2 + discount_value / Cn2

是长度 20,就像 tri_words。但是下一个表达式

n1w12[.(word_1, word_2), .N]

是长度19。那么最后一部分,

bi_words[.(word_1, word_2), Prob])

的长度为 155(并且包含很多 NA)。

错误消息说较短的项目不能回收到较长的项目中,因为较长的项目的长度不是较短的项目长度的倍数。要解决此问题,您需要更仔细地实施此算法。