使用 R 从文本字符串中提取词向量数组的更快解决方案

Faster solution for extracting array of word-vectors from text string with R

我正在遍历文本,以便根据多个词典查找特定单词并统计其出现次数。我使用了两个嵌套的 FOR 循环,速度非常慢,需要几天才能完成。以下是可重现的代码:

library(stringr)

# Sample data: three tweets and a three-word dictionary.
# stringsAsFactors = FALSE keeps `text` and `word` as character vectors on
# R < 4.0 too, so they can be handed straight to the string functions.
tweets <- data.frame(
  id = c(1, 2, 3),
  text = c("This is a tweet that contains word1",
           "And here you can find word1 and word2 word2",
           "And here is only one word3 and one word3a"),
  stringsAsFactors = FALSE
)

words <- data.frame(
  id = c(1, 2, 3),
  word = c("word1", "word2", "word3"),
  stringsAsFactors = FALSE
)

# Baseline approach: test every dictionary word against every tweet.
# seq_len() keeps either loop from running when its table has zero rows
# (1:0 would iterate over c(1, 0)).
for (i in seq_len(nrow(tweets))) {
  for (j in seq_len(nrow(words))) {
    # Whole-word pattern. The backslash has to be doubled inside an R string
    # literal ("\<" is not a valid escape and does not parse); "\\b" is the
    # regex word-boundary anchor, so "word3" does not match inside "word3a".
    term <- paste("\\b", words[j, 2], "\\b", sep = "")
    # str_count() is evaluated up to three times per pair below: once for
    # the test, once for the row, once for the debug message.
    if (str_count(tweets[i, 2], term) != 0) {
      # One result row per (tweet, word) pair that occurs at least once.
      tmp <- data.frame(id = tweets[i, 1], termfound = words[j, 2],
                        count = str_count(tweets[i, 2], term),
                        row.names = NULL)
      message("ID ",tweets[i,1]," - Word '",words[j,2],"' found ",str_count(tweets[i,2], term)," times")
      #sqlSave(myconn, tmp, "DataTable", append=T, rownames=F)
    }
  }
}

备注:
我有大约 100 万行文本和大约 25,000 个单词。
消息行仅用于调试。
最终值写入 SQL - 行被注释掉,因为它不可重现。

有什么改进方法吗?我在想一个 APPLY 函数???

干杯 B

更新: 您也可以在 data.table 中尝试 stringi

library(data.table)
library(stringi)

## convert tweets to a data.table and set the key on the 'id' column
dtweets <- as.data.table(tweets)
setkey(dtweets, id)

## convert words to a data.table and build one whole-word regex per word;
## the backslash is doubled because "\b" in an R string literal is a
## backspace character, not the regex word-boundary anchor
dtw <- as.data.table(words)
dtw[, term := stri_c("\\b", word, "\\b")]

## run stri_count_regex by each id: within a group `text` is that id's tweet
## text and dtw$term supplies every pattern, so each id yields one count per
## dictionary word, in dictionary order
dtn <- dtweets[, stri_count_regex(text, dtw$term), by = key(dtweets)]
#    id V1
# 1:  1  1
# 2:  1  0
# 3:  1  0
# 4:  2  1
# 5:  2  2
# 6:  2  0
# 7:  3  0
# 8:  3  0
# 9:  3  1 

## melt to long format (id, variable, value); argument names are spelled out
## instead of relying on partial matching
melted <- melt(dtn, id.vars = 1L, measure.vars = 2L)
## cast wide with one column per distinct count value; each cell is the sum
## of the counts equal to that value for the given id
dcast(melted, id ~ value, fun.aggregate = sum, value.var = "value")
#   id 0 1 2
# 1  1 0 1 0
# 2  2 0 1 2
# 3  3 0 1 0 

原回答

这是另一种方法,它先只做逻辑匹配,然后根据这些匹配结果计算最终结果。在 term 中我不得不使用 \b 作为单词边界(在 R 字符串中需写成 "\\b",因为 "\b" 本身表示退格符)。

library(stringi)

# One whole-word regex per dictionary word. The backslash is doubled because
# "\b" inside an R string literal is a backspace character, not the regex
# word-boundary anchor.
term <- stri_c("\\b", words$word, "\\b")

# For each tweet: a logical match against every term first, then an actual
# count only for the terms that matched (a match flag alone cannot say how
# many times a word occurs). Each call returns length(term) integers.
out <- vapply(seq_along(tweets$text), function(i) {
  hit <- which(stri_detect_regex(tweets$text[i], term))
  n <- integer(length(term))
  n[hit] <- stri_count_regex(tweets$text[i], term[hit])
  n
}, integer(length(term)))

# vapply() lays the results out as words (rows) by tweets (columns) and
# collapses to a plain vector for a one-word dictionary; restore the matrix
# shape and transpose so that rows are tweets and columns are words.
out <- t(matrix(out, nrow = length(term), ncol = nrow(tweets)))

cbind(tweets[1], `colnames<-`(out, words$word))
#   id word1 word2 word3
# 1  1     1     0     0
# 2  2     1     2     0
# 3  3     0     0     1

观察:您的代码对每个单词进行了三次计数。一次在 IF 语句中,一次在 tmp 赋值中,一次在调试消息中。减少对字符串计数函数的调用次数肯定会提高您的代码效率。

如上所述,stringi 包提供了一组更快的字符串函数。

以下矢量化代码将生成一个二维矩阵,其中包含您想要的结果,然后可以将其转换为您的数据库所需的格式。

# library() stops with an error if stringi is missing; require() would only
# return FALSE and let the script fail later.
library(stringi)

tweets <- data.frame(
  id = c(1, 2, 3),
  text = c("This is a tweet that contains word1",
           "And here you can find word1 and word2 word2",
           "And here is only one word3 and one word3a"),
  stringsAsFactors = FALSE
)
words <- data.frame(
  id = c(1, 2, 3),
  word = c("word1", "word2", "word3"),
  stringsAsFactors = FALSE
)

# One whole-word regex per dictionary word. The backslash is doubled because
# "\b" inside an R string literal is a backspace character, not the regex
# word-boundary anchor.
pat <- paste0("\\b", words$word, "\\b")

# Counts every pattern in one tweet and returns one integer per word.
# Named count_words rather than sd so that it does not mask stats::sd.
count_words <- function(text) {
  stri_count(text, regex = pat)
}

# vapply() fixes the result type and length; it returns words (rows) by
# tweets (columns) and collapses to a plain vector for a one-word
# dictionary, so rebuild the matrix and transpose it to tweets by words
# before labelling.
results <- vapply(tweets$text, count_words, integer(length(pat)),
                  USE.NAMES = FALSE)
results <- t(matrix(results, nrow = length(pat), ncol = nrow(tweets)))
colnames(results) <- words$word
rownames(results) <- paste("ID", tweets$id)
results

# Produces the following output:

##      word1 word2 word3
## ID 1     1     0     0
## ID 2     1     2     0
## ID 3     0     0     1

我对这个问题的想法与赛跑者爸爸几乎相同:

# Create a whole-word regex for every word. The backslash is doubled because
# a single backslash starts an escape sequence in an R string literal ("\<"
# does not parse); "\\b" is the regex word-boundary anchor.
term <- paste0("\\b", words$word, "\\b")
# [1] "\\bword1\\b" "\\bword2\\b" "\\bword3\\b"

# Count the occurrences of every word in every tweet. The result has one row
# per word and one column per tweet. USE.NAMES = FALSE stops the tweet texts
# from becoming column names, so melt() below returns integer indices;
# matrix() restores the shape when the dictionary holds a single word.
m <- vapply(tweets$text, function(tweet) str_count(tweet, term),
            integer(length(term)), USE.NAMES = FALSE)
m <- matrix(m, nrow = length(term), ncol = nrow(tweets))
#      [,1] [,2] [,3]
# [1,]    1    1    0
# [2,]    0    2    0
# [3,]    0    0    1



library(reshape)
# Convert the matrix to long format: X1 is the row (word) index and X2 the
# column (tweet) index. The namespace is given explicitly because other
# attached packages may also export a melt().
df <- reshape::melt(m)
#   X1 X2 value
# 1  1  1     1
# 2  2  1     0
# 3  3  1     0
# 4  1  2     1
# 5  2  2     2
# 6  3  2     0
# 7  1  3     0
# 8  2  3     0
# 9  3  3     1

# X1 indexes words and X2 indexes tweets, in that order.
colnames(df) <- c('id.word', 'id.tweet', 'count')

# Create a data frame similar to the one in the example, translating both
# positional indices back to the real tweet id and the word itself.
tmp <- with(df, data.frame(id = tweets$id[id.tweet],
                           termfound = words$word[id.word],
                           count = count))
#   id termfound count
# 1  1     word1     1
# 2  1     word2     0
# 3  1     word3     0
# 4  2     word1     1
# 5  2     word2     2
# 6  2     word3     0
# 7  3     word1     0
# 8  3     word2     0
# 9  3     word3     1