使用 R 从文本字符串中提取词向量数组的更快解决方案
Faster solution for extracting array of word-vectors from text string with R
我正在遍历文本以从各种词典中查找和计算特定单词。我使用了两个非常慢并且需要几天才能完成的 FOR 循环。以下可重现代码:
library(stringr)

# Sample data: three tweets and a three-word dictionary.
tweets <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "This is a tweet that contains word1",
    "And here you can find word1 and word2 word2",
    "And here is only one word3 and one word3a"
  ),
  stringsAsFactors = FALSE
)
words <- data.frame(
  id = c(1, 2, 3),
  word = c("word1", "word2", "word3"),
  stringsAsFactors = FALSE
)

# Slow baseline: test every (tweet, word) pair one at a time.
# NOTE: str_count() is deliberately still evaluated three times per pair; this
# is the behaviour the question asks how to speed up.
for (i in seq_len(nrow(tweets))) {
  for (j in seq_len(nrow(words))) {
    # Backslashes must be doubled inside an R string literal ("\<" does not
    # even parse). "\\b" is the regex word-boundary anchor, so "word3" does
    # not match inside "word3a".
    term <- paste0("\\b", words[j, 2], "\\b")
    if (str_count(tweets[i, 2], term) != 0) {
      # One-row result for this (tweet, word) pair.
      tmp <- data.frame(
        id = tweets[i, 1],
        termfound = words[j, 2],
        count = str_count(tweets[i, 2], term),
        row.names = NULL
      )
      message(
        "ID ", tweets[i, 1], " - Word '", words[j, 2], "' found ",
        str_count(tweets[i, 2], term), " times"
      )
      # sqlSave(myconn, tmp, "DataTable", append = TRUE, rownames = FALSE)
    }
  }
}
备注:
我有大约 100 万行文本和大约 25,000 个单词。
消息行仅用于调试。
最终值写入 SQL - 行被注释掉,因为它不可重现。
有什么改进方法吗?我在想一个 APPLY 函数???
干杯
B
更新：您也可以尝试将 stringi 与 data.table 结合使用。
library(data.table)
library(stringi)

## Convert tweets to a data.table and set the key on the 'id' column.
dtweets <- as.data.table(tweets)
setkey(dtweets, id)

## Convert words to a data.table and build one regex per word.
## "\\b" (escaped backslash) is the regex word boundary; a bare "\b" in an R
## string literal is a backspace character and would never match.
dtw <- as.data.table(words)
dtw[, term := stri_c("\\b", word, "\\b")]

## For each tweet id, count the matches of every term. The patterns are passed
## as a vector, so each id yields one row per word, in the row order of dtw.
dtn <- dtweets[, stri_count_regex(text, dtw$term), by = key(dtweets)]
# id V1
# 1: 1 1
# 2: 1 0
# 3: 1 0
# 4: 2 1
# 5: 2 2
# 6: 2 0
# 7: 3 0
# 8: 3 0
# 9: 3 1
## Melt to long form, then cast so that each distinct count value is a column.
melted <- melt(dtn, id.vars = 1L, measure.vars = 2L)
dcast(melted, id ~ value, fun.aggregate = sum, value.var = "value")
# id 0 1 2
# 1 1 0 1 0
# 2 2 0 1 2
# 3 3 0 1 0
原回答
这是另一种方法，它只采用逻辑匹配，然后根据这些值计算结果。我不得不使用 \b 作为 term 中的单词边界。
library(stringi)

# One word-boundary regex per dictionary word. The backslash has to be escaped
# ("\\b"); "\b" in an R string literal is a backspace, not a regex anchor.
term <- stri_c("\\b", words$word, "\\b")

# Count every term in every tweet. stri_count_regex() is vectorised over the
# patterns, so each tweet yields one integer per word. The vapply() template is
# sized by the number of terms (not tweets). Real counts are used here because
# cumsum() over logical detections only numbers the matched words; it does not
# count occurrences.
out <- vapply(seq_along(tweets$text), function(i) {
  stri_count_regex(tweets$text[i], term)
}, integer(length(term)))

# vapply() fills word-by-word within each tweet; reshape to tweets x words so
# each row lines up with its tweet id (also correct for a single term).
out <- matrix(out, ncol = length(term), byrow = TRUE)
cbind(tweets[1], `colnames<-`(out, words$word))
# id word1 word2 word3
# 1 1 1 0 0
# 2 2 1 2 0
# 3 3 0 0 1
观察:您的代码对每个单词进行了三次计数。一次在 IF 语句中,一次在 tmp 赋值中,一次在调试消息中。减少对字符串计数函数的调用次数肯定会提高您的代码效率。
如上所述,stringi 包提供了一组更快的字符串函数。
以下矢量化代码将生成一个二维矩阵，其中包含您想要的结果，然后可以将其转换为您的数据库所需的格式。
library(stringi)

# Sample data; text columns are kept as character vectors.
tweets <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "This is a tweet that contains word1",
    "And here you can find word1 and word2 word2",
    "And here is only one word3 and one word3a"
  ),
  stringsAsFactors = FALSE
)
words <- data.frame(
  id = c(1, 2, 3),
  word = c("word1", "word2", "word3"),
  stringsAsFactors = FALSE
)

# One word-boundary regex per word. The backslash must be escaped ("\\b");
# "\b" in an R string literal is a backspace and never matches.
pat <- paste0("\\b", words$word, "\\b")

# Count all patterns in each tweet. The counter is an anonymous function so
# that no global named `sd` masks stats::sd(). vapply() pins the result to one
# integer per pattern.
results <- vapply(
  tweets$text,
  function(text) stri_count(text, regex = pat),
  integer(length(pat)),
  USE.NAMES = FALSE
)

# vapply() fills word-by-word within each tweet; reshape to tweets x words so
# the row and column labels below describe the right dimension.
results <- matrix(results, ncol = length(pat), byrow = TRUE)
colnames(results) <- words$word
rownames(results) <- paste("ID", tweets$id)
results
产生以下输出:
## word1 word2 word3
## ID 1 1 0 0
## ID 2 1 2 0
## ID 3 0 0 1
我对这个问题的想法与赛跑者爸爸几乎相同:
# One word-boundary regex per word. Backslashes are doubled because "\<" is
# not a valid escape in an R string literal; "\\b" is the word-boundary anchor.
term <- paste0("\\b", words$word, "\\b")
# [1] "\\bword1\\b" "\\bword2\\b" "\\bword3\\b"

# Count every word in every tweet: rows are words, columns are tweets.
# USE.NAMES = FALSE keeps the tweet texts from becoming column names, so that
# melt() below yields plain integer indices.
m <- vapply(
  as.character(tweets$text),
  function(tweet) str_count(tweet, term),
  integer(length(term)),
  USE.NAMES = FALSE
)
m <- matrix(m, nrow = length(term)) # stay a matrix even with a single word
# [,1] [,2] [,3]
# [1,] 1 1 0
# [2,] 0 2 0
# [3,] 0 0 1
library(reshape)
counts_long <- melt(m) # long format: X1 = row (word), X2 = column (tweet)
# X1 X2 value
# 1 1 1 1
# 2 2 1 0
# 3 3 1 0
# 4 1 2 1
# 5 2 2 2
# 6 3 2 0
# 7 1 3 0
# 8 2 3 0
# 9 3 3 1
# X1 indexes the words and X2 the tweets, so label them in that order.
colnames(counts_long) <- c("id.word", "id.tweet", "count")
# Build a data frame shaped like the one in the question, mapping the indices
# back to the real tweet ids and dictionary words.
tmp <- with(
  counts_long,
  data.frame(
    id = tweets$id[id.tweet],
    termfound = words$word[id.word],
    count = count
  )
)
# id termfound count
# 1 1 word1 1
# 2 1 word2 0
# 3 1 word3 0
# 4 2 word1 1
# 5 2 word2 2
# 6 2 word3 0
# 7 3 word1 0
# 8 3 word2 0
# 9 3 word3 1
我正在遍历文本以从各种词典中查找和计算特定单词。我使用了两个非常慢并且需要几天才能完成的 FOR 循环。以下可重现代码:
library(stringr)

# Sample data: three tweets and a three-word dictionary.
tweets <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "This is a tweet that contains word1",
    "And here you can find word1 and word2 word2",
    "And here is only one word3 and one word3a"
  ),
  stringsAsFactors = FALSE
)
words <- data.frame(
  id = c(1, 2, 3),
  word = c("word1", "word2", "word3"),
  stringsAsFactors = FALSE
)

# Slow baseline: test every (tweet, word) pair one at a time.
# NOTE: str_count() is deliberately still evaluated three times per pair; this
# is the behaviour the question asks how to speed up.
for (i in seq_len(nrow(tweets))) {
  for (j in seq_len(nrow(words))) {
    # Backslashes must be doubled inside an R string literal ("\<" does not
    # even parse). "\\b" is the regex word-boundary anchor, so "word3" does
    # not match inside "word3a".
    term <- paste0("\\b", words[j, 2], "\\b")
    if (str_count(tweets[i, 2], term) != 0) {
      # One-row result for this (tweet, word) pair.
      tmp <- data.frame(
        id = tweets[i, 1],
        termfound = words[j, 2],
        count = str_count(tweets[i, 2], term),
        row.names = NULL
      )
      message(
        "ID ", tweets[i, 1], " - Word '", words[j, 2], "' found ",
        str_count(tweets[i, 2], term), " times"
      )
      # sqlSave(myconn, tmp, "DataTable", append = TRUE, rownames = FALSE)
    }
  }
}
备注:
我有大约 100 万行文本和大约 25,000 个单词。
消息行仅用于调试。
最终值写入 SQL - 行被注释掉,因为它不可重现。
有什么改进方法吗?我在想一个 APPLY 函数???
干杯 B
更新：您也可以尝试将 stringi 与 data.table 结合使用。
library(data.table)
library(stringi)

## Convert tweets to a data.table and set the key on the 'id' column.
dtweets <- as.data.table(tweets)
setkey(dtweets, id)

## Convert words to a data.table and build one regex per word.
## "\\b" (escaped backslash) is the regex word boundary; a bare "\b" in an R
## string literal is a backspace character and would never match.
dtw <- as.data.table(words)
dtw[, term := stri_c("\\b", word, "\\b")]

## For each tweet id, count the matches of every term. The patterns are passed
## as a vector, so each id yields one row per word, in the row order of dtw.
dtn <- dtweets[, stri_count_regex(text, dtw$term), by = key(dtweets)]
# id V1
# 1: 1 1
# 2: 1 0
# 3: 1 0
# 4: 2 1
# 5: 2 2
# 6: 2 0
# 7: 3 0
# 8: 3 0
# 9: 3 1
## Melt to long form, then cast so that each distinct count value is a column.
melted <- melt(dtn, id.vars = 1L, measure.vars = 2L)
dcast(melted, id ~ value, fun.aggregate = sum, value.var = "value")
# id 0 1 2
# 1 1 0 1 0
# 2 2 0 1 2
# 3 3 0 1 0
原回答
这是另一种方法，它只采用逻辑匹配，然后根据这些值计算结果。我不得不使用 \b 作为 term 中的单词边界。
library(stringi)

# One word-boundary regex per dictionary word. The backslash has to be escaped
# ("\\b"); "\b" in an R string literal is a backspace, not a regex anchor.
term <- stri_c("\\b", words$word, "\\b")

# Count every term in every tweet. stri_count_regex() is vectorised over the
# patterns, so each tweet yields one integer per word. The vapply() template is
# sized by the number of terms (not tweets). Real counts are used here because
# cumsum() over logical detections only numbers the matched words; it does not
# count occurrences.
out <- vapply(seq_along(tweets$text), function(i) {
  stri_count_regex(tweets$text[i], term)
}, integer(length(term)))

# vapply() fills word-by-word within each tweet; reshape to tweets x words so
# each row lines up with its tweet id (also correct for a single term).
out <- matrix(out, ncol = length(term), byrow = TRUE)
cbind(tweets[1], `colnames<-`(out, words$word))
# id word1 word2 word3
# 1 1 1 0 0
# 2 2 1 2 0
# 3 3 0 0 1
观察:您的代码对每个单词进行了三次计数。一次在 IF 语句中,一次在 tmp 赋值中,一次在调试消息中。减少对字符串计数函数的调用次数肯定会提高您的代码效率。
如上所述,stringi 包提供了一组更快的字符串函数。
以下矢量化代码将生成一个二维矩阵，其中包含您想要的结果，然后可以将其转换为您的数据库所需的格式。
library(stringi)

# Sample data; text columns are kept as character vectors.
tweets <- data.frame(
  id = c(1, 2, 3),
  text = c(
    "This is a tweet that contains word1",
    "And here you can find word1 and word2 word2",
    "And here is only one word3 and one word3a"
  ),
  stringsAsFactors = FALSE
)
words <- data.frame(
  id = c(1, 2, 3),
  word = c("word1", "word2", "word3"),
  stringsAsFactors = FALSE
)

# One word-boundary regex per word. The backslash must be escaped ("\\b");
# "\b" in an R string literal is a backspace and never matches.
pat <- paste0("\\b", words$word, "\\b")

# Count all patterns in each tweet. The counter is an anonymous function so
# that no global named `sd` masks stats::sd(). vapply() pins the result to one
# integer per pattern.
results <- vapply(
  tweets$text,
  function(text) stri_count(text, regex = pat),
  integer(length(pat)),
  USE.NAMES = FALSE
)

# vapply() fills word-by-word within each tweet; reshape to tweets x words so
# the row and column labels below describe the right dimension.
results <- matrix(results, ncol = length(pat), byrow = TRUE)
colnames(results) <- words$word
rownames(results) <- paste("ID", tweets$id)
results
产生以下输出:
## word1 word2 word3
## ID 1 1 0 0
## ID 2 1 2 0
## ID 3 0 0 1
我对这个问题的想法与赛跑者爸爸几乎相同:
# One word-boundary regex per word. Backslashes are doubled because "\<" is
# not a valid escape in an R string literal; "\\b" is the word-boundary anchor.
term <- paste0("\\b", words$word, "\\b")
# [1] "\\bword1\\b" "\\bword2\\b" "\\bword3\\b"

# Count every word in every tweet: rows are words, columns are tweets.
# USE.NAMES = FALSE keeps the tweet texts from becoming column names, so that
# melt() below yields plain integer indices.
m <- vapply(
  as.character(tweets$text),
  function(tweet) str_count(tweet, term),
  integer(length(term)),
  USE.NAMES = FALSE
)
m <- matrix(m, nrow = length(term)) # stay a matrix even with a single word
# [,1] [,2] [,3]
# [1,] 1 1 0
# [2,] 0 2 0
# [3,] 0 0 1
library(reshape)
counts_long <- melt(m) # long format: X1 = row (word), X2 = column (tweet)
# X1 X2 value
# 1 1 1 1
# 2 2 1 0
# 3 3 1 0
# 4 1 2 1
# 5 2 2 2
# 6 3 2 0
# 7 1 3 0
# 8 2 3 0
# 9 3 3 1
# X1 indexes the words and X2 the tweets, so label them in that order.
colnames(counts_long) <- c("id.word", "id.tweet", "count")
# Build a data frame shaped like the one in the question, mapping the indices
# back to the real tweet ids and dictionary words.
tmp <- with(
  counts_long,
  data.frame(
    id = tweets$id[id.tweet],
    termfound = words$word[id.word],
    count = count
  )
)
# id termfound count
# 1 1 word1 1
# 2 1 word2 0
# 3 1 word3 0
# 4 2 word1 1
# 5 2 word2 2
# 6 2 word3 0
# 7 3 word1 0
# 8 3 word2 0
# 9 3 word3 1