Compare the bags of words in two documents and find the matching words and their frequencies in the second document

I have computed the bags of words for 'yelp.csv', 'yelpp.csv' and 'yelpn.csv' and created a word-frequency matrix for each individual data set. Now I want to compare the bag of words of yelp with yelpn, check how many words from yelp appear in yelpn along with their frequencies, and store the result in a variable as a matrix; then do the same with yelpp. yelp contains both positive and negative reviews, yelpp only positive ones, and yelpn only negative ones. Can anyone complete the code? I don't know whether the code below is relevant, but I hope it is.

library(tm)    # removeNumbers, removePunctuation, bundled stopword list
library(tau)   # remove_stopwords
library(qdap)  # bag_o_words, freq_terms

getwd()
setwd("/Users/ash/RProjects/exc")
getwd()

# yelp.csv: full data set (positive and negative reviews)
df <- read.csv("yelp.CSV", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
               strip.white = TRUE)
df
dfd<-as.character(df[,2])
dfd
df2<-as.character(df[,1])
df2
words <- readLines(system.file("stopwords", "english.dat",
                           package = "tm"))
s<-remove_stopwords(dfd, words, lines = TRUE)
s
print(paste("****Stopwords are removed successfully****"))
n<-removeNumbers(s)
n
t<-removePunctuation(n, preserve_intra_word_dashes = FALSE)
t

# yelpp.csv: positive reviews only
dfp <- read.csv("yelpp.CSV",header = TRUE,quote="\"",stringsAsFactors= TRUE,
           strip.white = TRUE)
dfp
dfdp<-as.character(dfp[,2])
dfdp
df2p<-as.character(dfp[,1])
df2p
wordsp <- readLines(system.file("stopwords", "english.dat",
                           package = "tm"))
sp<-remove_stopwords(dfdp, wordsp, lines = TRUE)
sp
print(paste("****Stopwords are removed successfully****"))
np<-removeNumbers(sp)
np
tp<-removePunctuation(np, preserve_intra_word_dashes = FALSE)
tp

# yelpn.csv: negative reviews only
dfn <- read.csv("yelpn.CSV",header = TRUE,quote="\"",stringsAsFactors=   TRUE,
           strip.white = TRUE)
dfn
dfdn<-as.character(dfn[,2])
dfdn
df2n<-as.character(dfn[,1])
df2n
wordsn <- readLines(system.file("stopwords", "english.dat",
                           package = "tm"))
sn<-remove_stopwords(dfdn, wordsn, lines = TRUE)
sn
print(paste("****Stopwords are removed successfully****"))
nn<-removeNumbers(sn)
nn
tn<-removePunctuation(nn, preserve_intra_word_dashes = FALSE)
tn



# bag of words for each data set
b<-bag_o_words(t, apostrophe.remove = TRUE)
b
b.mat = as.matrix(b)
b.mat
bp<-bag_o_words(tp, apostrophe.remove = TRUE)
bp
bp.mat = as.matrix(bp)
bp.mat
bn<-bag_o_words(tn, apostrophe.remove = TRUE)
bn
bn.mat = as.matrix(bn)
bn.mat

#frequent terms
frequent_terms <- freq_terms(t, 2000)   # use the cleaned text, as for tp and tn below
frequent_terms
frequent_termsp <- freq_terms(tp, 2000)
frequent_termsp
frequent_termsn <- freq_terms(tn, 2000)
frequent_termsn
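
For what it's worth, here is one rough sketch of the comparison asked for above, building only on the objects already created. It assumes b, bp and bn are the word vectors returned by bag_o_words; this is just one possible way, not necessarily the intended approach:

# frequency of every word in the positive / negative bags
freq.p <- table(bp)
freq.n <- table(bn)
# keep only the words that also occur in the yelp bag, with their yelpp / yelpn frequency,
# and store the result as a matrix (one row per matching word)
match.p.mat <- as.matrix(freq.p[names(freq.p) %in% b])
match.n.mat <- as.matrix(freq.n[names(freq.n) %in% b])
match.p.mat
match.n.mat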

I am using text from the Wikipedia article on Text mining as the sample corpus. The gist of this approach is to use the tm package together with the findFreqTerms and agrep functions.

agrep

Searches for approximate matches to pattern (the first argument) within each element of the string x (the second argument) using the generalized Levenshtein edit distance (the minimal possibly weighted number of insertions, deletions and substitutions needed to transform one string into another).
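
A minimal illustration of what agrep returns (indices of the approximately matching elements; the first call is the example from the R documentation, the second uses made-up strings):

agrep("lasy", "1 lazy 2")                             # returns 1: "lasy" approximately matches within "1 lazy 2"
agrep("pattern", c("patterns", "trends", "mining"))   # returns 1: only "patterns" contains an (approximate) match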

Steps of the approach:

texts -> corpora -> data cleaning -> findFreqTerms -> compare with the other term-document matrix

library(tm)

c1 <- Corpus(VectorSource("Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning"))

c2 <- Corpus(VectorSource("Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output"))

c3 <- Corpus(VectorSource("Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)"))

# Data Cleaning and transformation
c1 <- tm_map(c1, content_transformer(tolower))
c2 <- tm_map(c2, content_transformer(tolower))
c3 <- tm_map(c3, content_transformer(tolower))

c1 <- tm_map(c1, removePunctuation)
c1 <- tm_map(c1, removeNumbers)
c1 <- tm_map(c1, removeWords, stopwords("english"))
c1 <- tm_map(c1, stripWhitespace)

c2 <- tm_map(c2, removePunctuation)
c2 <- tm_map(c2, removeNumbers)
c2 <- tm_map(c2, removeWords, stopwords("english"))
c2 <- tm_map(c2, stripWhitespace)

c3 <- tm_map(c3, removePunctuation)
c3 <- tm_map(c3, removeNumbers)
c3 <- tm_map(c3, removeWords, stopwords("english"))
c3 <- tm_map(c3, stripWhitespace)

dtm1 <- DocumentTermMatrix(c1, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm2 <- DocumentTermMatrix(c2, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm3 <- DocumentTermMatrix(c3, control = list(weighting = weightTfIdf, stopwords = TRUE))

ft1 <- findFreqTerms(dtm1)
ft2 <- findFreqTerms(dtm2)
ft3 <- findFreqTerms(dtm3)

#similarity between c1 and c2
common.c1c2 <- data.frame(term = character(0), freq = integer(0))
for(t in ft1){
  find <- agrep(t, ft2)
  if(length(find) != 0){
    common.c1c2 <- rbind(common.c1c2, data.frame(term = t, freq = length(find)))
  }
}
# Note: this for loop can be replaced by apply-family functions if it becomes slow on large texts
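
As noted in the comment above, the loop can be written without an explicit for; a possible vectorised equivalent (a sketch only, using the same ft1 and ft2 objects) is:

# count approximate matches of each ft1 term within ft2, then keep terms with at least one match
match.counts <- sapply(ft1, function(t) length(agrep(t, ft2)))
common.c1c2 <- data.frame(term = names(match.counts)[match.counts > 0],
                          freq = match.counts[match.counts > 0],
                          row.names = NULL)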

common.c1c2 contains the words common to corpus 1 and corpus 2, together with their frequencies:

> common.c1c2
      term freq
1     also    1
2     data    2
3  derived    1
4 deriving    1
5   mining    1
6  pattern    1
7 patterns    1
8  process    1
9     text    1

> ft1
 [1] "also"        "analytics"   "data"        "derived"     "deriving"    "devising"    "equivalent" 
 [8] "highquality" "information" "learning"    "means"       "mining"      "pattern"     "patterns"   
[15] "process"     "referred"    "roughly"     "statistical" "text"        "trends"      "typically"  

> ft2
 [1] "addition"       "along"          "data"           "database"       "derived"        "deriving"      
 [7] "evaluation"     "features"       "finally"        "input"          "insertion"      "interpretation"
[13] "involves"       "linguistic"     "mining"         "others"         "output"         "parsing"       
[19] "patterns"       "process"        "removal"        "structured"     "structuring"    "subsequent"    
[25] "text"           "usually"        "within"        

This is not the most efficient solution, but I hope it helps.