保留数据频率的所有文本短语
Keep all the text phrases for data frequency
我有一个数据框,其中只有一列 "text":
"text"
"User Interfaces"
"Twitter"
"Text Normalization"
"Term weighting"
"Teenagers"
"Team member replacement"
我想获取一个包含每个短语频率的数据框,如下所示:
"User Interfaces",1
"Twitter",1
"Text Normalization",1
"Term weighting",1
"Teenagers",1
"Team member replacement",1
为了制作它,我使用了这个:
# Build a term-frequency vector from the "text" column of myphr.csv with the
# tm package and keep its first 30 entries after sorting by frequency.
library(tm)
# Read the comma-separated file; the first row is used as the header.
df <- read.csv("C:/Users/acel/Desktop/myphr.csv", header=TRUE, sep=",")
# Wrap the text column in a corpus.
corpusD <- Corpus(VectorSource(df$text))
# Normalisation steps: lower-case, remove English stop words, remove digits,
# collapse repeated whitespace.
# NOTE(review): tolower is passed directly rather than through
# content_transformer(); the PlainTextDocument step and the corpus rebuild
# further down presumably compensate for that -- confirm against the
# installed tm version.
corpusD <- tm_map(corpusD, tolower)
corpusD <- tm_map(corpusD, removeWords, stopwords('english'))
corpusD <- tm_map(corpusD, removeNumbers)
corpusD <- tm_map(corpusD, stripWhitespace)
corpusD <- tm_map(corpusD, PlainTextDocument)
# Stem the documents with the English stemmer.
corpusD <- tm_map(corpusD, stemDocument, language = "english")
# Rebuild a corpus from the transformed one.
corpusC <- Corpus(VectorSource(corpusD))
# NOTE(review): the matrix is indexed by individual terms, which is
# presumably why a phrase such as "User Interfaces" shows up as separate
# entries in the result -- verify against the TermDocumentMatrix docs.
matrixD <- TermDocumentMatrix(corpusC)
# Remove sparse terms using a sparsity threshold of 0.75.
matrixD <- removeSparseTerms(matrixD, 0.75)
# Sum each row of the dense matrix, then order the totals from largest to
# smallest.
MatrixDfreq <- rowSums(as.matrix(matrixD))
MatrixDfreq<-sort(MatrixDfreq, decreasing = TRUE)
# Take positions 1 to 30; positions past the end of the vector come back as
# NA when fewer than 30 entries remain.
MatrixDtop30<- MatrixDfreq [1:30]
但是当我检查 MatrixDtop30 的结果时,我看到短语被拆分成单个单词分别计数,例如 user,1 和 interface,1,而不是 "user interface",1。
知道为什么会这样吗?
我认为使用 data.table 操作会容易得多。
library(data.table)
# Toy input: a single "text" column in which some values repeat.
df <- data.frame(
  text = c("test", "test", "test", "test2", "test3", "test2")
)
> df
text
1 test
2 test
3 test
4 test2
5 test3
6 test2
# Turn df into a data.table in place, then count the rows in each text group;
# the count column is named Number.
setDT(df)
df <- df[, .(Number = .N), by = "text"]
> df
text Number
1: test 3
2: test2 2
3: test3 1
编辑:
如果还需要进行词干提取,可以使用下面的代码:
library(data.table)
library(SnowballC)
# Toy input: like the earlier example, but with one inflected form ("testing").
df <- data.frame(
  text = c("test", "testing", "test", "test2", "test3", "test2")
)
> df
text
1 test
2 testing
3 test
4 test2
5 test3
6 test2
# Overwrite the text column with the Porter stem of each value, so that
# "testing" is counted together with "test".
df[["text"]] <- wordStem(df[["text"]], language = "porter")
> df
text
1 test
2 test
3 test
4 test2
5 test3
6 test2
# Turn df into a data.table in place, then count the rows in each text group;
# the count column is named Number.
setDT(df)
df <- df[, .(Number = .N), by = "text"]
> df
text Number
1: test 3
2: test2 2
3: test3 1
在示例输出中,您似乎没有对文本执行任何转换(例如转换为小写或删除停用词),只是保持短语原样?如果是这样,您可以使用 tidyverse 轻松统计每个唯一短语的出现次数:
library(dplyr)
library(readr)
# Build a one-column tibble holding the six example phrases. tibble() is
# re-exported by dplyr and replaces the deprecated data_frame(); the original
# call was also missing its closing parenthesis.
df <- tibble(
  text = c(
    "User Interfaces", "Twitter", "Text Normalization",
    "Term weighting", "Teenagers", "Team member replacement"
  )
)
# One row per distinct phrase; column n holds how many times it occurs.
count(df, text)
text n
<chr> <int>
1 Team member replacement 1
2 Teenagers 1
3 Term weighting 1
4 text 1
5 Text Normalization 1
6 Twitter 1
7 User Interfaces 1
或
# Load the phrases from the CSV file, then tabulate the text column with the
# most frequent values listed first.
text_df <- read_csv(file = "C:/Users/acel/Desktop/myphr.csv")
text_df %>% count(text, sort = TRUE)
如果您需要对文本执行转换,请查看 stringr 和 tidytext 包。
我有一个数据框,其中只有一列 "text":
"text"
"User Interfaces"
"Twitter"
"Text Normalization"
"Term weighting"
"Teenagers"
"Team member replacement"
我想获取一个包含每个短语频率的数据框,如下所示:
"User Interfaces",1
"Twitter",1
"Text Normalization",1
"Term weighting",1
"Teenagers",1
"Team member replacement",1
为了制作它,我使用了这个:
# Build a term-frequency vector from the "text" column of myphr.csv with the
# tm package and keep its first 30 entries after sorting by frequency.
library(tm)
# Read the comma-separated file; the first row is used as the header.
df <- read.csv("C:/Users/acel/Desktop/myphr.csv", header=TRUE, sep=",")
# Wrap the text column in a corpus.
corpusD <- Corpus(VectorSource(df$text))
# Normalisation steps: lower-case, remove English stop words, remove digits,
# collapse repeated whitespace.
# NOTE(review): tolower is passed directly rather than through
# content_transformer(); the PlainTextDocument step and the corpus rebuild
# further down presumably compensate for that -- confirm against the
# installed tm version.
corpusD <- tm_map(corpusD, tolower)
corpusD <- tm_map(corpusD, removeWords, stopwords('english'))
corpusD <- tm_map(corpusD, removeNumbers)
corpusD <- tm_map(corpusD, stripWhitespace)
corpusD <- tm_map(corpusD, PlainTextDocument)
# Stem the documents with the English stemmer.
corpusD <- tm_map(corpusD, stemDocument, language = "english")
# Rebuild a corpus from the transformed one.
corpusC <- Corpus(VectorSource(corpusD))
# NOTE(review): the matrix is indexed by individual terms, which is
# presumably why a phrase such as "User Interfaces" shows up as separate
# entries in the result -- verify against the TermDocumentMatrix docs.
matrixD <- TermDocumentMatrix(corpusC)
# Remove sparse terms using a sparsity threshold of 0.75.
matrixD <- removeSparseTerms(matrixD, 0.75)
# Sum each row of the dense matrix, then order the totals from largest to
# smallest.
MatrixDfreq <- rowSums(as.matrix(matrixD))
MatrixDfreq<-sort(MatrixDfreq, decreasing = TRUE)
# Take positions 1 to 30; positions past the end of the vector come back as
# NA when fewer than 30 entries remain.
MatrixDtop30<- MatrixDfreq [1:30]
但是当我检查 MatrixDtop30 的结果时,我看到短语被拆分成单个单词分别计数,例如 user,1 和 interface,1,而不是 "user interface",1。
知道为什么会这样吗?
我认为使用 data.table 操作会容易得多。
library(data.table)
# Toy input: a single "text" column in which some values repeat.
df <- data.frame(
  text = c("test", "test", "test", "test2", "test3", "test2")
)
> df
text
1 test
2 test
3 test
4 test2
5 test3
6 test2
# Turn df into a data.table in place, then count the rows in each text group;
# the count column is named Number.
setDT(df)
df <- df[, .(Number = .N), by = "text"]
> df
text Number
1: test 3
2: test2 2
3: test3 1
编辑:
如果还需要进行词干提取,可以使用下面的代码:
library(data.table)
library(SnowballC)
# Toy input: like the earlier example, but with one inflected form ("testing").
df <- data.frame(
  text = c("test", "testing", "test", "test2", "test3", "test2")
)
> df
text
1 test
2 testing
3 test
4 test2
5 test3
6 test2
# Overwrite the text column with the Porter stem of each value, so that
# "testing" is counted together with "test".
df[["text"]] <- wordStem(df[["text"]], language = "porter")
> df
text
1 test
2 test
3 test
4 test2
5 test3
6 test2
# Turn df into a data.table in place, then count the rows in each text group;
# the count column is named Number.
setDT(df)
df <- df[, .(Number = .N), by = "text"]
> df
text Number
1: test 3
2: test2 2
3: test3 1
在示例输出中,您似乎没有对文本执行任何转换(例如转换为小写或删除停用词),只是保持短语原样?如果是这样,您可以使用 tidyverse 轻松统计每个唯一短语的出现次数:
library(dplyr)
library(readr)
# Build a one-column tibble holding the six example phrases. tibble() is
# re-exported by dplyr and replaces the deprecated data_frame(); the original
# call was also missing its closing parenthesis.
df <- tibble(
  text = c(
    "User Interfaces", "Twitter", "Text Normalization",
    "Term weighting", "Teenagers", "Team member replacement"
  )
)
# One row per distinct phrase; column n holds how many times it occurs.
count(df, text)
text n
<chr> <int>
1 Team member replacement 1
2 Teenagers 1
3 Term weighting 1
4 text 1
5 Text Normalization 1
6 Twitter 1
7 User Interfaces 1
或
# Load the phrases from the CSV file, then tabulate the text column with the
# most frequent values listed first.
text_df <- read_csv(file = "C:/Users/acel/Desktop/myphr.csv")
text_df %>% count(text, sort = TRUE)
如果您需要对文本执行转换,请查看 stringr 和 tidytext 包。