阻止 rm_stopwords 函数创建列表
Prevent the rm_stopwords function creating a list
我使用 qdap 包中的 rm_stopwords 函数,从数据框的文本列中删除停用词和标点符号。
library(qdap)
library(dplyr)
library(tm)
glimpse(dat_full)
Observations: 500
Variables: 9
$ reviewerID <chr> "ABF0ARHORHUUC", "AH4KMS2YC6TXA", "A2IXK5LB...
$ asin <chr> "B00BE6C9S0", "B009X78DKU", "B0077PM3KG", "...
$ reviewerName <chr> "stuartm \"stuartm\"", "HottMess", "G. Farn...
$ helpful <list> [<1, 2>, <0, 0>, <0, 0>, <0, 0>, <0, 0>, <...
$ reviewText <chr> "I've used the Mophie juice pack for my iPh...
$ overall <dbl> 3, 5, 5, 5, 5, 3, 3, 5, 5, 5, 5, 4, 5, 5, 3...
$ summary <chr> "Case issues limit utility of this device",...
$ unixReviewTime <int> 1375142400, 1355356800, 1383350400, 1367193...
$ reviewTime <chr> "07 30, 2013", "12 13, 2012", "11 2, 2013",...
# Strip English stop words and punctuation from the review text.
# As shown by the glimpse() output that follows, the column comes back as a
# list (one character vector of words per row) instead of a character vector.
full_dat$reviewText <- rm_stopwords(
  full_dat$reviewText,
  stopwords = tm::stopwords("english"),
  strip = TRUE
)
但该函数返回的是一个列表,因此 reviewText 列变成了列表列。
glimpse(full_dat)
Observations: 500
Variables: 9
$ reviewerID <chr> "ABF0ARHORHUUC", "AH4KMS2YC6TXA", "A2IXK5LB...
$ asin <chr> "B00BE6C9S0", "B009X78DKU", "B0077PM3KG", "...
$ reviewerName <chr> "stuartm \"stuartm\"", "HottMess", "G. Farn...
$ helpful <list> [<1, 2>, <0, 0>, <0, 0>, <0, 0>, <0, 0>, <...
$ reviewText <list> [<"used", "mophie", "juice", "pack", "ipho...
$ overall <dbl> 3, 5, 5, 5, 5, 3, 3, 5, 5, 5, 5, 4, 5, 5, 3...
$ summary <chr> "Case issues limit utility of this device",...
$ unixReviewTime <int> 1375142400, 1355356800, 1383350400, 1367193...
$ reviewTime <chr> "07 30, 2013", "12 13, 2012", "11 2, 2013",...
有什么办法可以避免这种情况(即保留原始的字符格式),或者对该列进行 unlist/unnest 以恢复原始格式吗?
结果应该与原始数据框中的一样,但没有停用词和标点符号。
这是一个小输出:
# One-row sample of the data as a data.frame literal.
# The double quotes inside the reviewText string are escaped so that the
# expression parses.
structure(list(reviewerID = "A3LWYDTO7928SH", asin = "B00B0FT2T4",
    reviewerName = "D. Lang", helpful = list(c(0L, 0L)), reviewText = "When I first put your glass protector on my phone I was blown away! (I knew how \"degrading\" the soft plastic covers were - ruining my experience, so I chose not to have a protector on my screen.) Then I saw your website and I wondered if it was as good as spoken about. The answer is YES. The application was flawless even after I pulled the glass back off because I had not put it on absolutely perfectly. It repositioned with ease and you could not find a bubble if you had a microscope! Fascinating to see the viscous material on the back spread out on its own! Application could not be easier and the quality of the product seems like it came from NASA.",
    overall = 5, summary = "It is as perfect as a product can get - Really!",
    unixReviewTime = 1396569600L, reviewTime = "04 4, 2014"), row.names = 145945L, class = "data.frame")
可以在 dplyr 管道中这样处理:对每一行的结果先 unlist,再用 paste 合并成一个字符串。
# Remove English stop words and punctuation from every review while keeping
# reviewText a plain character column (one string per row).
# Base vapply() is used for the per-row iteration so that no package beyond
# the ones already attached (qdap, dplyr, tm) is required.
full_dat <- dat_full %>%
  mutate(
    reviewText = vapply(
      reviewText,
      function(x) {
        # rm_stopwords() hands back a list of word vectors; flatten it and
        # glue the remaining words back together with single spaces.
        words <- unlist(
          qdap::rm_stopwords(x, tm::stopwords("english"), strip = TRUE)
        )
        paste0(words, collapse = " ")
      },
      FUN.VALUE = character(1),
      # With character input vapply() would otherwise use the original
      # texts as element names.
      USE.NAMES = FALSE
    )
  )
我使用 qdap 包中的 rm_stopwords 函数,从数据框的文本列中删除停用词和标点符号。
library(qdap)
library(dplyr)
library(tm)
glimpse(dat_full)
Observations: 500
Variables: 9
$ reviewerID <chr> "ABF0ARHORHUUC", "AH4KMS2YC6TXA", "A2IXK5LB...
$ asin <chr> "B00BE6C9S0", "B009X78DKU", "B0077PM3KG", "...
$ reviewerName <chr> "stuartm \"stuartm\"", "HottMess", "G. Farn...
$ helpful <list> [<1, 2>, <0, 0>, <0, 0>, <0, 0>, <0, 0>, <...
$ reviewText <chr> "I've used the Mophie juice pack for my iPh...
$ overall <dbl> 3, 5, 5, 5, 5, 3, 3, 5, 5, 5, 5, 4, 5, 5, 3...
$ summary <chr> "Case issues limit utility of this device",...
$ unixReviewTime <int> 1375142400, 1355356800, 1383350400, 1367193...
$ reviewTime <chr> "07 30, 2013", "12 13, 2012", "11 2, 2013",...
# Strip English stop words and punctuation from the review text.
# As shown by the glimpse() output that follows, the column comes back as a
# list (one character vector of words per row) instead of a character vector.
full_dat$reviewText <- rm_stopwords(
  full_dat$reviewText,
  stopwords = tm::stopwords("english"),
  strip = TRUE
)
但该函数返回的是一个列表,因此 reviewText 列变成了列表列。
glimpse(full_dat)
Observations: 500
Variables: 9
$ reviewerID <chr> "ABF0ARHORHUUC", "AH4KMS2YC6TXA", "A2IXK5LB...
$ asin <chr> "B00BE6C9S0", "B009X78DKU", "B0077PM3KG", "...
$ reviewerName <chr> "stuartm \"stuartm\"", "HottMess", "G. Farn...
$ helpful <list> [<1, 2>, <0, 0>, <0, 0>, <0, 0>, <0, 0>, <...
$ reviewText <list> [<"used", "mophie", "juice", "pack", "ipho...
$ overall <dbl> 3, 5, 5, 5, 5, 3, 3, 5, 5, 5, 5, 4, 5, 5, 3...
$ summary <chr> "Case issues limit utility of this device",...
$ unixReviewTime <int> 1375142400, 1355356800, 1383350400, 1367193...
$ reviewTime <chr> "07 30, 2013", "12 13, 2012", "11 2, 2013",...
有什么办法可以避免这种情况(即保留原始的字符格式),或者对该列进行 unlist/unnest 以恢复原始格式吗?
结果应该与原始数据框中的一样,但没有停用词和标点符号。
这是一个小输出:
# One-row sample of the data as a data.frame literal.
# The double quotes inside the reviewText string are escaped so that the
# expression parses.
structure(list(reviewerID = "A3LWYDTO7928SH", asin = "B00B0FT2T4",
    reviewerName = "D. Lang", helpful = list(c(0L, 0L)), reviewText = "When I first put your glass protector on my phone I was blown away! (I knew how \"degrading\" the soft plastic covers were - ruining my experience, so I chose not to have a protector on my screen.) Then I saw your website and I wondered if it was as good as spoken about. The answer is YES. The application was flawless even after I pulled the glass back off because I had not put it on absolutely perfectly. It repositioned with ease and you could not find a bubble if you had a microscope! Fascinating to see the viscous material on the back spread out on its own! Application could not be easier and the quality of the product seems like it came from NASA.",
    overall = 5, summary = "It is as perfect as a product can get - Really!",
    unixReviewTime = 1396569600L, reviewTime = "04 4, 2014"), row.names = 145945L, class = "data.frame")
可以在 dplyr 管道中这样处理:对每一行的结果先 unlist,再用 paste 合并成一个字符串。
# Remove English stop words and punctuation from every review while keeping
# reviewText a plain character column (one string per row).
# Base vapply() is used for the per-row iteration so that no package beyond
# the ones already attached (qdap, dplyr, tm) is required.
full_dat <- dat_full %>%
  mutate(
    reviewText = vapply(
      reviewText,
      function(x) {
        # rm_stopwords() hands back a list of word vectors; flatten it and
        # glue the remaining words back together with single spaces.
        words <- unlist(
          qdap::rm_stopwords(x, tm::stopwords("english"), strip = TRUE)
        )
        paste0(words, collapse = " ")
      },
      FUN.VALUE = character(1),
      # With character input vapply() would otherwise use the original
      # texts as element names.
      USE.NAMES = FALSE
    )
  )