如何在不删除哈希符号的情况下从文本中提取二元组?
How can I extract bigrams from text without removing the hash symbol?
我正在使用以下代码(基于 https://rpubs.com/sprishi/twitterIBM)从文本中提取二元组(bigram)。但是,为了便于分析,我想保留哈希符号(#)。清理文本的函数可以正常工作,但 unnest_tokens 函数会删除特殊字符。有什么方法可以在运行 unnest_tokens 时不删除特殊字符吗?
# Two sample tweets; the first contains a hashtag and an @-mention.
x <- c(
  "I went to afternoon tea with her majesty and #queen @Victoria in the palace.",
  "Does tea have extra caffeine?"
)
# Normalise raw tweet text before tokenisation.
#
# Args:
#   x: character vector of tweets.
# Returns:
#   `x` lower-cased, whitespace-normalised, trimmed at both ends, and with
#   English stop words removed (in that order).
#
# Stop words are removed last, so a tweet that started with a stop word
# still begins with a space afterwards.
clean_Twitter_Corpus <- function(x) {
  x <- tolower(x)
  x <- stripWhitespace(x) # tm: normalise runs of whitespace
  # The backslashes must be doubled: "\s" is not a valid escape sequence in
  # an R string literal, so the single-backslash form does not even parse.
  x <- gsub("^\\s+|\\s+$", "", x)
  x <- removeWords(x, stopwords("english"))
  x
}
# Clean the raw tweets with clean_Twitter_Corpus() and print the result.
tweets <- clean_Twitter_Corpus(x)
tweets

# Hold the cleaned tweets in a one-column data frame whose column is `text`.
text <- as.character(tweets)
text <- as.data.frame(text)

# Tokenise the `text` column into bigrams (token = "ngrams", n = 2), then
# split every bigram into its two words.
tidy_descr_ngrams <- separate(
  unnest_tokens(text, bigram, text, token = "ngrams", n = 2),
  bigram,
  c("word1", "word2"),
  sep = " "
)
tidy_descr_ngrams

# Tally each distinct (word1, word2) pair, most frequent first.
bigram_counts <- count(tidy_descr_ngrams, word1, word2, sort = TRUE)
bigram_counts
这是一个涉及创建自定义 n-gram 函数的解决方案
设置
library(tidyverse)
library(tidytext)
library(tm)
library(purrr)
# Two sample tweets; the first contains a hashtag and an @-mention.
x <- c(
  "I went to afternoon tea with her majesty and #queen @Victoria in the palace.",
  "Does tea have extra caffeine?"
)
# Normalise raw tweet text before tokenisation.
#
# Args:
#   x: character vector of tweets.
# Returns:
#   `x` lower-cased, whitespace-normalised, trimmed at both ends, and with
#   English stop words removed (in that order).
#
# Stop words are removed last, so a tweet that started with a stop word
# still begins with a space afterwards.
clean_Twitter_Corpus <- function(x) {
  x <- tolower(x)
  x <- stripWhitespace(x) # tm: normalise runs of whitespace
  # The backslashes must be doubled: "\s" is not a valid escape sequence in
  # an R string literal, so the single-backslash form does not even parse.
  x <- gsub("^\\s+|\\s+$", "", x)
  x <- removeWords(x, stopwords("english"))
  x
}
创建不去除特殊字符的自定义 n-gram 函数
# Build the word n-grams of one sentence without stripping punctuation or
# symbols such as "#" and "@".
#
# Args:
#   sentence: a single string; it is split on single spaces.
#   column_name: string used as the name of the one column in the result.
#   n: number of words per n-gram (default 2).
# Returns:
#   A one-column tibble with one row per n-gram, in order of appearance.
#   A sentence with fewer than `n` words yields a zero-row tibble.
ngrams_build <- function(sentence, column_name, n = 2) {
  words <- sentence %>% str_split(pattern = " ", simplify = TRUE)
  # Consecutive spaces produce empty pieces; drop them.
  words <- words[words != ""]
  # seq_len() rather than 1:(...): with fewer than n words the count is
  # <= 0, and 1:0 would count downwards and emit bogus rows like "word NA".
  starts <- seq_len(max(length(words) - n + 1, 0))
  ngrams <- map_chr(starts, function(i) {
    paste(words[i:(i + n - 1)], collapse = " ")
  })
  tibble(!!column_name := ngrams)
}
重新运行您的代码
# Apply clean_Twitter_Corpus() to the raw tweets and print the cleaned text.
tweets <- clean_Twitter_Corpus(x)
tweets
#> [1] " went afternoon tea majesty #queen @victoria palace."
#> [2] " tea extra caffeine?"

# Hold the cleaned tweets in a one-column data frame whose column is `text`.
text <- as.character(tweets)
text <- as.data.frame(text)

# Run ngrams_build() on every cleaned tweet, row-bind the per-tweet tibbles
# with purrr's map_dfr(), then split every bigram into its two words.
tidy_descr_ngrams <- separate(
  map_dfr(text$text, ngrams_build, column_name = "bigram", n = 2),
  bigram,
  c("word1", "word2"),
  sep = " "
)

# Same shape as the unnest_tokens() result, but the special characters
# are still present.
tidy_descr_ngrams
#> # A tibble: 8 x 2
#> word1 word2
#> <chr> <chr>
#> 1 went afternoon
#> 2 afternoon tea
#> 3 tea majesty
#> 4 majesty #queen
#> 5 #queen @victoria
#> 6 @victoria palace.
#> 7 tea extra
#> 8 extra caffeine?
最终结果
# Tally each distinct (word1, word2) pair, most frequent first.
bigram_counts <- count(tidy_descr_ngrams, word1, word2, sort = TRUE)
bigram_counts
#> # A tibble: 8 x 3
#> word1 word2 n
#> <chr> <chr> <int>
#> 1 #queen @victoria 1
#> 2 @victoria palace. 1
#> 3 afternoon tea 1
#> 4 extra caffeine? 1
#> 5 majesty #queen 1
#> 6 tea extra 1
#> 7 tea majesty 1
#> 8 went afternoon 1
由 reprex 包 (v2.0.1) 创建于 2022-01-09
我正在使用以下代码(基于 https://rpubs.com/sprishi/twitterIBM)从文本中提取二元组(bigram)。但是,为了便于分析,我想保留哈希符号(#)。清理文本的函数可以正常工作,但 unnest_tokens 函数会删除特殊字符。有什么方法可以在运行 unnest_tokens 时不删除特殊字符吗?
# Two sample tweets; the first contains a hashtag and an @-mention.
x <- c(
  "I went to afternoon tea with her majesty and #queen @Victoria in the palace.",
  "Does tea have extra caffeine?"
)
# Normalise raw tweet text before tokenisation.
#
# Args:
#   x: character vector of tweets.
# Returns:
#   `x` lower-cased, whitespace-normalised, trimmed at both ends, and with
#   English stop words removed (in that order).
#
# Stop words are removed last, so a tweet that started with a stop word
# still begins with a space afterwards.
clean_Twitter_Corpus <- function(x) {
  x <- tolower(x)
  x <- stripWhitespace(x) # tm: normalise runs of whitespace
  # The backslashes must be doubled: "\s" is not a valid escape sequence in
  # an R string literal, so the single-backslash form does not even parse.
  x <- gsub("^\\s+|\\s+$", "", x)
  x <- removeWords(x, stopwords("english"))
  x
}
# Clean the raw tweets with clean_Twitter_Corpus() and print the result.
tweets <- clean_Twitter_Corpus(x)
tweets

# Hold the cleaned tweets in a one-column data frame whose column is `text`.
text <- as.character(tweets)
text <- as.data.frame(text)

# Tokenise the `text` column into bigrams (token = "ngrams", n = 2), then
# split every bigram into its two words.
tidy_descr_ngrams <- separate(
  unnest_tokens(text, bigram, text, token = "ngrams", n = 2),
  bigram,
  c("word1", "word2"),
  sep = " "
)
tidy_descr_ngrams

# Tally each distinct (word1, word2) pair, most frequent first.
bigram_counts <- count(tidy_descr_ngrams, word1, word2, sort = TRUE)
bigram_counts
这是一个涉及创建自定义 n-gram 函数的解决方案
设置
library(tidyverse)
library(tidytext)
library(tm)
library(purrr)
# Two sample tweets; the first contains a hashtag and an @-mention.
x <- c(
  "I went to afternoon tea with her majesty and #queen @Victoria in the palace.",
  "Does tea have extra caffeine?"
)
# Normalise raw tweet text before tokenisation.
#
# Args:
#   x: character vector of tweets.
# Returns:
#   `x` lower-cased, whitespace-normalised, trimmed at both ends, and with
#   English stop words removed (in that order).
#
# Stop words are removed last, so a tweet that started with a stop word
# still begins with a space afterwards.
clean_Twitter_Corpus <- function(x) {
  x <- tolower(x)
  x <- stripWhitespace(x) # tm: normalise runs of whitespace
  # The backslashes must be doubled: "\s" is not a valid escape sequence in
  # an R string literal, so the single-backslash form does not even parse.
  x <- gsub("^\\s+|\\s+$", "", x)
  x <- removeWords(x, stopwords("english"))
  x
}
创建不去除特殊字符的自定义 n-gram 函数
# Build the word n-grams of one sentence without stripping punctuation or
# symbols such as "#" and "@".
#
# Args:
#   sentence: a single string; it is split on single spaces.
#   column_name: string used as the name of the one column in the result.
#   n: number of words per n-gram (default 2).
# Returns:
#   A one-column tibble with one row per n-gram, in order of appearance.
#   A sentence with fewer than `n` words yields a zero-row tibble.
ngrams_build <- function(sentence, column_name, n = 2) {
  words <- sentence %>% str_split(pattern = " ", simplify = TRUE)
  # Consecutive spaces produce empty pieces; drop them.
  words <- words[words != ""]
  # seq_len() rather than 1:(...): with fewer than n words the count is
  # <= 0, and 1:0 would count downwards and emit bogus rows like "word NA".
  starts <- seq_len(max(length(words) - n + 1, 0))
  ngrams <- map_chr(starts, function(i) {
    paste(words[i:(i + n - 1)], collapse = " ")
  })
  tibble(!!column_name := ngrams)
}
重新运行您的代码
# Apply clean_Twitter_Corpus() to the raw tweets and print the cleaned text.
tweets <- clean_Twitter_Corpus(x)
tweets
#> [1] " went afternoon tea majesty #queen @victoria palace."
#> [2] " tea extra caffeine?"

# Hold the cleaned tweets in a one-column data frame whose column is `text`.
text <- as.character(tweets)
text <- as.data.frame(text)

# Run ngrams_build() on every cleaned tweet, row-bind the per-tweet tibbles
# with purrr's map_dfr(), then split every bigram into its two words.
tidy_descr_ngrams <- separate(
  map_dfr(text$text, ngrams_build, column_name = "bigram", n = 2),
  bigram,
  c("word1", "word2"),
  sep = " "
)

# Same shape as the unnest_tokens() result, but the special characters
# are still present.
tidy_descr_ngrams
#> # A tibble: 8 x 2
#> word1 word2
#> <chr> <chr>
#> 1 went afternoon
#> 2 afternoon tea
#> 3 tea majesty
#> 4 majesty #queen
#> 5 #queen @victoria
#> 6 @victoria palace.
#> 7 tea extra
#> 8 extra caffeine?
最终结果
# Tally each distinct (word1, word2) pair, most frequent first.
bigram_counts <- count(tidy_descr_ngrams, word1, word2, sort = TRUE)
bigram_counts
#> # A tibble: 8 x 3
#> word1 word2 n
#> <chr> <chr> <int>
#> 1 #queen @victoria 1
#> 2 @victoria palace. 1
#> 3 afternoon tea 1
#> 4 extra caffeine? 1
#> 5 majesty #queen 1
#> 6 tea extra 1
#> 7 tea majesty 1
#> 8 went afternoon 1
由 reprex 包 (v2.0.1) 创建于 2022-01-09