如何在不删除哈希符号的情况下从文本中提取二元组?

How can I extract bigrams from text without removing the hash symbol?

我正在使用以下函数(基于 https://rpubs.com/sprishi/twitterIBM)从文本中提取二元组(bigram)。但是,我想保留哈希符号(#)以便进行分析。清理文本的函数可以正常工作,但 unnest_tokens 函数会删除特殊字符。有什么方法可以在运行 unnest_tokens 时不删除特殊字符吗?

# Two sample tweets used as input; the first one contains a hashtag and a
# mention.
x <- c(
  "I went to afternoon tea with her majesty and #queen @Victoria in the palace.",
  "Does tea have extra caffeine?"
)

# Normalise raw tweet text before tokenisation.
#
# Steps, in order: lower-case, collapse runs of whitespace, trim leading and
# trailing whitespace, then remove English stop words. stripWhitespace(),
# removeWords() and stopwords() are expected to come from the tm package.
# Stop words are removed last, so the result can still contain leading or
# repeated spaces where a stop word used to be.
#
# Args:
#   x: Character vector of texts.
#
# Returns:
#   The cleaned texts, as returned by removeWords().
clean_Twitter_Corpus <- function(x) {
  x <- tolower(x)
  x <- stripWhitespace(x)
  # Backslashes must be doubled inside an R string literal; a bare "\s" is an
  # unrecognised escape and prevents the code from parsing.
  x <- gsub("^\\s+|\\s+$", "", x)
  removeWords(x, stopwords("english"))
}

# Clean the tweets with clean_Twitter_Corpus() and echo the result.
tweets <- clean_Twitter_Corpus(x)
tweets

# Wrap the cleaned text in a data frame with a single column named `text`.
text <- data.frame(text = as.character(tweets))

# Tokenise into bigrams, then split each bigram into its two words.
tidy_descr_ngrams <- text %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ")
tidy_descr_ngrams

# Tally each (word1, word2) pair, most frequent first.
bigram_counts <- count(tidy_descr_ngrams, word1, word2, sort = TRUE)

bigram_counts

这是一个涉及创建自定义 n-gram 函数的解决方案

设置

library(tidyverse)
library(tidytext)
library(tm)
library(purrr)

# Two sample tweets used as input; the first one contains a hashtag and a
# mention.
x <- c(
  "I went to afternoon tea with her majesty and #queen @Victoria in the palace.",
  "Does tea have extra caffeine?"
)

# Normalise raw tweet text before tokenisation.
#
# Steps, in order: lower-case, collapse runs of whitespace, trim leading and
# trailing whitespace, then remove English stop words. stripWhitespace(),
# removeWords() and stopwords() are expected to come from the tm package.
# Stop words are removed last, so the result can still contain leading or
# repeated spaces where a stop word used to be.
#
# Args:
#   x: Character vector of texts.
#
# Returns:
#   The cleaned texts, as returned by removeWords().
clean_Twitter_Corpus <- function(x) {
  x <- tolower(x)
  x <- stripWhitespace(x)
  # Backslashes must be doubled inside an R string literal; a bare "\s" is an
  # unrecognised escape and prevents the code from parsing.
  x <- gsub("^\\s+|\\s+$", "", x)
  removeWords(x, stopwords("english"))
}

创建不去除特殊字符的自定义 n-gram 函数

# Build a one-column tibble of word n-grams from a single sentence.
#
# Tokens are obtained by splitting on single spaces only, so punctuation and
# special characters such as "#" and "@" stay attached to their words.
#
# Args:
#   sentence:    Character string to tokenise.
#   column_name: Name (string) of the output column.
#   n:           Number of words per n-gram (default 2).
#
# Returns:
#   A tibble with one row per n-gram, in order of appearance; zero rows when
#   the sentence has fewer than `n` words.
ngrams_build <- function(sentence, column_name, n = 2) {
  words <- str_split(sentence, pattern = " ", simplify = TRUE)
  # Consecutive spaces (e.g. left behind by stop-word removal) yield empty
  # tokens; drop them.
  words <- words[words != ""]

  # seq_len() is empty when there are fewer than n words, whereas
  # 1:(length(words) - n + 1) would count down (1, 0, ...) and produce bogus
  # n-grams containing NA.
  starts <- seq_len(max(length(words) - n + 1, 0))
  ngrams <- map_chr(starts, function(i) {
    paste(words[i:(i + n - 1)], collapse = " ")
  })
  tibble(!!column_name := ngrams)
}

再次运行您的代码

# Clean the tweets with clean_Twitter_Corpus() and echo the result.
tweets <- clean_Twitter_Corpus(x)
tweets
#> [1] " went  afternoon tea   majesty  #queen @victoria   palace."
#> [2] " tea  extra caffeine?"

# Wrap the cleaned text in a data frame with a single column named `text`.
text <- data.frame(text = as.character(tweets))

# Build bigrams for every tweet with ngrams_build(), row-bind the per-tweet
# results with map_dfr(), then split each bigram into its two words.
tidy_descr_ngrams <- text$text %>%
  map_dfr(ngrams_build, column_name = "bigram", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ")

# The result has the same layout as the unnest_tokens() version, but "#", "@"
# and punctuation are kept.
tidy_descr_ngrams
#> # A tibble: 8 x 2
#>   word1     word2    
#>   <chr>     <chr>    
#> 1 went      afternoon
#> 2 afternoon tea      
#> 3 tea       majesty  
#> 4 majesty   #queen   
#> 5 #queen    @victoria
#> 6 @victoria palace.  
#> 7 tea       extra    
#> 8 extra     caffeine?

最终结果

# Tally each (word1, word2) pair, most frequent first.
bigram_counts <- count(tidy_descr_ngrams, word1, word2, sort = TRUE)

bigram_counts
#> # A tibble: 8 x 3
#>   word1     word2         n
#>   <chr>     <chr>     <int>
#> 1 #queen    @victoria     1
#> 2 @victoria palace.       1
#> 3 afternoon tea           1
#> 4 extra     caffeine?     1
#> 5 majesty   #queen        1
#> 6 tea       extra         1
#> 7 tea       majesty       1
#> 8 went      afternoon     1

由 reprex 包 (v2.0.1) 创建于 2022-01-09