R- 词在段落内的共现频率

R- Word co-occurrence frequency within paragraph

该数据集包含 26 篇新闻文章的文本数据。 我想计算每个段落中的单词共现频率,但似乎我下面的代码是在文档(整篇文章)中进行的。 你能用fcm()指定计算共现频率的级别(句子、段落……)吗? 或者有没有其他包可以这样做?


## Packages providing readtext(), corpus(), tokens_*() and fcm().
library(readtext)
library(quanteda)

## Read the articles; the CSV column named "text" holds the article bodies.
tf_pb <- readtext("PB_articles.csv", text_field = "text")

## Normalise weapon synonyms to "gun".
## - "\\b" is required: inside an R string a single "\b" is a backspace
##   character, not a regex word boundary, so the pattern would never match.
## - The substitution is applied to the text column only; calling gsub() on
##   the whole data frame would coerce it to a deparsed character vector.
tf2_pb <- tf_pb
tf2_pb$text <- gsub(
  pattern = "\\b(rifle|rifles|weapon|weapons)\\b",
  replacement = "gun",
  x = tf_pb$text
)
corpus_pb <- corpus(tf2_pb)

## Tokenise, dropping URLs, numbers, punctuation, symbols and separators.
tkn_pb <- tokens(
  corpus_pb,
  remove_url = TRUE,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE
)

## Stemming, then stopword removal.
## NOTE(review): stopwords are removed *after* stemming, so stopwords whose
## stem differs from the listed form may survive; and because "of" is a
## stopword, "house of worship" can presumably no longer be compounded
## below. Order kept as in the original -- confirm whether it is intended.
stm_pb <- tokens_wordstem(tkn_pb)
stw_pb <- tokens_remove(stm_pb, pattern = stopwords("en"))

## Join multi-word expressions into single tokens (e.g. "social_media").
multiword <- c("social media", "house of worship")
comp_toks <- tokens_compound(stw_pb, pattern = phrase(multiword))

## Keep only tokens that contain one of the (stemmed) keywords as a whole
## word. The match is case-sensitive, as in the original grepl() call.
## NOTE(review): tokens are not lower-cased anywhere above, so capitalised
## forms will not match these lower-case keywords -- confirm.
keywords <- c(
  "synagogu", "jewish", "rabbi", "jew", "anti-semit", "pittsburgh",
  "congre", "communiti", "member", "hous", "worship", "weapon", "rifle",
  "gun", "shooting", "assault", "attack", "hate", "hatr", "twitter",
  "social_media", "morn", "gab", "white", "post", "funer", "victim",
  "prayer", "rabinowitz", "suspect", "religi", "account", "nation", "door",
  "friend", "charge", "shiva", "wax", "speech", "home", "leader"
)
kw_regex <- paste0("\\b(", paste(keywords, collapse = "|"), ")\\b")
kw_pb <- lapply(comp_toks, function(x) x[grepl(kw_regex, x)])

tkn2_pb <- as.tokens(kw_pb)

## Drop hyphenated tokens that matched a keyword only via a word boundary
## at the hyphen.
tkn3_pb <- tokens_select(
  tkn2_pb,
  pattern = c(
    "next-door",
    "consumer-friend",
    "ar-15-gun-mass-shootings.html"
  ),
  selection = "remove",
  padding = FALSE
)

## Co-occurrence frequency.
## No `context` is given, so fcm() uses its default of counting within each
## document; each document here is a whole article, not a paragraph.
fcm_pb <- fcm(tkn3_pb, count = "frequency")



## Package version: 2.0.1

# Re-segment the first three inaugural addresses so that every paragraph
# becomes its own document.
data_corpus_inauguralpara <- corpus_reshape(
  x = data_corpus_inaugural[1:3],
  to = "paragraphs"
)
## Corpus consisting of 23 documents, showing 23 documents:
##               Text Types Tokens Sentences Year  President FirstName      Party
##  1789-Washington.1     8     11         1 1789 Washington    George       none
##  1789-Washington.2   184    341         5 1789 Washington    George       none
##  1789-Washington.3   192    328         6 1789 Washington    George       none
##  1789-Washington.4   214    391         5 1789 Washington    George       none
##  1789-Washington.5   120    182         2 1789 Washington    George       none
##  1789-Washington.6   102    164         4 1789 Washington    George       none
##  1789-Washington.7    88    120         1 1789 Washington    George       none
##  1793-Washington.1    47     64         2 1793 Washington    George       none
##  1793-Washington.2    61     83         2 1793 Washington    George       none
##       1797-Adams.1   114    180         2 1797      Adams      John Federalist
##       1797-Adams.2    88    137         3 1797      Adams      John Federalist
##       1797-Adams.3    63    101         1 1797      Adams      John Federalist
##       1797-Adams.4    60     82         3 1797      Adams      John Federalist
##       1797-Adams.5   145    277         6 1797      Adams      John Federalist
##       1797-Adams.6    62    108         2 1797      Adams      John Federalist
##       1797-Adams.7    16     17         1 1797      Adams      John Federalist
##       1797-Adams.8   158    303         8 1797      Adams      John Federalist
##       1797-Adams.9    97    184         4 1797      Adams      John Federalist
##      1797-Adams.10    80    128         1 1797      Adams      John Federalist
##      1797-Adams.11    74    119         3 1797      Adams      John Federalist
##      1797-Adams.12   329    808         1 1797      Adams      John Federalist
##      1797-Adams.13    51     75         1 1797      Adams      John Federalist
##      1797-Adams.14    41     58         1 1797      Adams      John Federalist

从上面的输出可以看到,文档现在已被拆分为段落。接下来对其进行分词(tokenize),对得到的 tokens 加上您自己的处理步骤(您的问题中有好几个这样的步骤),然后计算 fcm。

# Tokenise the paragraph-level documents. Add your own additional
# manipulation of tokens here (compounding, etc.) as further pipeline steps.
toks <- data_corpus_inauguralpara %>%
  tokens(remove_punct = TRUE)

# Each document is now one paragraph, so context = "document" counts
# co-occurrences within paragraphs.
fcmat <- fcm(toks, context = "document")
## Feature co-occurrence matrix of: 1,093 by 1,093 features.
##                  features
## features          Fellow-Citizens Senate House Representatives Among
##   Fellow-Citizens               0      1     1               1     0
##   Senate                        0      0     1               1     0
##   House                         0      0     0               2     0
##   Representatives               0      0     0               0     0
##   Among                         0      0     0               0     0
##   vicissitudes                  0      0     0               0     0
##   incident                      0      0     0               0     0
##   life                          0      0     0               0     0
##   event                         0      0     0               0     0
##   filled                        0      0     0               0     0
##                  features
## features          vicissitudes incident life event filled
##   Fellow-Citizens            0        0    0     0      0
##   Senate                     0        0    0     0      0
##   House                      0        0    0     0      0
##   Representatives            0        0    0     0      0
##   Among                      1        1    1     1      1
##   vicissitudes               0        1    1     1      1
##   incident                   0        0    1     1      1
##   life                       0        0    1     1      1
##   event                      0        0    0     0      1
##   filled                     0        0    0     0      0
## [ reached max_feat ... 1,083 more features, reached max_nfeat ... 1,083 more features ]