Corpus extraction with changing data type in R

I have a corpus of text files that contain nothing but text. I want to extract the ngrams from the texts and save them, together with the original filename of each file, in a 3-column matrix:

library(tokenizers)

myTokenizer <- function(x, n, n_min) {
  corp <- "this is a full text "
  tok <- unlist(tokenize_ngrams(as.character(x), n = n, n_min = n_min))
  M <- matrix(nrow = length(tok), ncol = 3,
              dimnames = list(NULL, c("gram", "num.words", "words")))
}

corp <- tm_map(corp, content_transformer(function(x) myTokenizer(x, n = 3, n_min = 1)))

writeCorpus(corp)

I would suggest creating a document-term matrix (DTM). You will probably need it for your downstream tasks anyway, and you can extract the information you want from it as well. However, the assumption that a term (including ngrams) originates from only one file is probably not reasonable (at least that is how I understood your question; please correct me if I am wrong). So in practice I would expect a term to be associated with several documents, and that is exactly the kind of information a DTM stores.

Below is an example with text2vec. If you elaborate further on how you want to use your terms, etc., I can adapt the code to your needs.

library(text2vec)
# I have set up two texts that do not overlap in any term, just as an example
# in practice, this probably never happens
docs = c(d1 = c("here a text"), d2 = c("and another one"))
it = itoken(docs, tokenizer = word_tokenizer, progressbar = F)
v = create_vocabulary(it, ngram = c(1,3))
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer)
as.matrix(dtm)
#    a a_text and and_another and_another_one another another_one here here_a here_a_text one text
# d1 1      1   0           0               0       0           0    1      1           1   0    1
# d2 0      0   1           1               1       1           1    0      0           0   1    0
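
Since a term can be associated with several documents, the DTM also lets you look up, for every term, which documents it occurs in. A minimal sketch continuing with the dtm from above (term_docs is my own name, not a text2vec function):

# for every term, collect the names of the documents it occurs in
term_docs = lapply(colnames(dtm), function(term) rownames(dtm)[dtm[, term] > 0])
names(term_docs) = colnames(dtm)
term_docs[["here_a_text"]]
# [1] "d1"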

To store the terms of each document, along with the number of words per ngram, as separate files, you could do something like the following:

library(stringi)
docs = c(d1 = c("here a text"), d2 = c("and another one"))
it = itoken(docs, tokenizer = word_tokenizer, progressbar = F)
v = create_vocabulary(it, ngram = c(1,3))
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer)
for (d in rownames(dtm)) {
  # keep only the terms that actually occur in document d
  v = dtm[d, ]
  v = v[v != 0]
  v = data.frame(number = 1:length(v),
                 term = names(v))
  # number of words per ngram = number of "_" separators + 1
  v$n = stri_count_fixed(v$term, "_") + 1
  # one file per document
  write.csv(v, file = paste0("v_", d, ".csv"), row.names = F)
}
read.csv("v_d1.csv")
#   number        term n
# 1      1           a 1
# 2      2      a_text 2
# 3      3        here 1
# 4      4      here_a 2
# 5      5 here_a_text 3
# 6      6        text 1
read.csv("v_d2.csv")
#   number            term n
# 1      1             and 1
# 2      2     and_another 2
# 3      3 and_another_one 3
# 4      4         another 1
# 5      5     another_one 2
# 6      6             one 1
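
If you would rather have everything in a single table with the original file name next to each term, as described in the question, you could read the per-document files back in and bind them together. A minimal sketch, assuming the naming scheme used above (files and all_terms are my own names):

# collect all per-document csv files and stack them, tagging each row with its file
files = list.files(pattern = "^v_.*\\.csv$")
all_terms = do.call(rbind, lapply(files, function(f) cbind(file = f, read.csv(f))))
head(all_terms)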

Since I do not have your corpus, I created one of my own using the crude dataset from tm. There is no need to use tm_map, as that keeps the data in corpus format; the tokenizers package can deal with this.

What I do is store all the matrices you want in a list object via lapply, and then use sapply to write the data as separate files into the crude directory.

Please note that the matrix specified in the function will be a character matrix. This means columns 1 and 2 will be characters, not numbers (a sketch after the output at the end shows one way to convert them back).

library(tm)
data("crude")
crude <- as.VCorpus(crude)

myTokenizer <- function(x, n, n_min) {
  tok <- unlist(tokenizers::tokenize_ngrams(as.character(x), n = n, n_min = n_min))
  M <- matrix(nrow=length(tok), ncol=3, 
              dimnames=list(NULL, c( "gram" , "num.words", "words")))
  M[, 3] <- tok
  M[, 2] <- lengths(strsplit(M[, 3], "\\W+"))  # counts the words in each ngram
  M[, 1] <- 1:length(tok)
  return(M)
}

my_matrices <- lapply(crude, myTokenizer, n = 3, n_min = 1)

#  make sure directory crude exists as a subfolder in working directory
sapply(names(my_matrices), 
       function (x) write.table(my_matrices[[x]], file=paste("crude/", x, ".txt", sep=""), row.names = FALSE))

The result for the first file:

"gram" "num.words" "words"
"1" "1" "diamond"
"2" "2" "diamond shamrock"
"3" "3" "diamond shamrock corp"
"4" "1" "shamrock"
"5" "2" "shamrock corp"
"6" "3" "shamrock corp said"