Corpus extraction with changing data type in R
I have a corpus of text files containing only text. I want to extract ngrams from the texts and save each one, under its original file name, as a matrix with 3 columns.
library(tokenizers)
myTokenizer <- function(x, n, n_min) {
  corp <- "this is a full text "
  tok <- unlist(tokenize_ngrams(as.character(x), n = n, n_min = n_min))
  M <- matrix(nrow = length(tok), ncol = 3,
              dimnames = list(NULL, c("gram", "num.words", "words")))
}
corp <- tm_map(corp, content_transformer(function(x) myTokenizer(x, n = 3, n_min = 1)))
writeCorpus(corp)
I would suggest creating a document-term matrix (DTM). You will probably need it for your downstream tasks anyway, and you can also extract the information you want from it. However, assuming that a term (including ngrams) originates from only one file is probably unreasonable (at least that is what I understood from your question; please correct me if I am wrong). So I would guess that in practice a term will be associated with several documents, and that is exactly the kind of information a DTM usually stores.
Below is an example with text2vec. If you can elaborate further on how you want to use your terms, etc., I can adapt the code to your needs.
library(text2vec)
# Two example texts that do not overlap in any term, just for illustration;
# in practice this probably never happens
docs = c(d1 = "here a text", d2 = "and another one")
it = itoken(docs, tokenizer = word_tokenizer, progressbar = F)
v = create_vocabulary(it, ngram = c(1,3))
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer)
as.matrix(dtm)
# a a_text and and_another and_another_one another another_one here here_a here_a_text one text
# d1 1 1 0 0 0 0 0 1 1 1 0 1
# d2 0 0 1 1 1 1 1 0 0 0 1 0
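As a small illustration of the point above: because the DTM links terms to documents, you can directly look up which documents a given term occurs in (a minimal sketch using the dtm created above; the term is just an example):

# which documents contain a given term?
term = "another_one"
rownames(dtm)[dtm[, term] > 0]
# [1] "d2"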
library(stringi)
docs = c(d1 = "here a text", d2 = "and another one")
it = itoken(docs, tokenizer = word_tokenizer, progressbar = F)
v = create_vocabulary(it, ngram = c(1,3))
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer)
for (d in rownames(dtm)) {
  v = dtm[d, ]
  v = v[v != 0]
  v = data.frame(number = 1:length(v),
                 term = names(v))
  v$n = stri_count_fixed(v$term, "_") + 1
  write.csv(v, file = paste0("v_", d, ".csv"), row.names = F)
}
read.csv("v_d1.csv")
# number term n
# 1 1 a 1
# 2 2 a_text 2
# 3 3 here 1
# 4 4 here_a 2
# 5 5 here_a_text 3
# 6 6 text 1
read.csv("v_d2.csv")
# number term n
# 1 1 and 1
# 2 2 and_another 2
# 3 3 and_another_one 3
# 4 4 another 1
# 5 5 another_one 2
# 6 6 one 1
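If you prefer one table that keeps the originating document instead of one file per document, the same information can be collected into a single data frame (a sketch building on the loop above; the column name doc is my choice):

# collect one data frame per document and keep the document id as a column
res = lapply(rownames(dtm), function(d) {
  v = dtm[d, ]
  v = v[v != 0]
  data.frame(doc = d,                                  # originating document
             number = seq_along(v),
             term = names(v),
             n = stri_count_fixed(names(v), "_") + 1)
})
res = do.call(rbind, res)
head(res)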
Since I do not have your corpus, I created one of my own using the crude dataset from tm. There is no need to use tm_map, since that keeps the data in a corpus format; the tokenizers package can handle this.
What I did is store all the matrices you want in a single list object via lapply and then use sapply to write the data to the crude directory as separate files.
Note that the matrix specified in the function will be a character matrix. That means columns 1 and 2 will be characters, not numbers (a data.frame variant with numeric columns is sketched after the output below).
library(tm)
data("crude")
crude <- as.VCorpus(crude)
myTokenizer <- function(x, n, n_min) {
  tok <- unlist(tokenizers::tokenize_ngrams(as.character(x), n = n, n_min = n_min))
  M <- matrix(nrow = length(tok), ncol = 3,
              dimnames = list(NULL, c("gram", "num.words", "words")))
  M[, 3] <- tok
  M[, 2] <- lengths(strsplit(M[, 3], "\\W+")) # count the words in each ngram
  M[, 1] <- 1:length(tok)
  return(M)
}
my_matrices <- lapply(crude, myTokenizer, n = 3, n_min = 1)
# make sure directory crude exists as a subfolder in working directory
sapply(names(my_matrices),
       function(x) write.table(my_matrices[[x]],
                               file = paste0("crude/", x, ".txt"),
                               row.names = FALSE))
Result for the first file:
"gram" "num.words" "words"
"1" "1" "diamond"
"2" "2" "diamond shamrock"
"3" "3" "diamond shamrock corp"
"4" "1" "shamrock"
"5" "2" "shamrock corp"
"6" "3" "shamrock corp said"