在 R 中对文本进行特征提取
R feature extraction for text
我的问题是关于文本挖掘和文本处理。
我想从我的文本构建一个数据框。
我的数据是:
# Sample input: two bibliographic records held in ONE multi-line string.
# Every line starts with a tag: "#*" title, "#@" authors, "#t" year,
# "#c" venue, "#index" paper id, "#%" cited paper id (may repeat),
# "#!" abstract. A blank line separates the records.
# The continuation lines must stay at column 0: any indentation would
# become part of the data.
text <- c("#*TeX: The Program
#@Donald E. Knuth
#t1986
#c
#index68

#*Foundations of Databases.
#@Serge Abiteboul,Richard Hull,Victor Vianu
#t1995
#c
#index69
#%1118192
#%189
#%1088975
#%971271
#%832272
#!From the Book: This book will teach you how to write specifications of computer systems, using the language TLA+.")
我的预期输出是:
# Target result: one row per record and one column per field. A field that
# a record does not have is NA; several citations of one record share a
# single comma-separated cell.
expected <- data.frame(
  title = c("#*TeX: The Program", "#*Foundations of Databases."),
  authors = c(
    "#@Donald E. Knuth",
    "#@Serge Abiteboul,Richard Hull,Victor Vianu"
  ),
  year = c("#t1986", "#t1995"),
  revue = c("#c", "#c"),
  id_paper = c("#index68", "#index69"),
  id_ref = c(NA, "#%1118192, #%189, #%1088975, #%971271, #%832272"),
  abstract = c(
    NA,
    "#!From the Book: This book will teach you how to write specifications of computer systems, using the language TLA+."
  )
)
我的代码是:
# Build `df`: a character matrix with one row per record and one column per
# field named in `coln`. A record starts at a "#*" (title) line; every
# following tagged line belongs to that record until the next title.
coln <- c("title", "authors", "year", "revue", "id_paper", "id_ref", "abstract")

# Work on individual lines. `text` may be one multi-line string or already a
# vector of lines; splitting on "\n" handles both. Surrounding whitespace
# and a trailing comma (present in comma-terminated listings) are removed so
# the "^" anchored tag patterns below match and the values come out clean.
text_lines <- trimws(unlist(strsplit(text, "\n", fixed = TRUE)))
text_lines <- sub(",$", "", text_lines)

# Line numbers (into `text_lines`) of every tagged line. The patterns are
# anchored so that a tag appearing inside a title or abstract is not counted.
title_index <- grep("^#[*]", text_lines)
authors_index <- grep("^#@", text_lines)
year_index <- grep("^#t", text_lines)
revue_index <- grep("^#c", text_lines)
id_paper_index <- grep("^#index", text_lines)
id_ref_index <- grep("^#%", text_lines)
abstract_index <- grep("^#!", text_lines)
stoc_index <- grep("^#cSTOC", text_lines)
sigir_index <- grep("^#cSIGIR", text_lines)

df <- matrix(
  NA_character_,
  nrow = length(title_index),
  ncol = length(coln),
  dimnames = list(NULL, coln)
)

# Record number of every line: the count of title lines seen so far.
# Lines located before the first title get 0 and are ignored below.
record_of_line <- cumsum(grepl("^#[*]", text_lines))

field_index <- list(
  title = title_index,
  authors = authors_index,
  year = year_index,
  revue = revue_index,
  id_paper = id_paper_index,
  id_ref = id_ref_index,
  abstract = abstract_index
)

for (field in coln) {
  idx <- field_index[[field]]
  idx <- idx[record_of_line[idx] > 0L]
  if (length(idx) == 0L) {
    next # no record has this field: the column stays NA
  }
  # Join all lines of this field that belong to the same record, e.g. the
  # repeated "#%" citations become "#%a, #%b, ...". Records without the
  # field do not appear in `joined`, so their cell stays NA.
  joined <- tapply(text_lines[idx], record_of_line[idx], paste, collapse = ", ")
  df[as.integer(names(joined)), field] <- as.vector(joined)
}
所以,我想把每篇文章的引用(以 `#%` 开头的行)提取到同一行中。
如果您有办法把一篇文章的多个引用用逗号连接后放进同一列,请不吝赐教,先行致谢。
谢谢:)
新的改进版本:
# Split the single input string into one chunk per record. The lookahead
# (?=#\*) breaks at a newline only when the next line starts with the "#*"
# title tag and leaves that tag in the following chunk. The backslash has to
# be doubled inside an R string literal ("\*" alone does not parse).
text.n <- strsplit(text, "\n(?=#\\*)", perl = TRUE)[[1]]
text.n

# Break every record into its individual lines.
text.s <- lapply(text.n, function(x) strsplit(x, "\n")[[1]])

# Tag prefix of each field, anchored at the start of the line.
patterns <- list(
  title = "^#\\*",
  autors = "^#@",
  year = "^#t",
  revue = "^#c",
  id_paper = "^#index",
  id_ref = "^#%",
  abstract = "^#!"
)

# For every record and every field: keep the matching lines, strip the tag
# prefix and join several hits with ",". A field without any matching line
# yields "" (paste() of an empty vector with `collapse`).
tex.l <- lapply(text.s, function(x) {
  lapply(patterns, function(y) {
    paste(sub(y, "", grep(y, x, value = TRUE)), collapse = ",")
  })
})

# One row per record, one column per field. The column count comes from
# `patterns` and the data is coerced to character so that an input without
# any record gives a 0-row matrix instead of an error.
tex.m <- matrix(
  as.character(unlist(tex.l)),
  ncol = length(patterns),
  byrow = TRUE
)
tex.df <- as.data.frame(tex.m, stringsAsFactors = FALSE)
colnames(tex.df) <- names(patterns)
str(tex.df)
# 'data.frame': 2 obs. of 7 variables:
# $ title : chr "TeX: The Program" "Foundations of Databases."
# $ autors : chr "Donald E. Knuth" "Serge Abiteboul,Richard Hull,Victor Vianu"
# $ year : chr "1986" "1995"
# $ revue : chr "" ""
# $ id_paper: chr "68" "69"
# $ id_ref : chr "" "1118192,189,1088975,971271,832272"
# $ abstract: chr "" "From the Book: This book will teach you how to write
# specifications of computer systems, using the language TLA+."
下面是基于 @AkselA 的回答的解决方案。这个问题在评论里无法说清楚,所以我另写了一个答案(我知道它的格式还可以更好……)
# Split the input into one string per record: break at a newline only when
# the next line starts with the "#*" title tag (the lookahead keeps the tag
# in the following record). The backslash must be doubled in an R string.
text.s <- strsplit(text, "\n(?=#\\*)", perl = TRUE)[[1]]
#' Extract the tagged fields of a single record.
#'
#' @param x Character vector holding the lines of one record.
#' @param patterns Named list of regular expressions, one per field. Every
#'   line of `x` matching a pattern is assigned to that field. The defaults
#'   are anchored with a plain `^` so a tag only counts at the start of a
#'   line (a quantifier placed right after `^` would not anchor reliably).
#' @param collapse `NULL` (default) keeps several matches of one field as a
#'   character vector wrapped in a list. A string joins them into one value
#'   using that string as separator, e.g. `collapse = ", "`.
#' @return A list named like `patterns`. Each element is `NA` when no line
#'   matched, the matching line when exactly one matched, and for several
#'   matches either a one-element list holding all of them or, when
#'   `collapse` is given, a single joined string.
extract_info <- function(x,
                         patterns = list(
                           title = "^#\\*",
                           autors = "^#@",
                           year = "^#t",
                           revue = "^#c",
                           id_paper = "^#index",
                           id_ref = "^#%",
                           abstract = "^#!"
                         ),
                         collapse = NULL) {
  lapply(patterns, function(p) {
    extract <- grep(p, x, value = TRUE)
    if (length(extract) == 0L) {
      # Field absent from this record.
      return(NA)
    }
    if (length(extract) == 1L) {
      return(extract)
    }
    if (is.null(collapse)) {
      # Wrapped in a list so the field still occupies exactly one cell when
      # the per-record results are bound together row-wise.
      list(extract)
    } else {
      paste(extract, collapse = collapse)
    }
  })
}
# Split every record into its lines, run extract_info() on each record and
# stack the per-record results row-wise: one row per record, one column per
# field (the result is a matrix of list cells, printed below).
do.call(
  rbind,
  lapply(strsplit(text.s, "\n"), extract_info)
)
# title autors year revue id_paper id_ref
# [1,] "#*TeX: The Program" "#@Donald E. Knuth" "#t1986" "#c" "#index68" NA
# [2,] "#*Foundations of Databases." "#@Serge Abiteboul,Richard Hull,Victor Vianu" "#t1995" "#c" "#index69" List,1
# abstract
# [1,] NA
# [2,] "#!From the Book: This book will teach you how to write specifications of computer systems, using th" [truncated]
我的问题是关于文本挖掘和文本处理。
我想从我的文本构建一个数据框。
我的数据是:
# Sample input: two bibliographic records held in ONE multi-line string.
# Every line starts with a tag: "#*" title, "#@" authors, "#t" year,
# "#c" venue, "#index" paper id, "#%" cited paper id (may repeat),
# "#!" abstract. A blank line separates the records.
# The continuation lines must stay at column 0: any indentation would
# become part of the data.
text <- c("#*TeX: The Program
#@Donald E. Knuth
#t1986
#c
#index68

#*Foundations of Databases.
#@Serge Abiteboul,Richard Hull,Victor Vianu
#t1995
#c
#index69
#%1118192
#%189
#%1088975
#%971271
#%832272
#!From the Book: This book will teach you how to write specifications of computer systems, using the language TLA+.")
我的预期输出是:
# Target result: one row per record and one column per field. A field that
# a record does not have is NA; several citations of one record share a
# single comma-separated cell.
expected <- data.frame(
  title = c("#*TeX: The Program", "#*Foundations of Databases."),
  authors = c(
    "#@Donald E. Knuth",
    "#@Serge Abiteboul,Richard Hull,Victor Vianu"
  ),
  year = c("#t1986", "#t1995"),
  revue = c("#c", "#c"),
  id_paper = c("#index68", "#index69"),
  id_ref = c(NA, "#%1118192, #%189, #%1088975, #%971271, #%832272"),
  abstract = c(
    NA,
    "#!From the Book: This book will teach you how to write specifications of computer systems, using the language TLA+."
  )
)
我的代码是:
# Build `df`: a character matrix with one row per record and one column per
# field named in `coln`. A record starts at a "#*" (title) line; every
# following tagged line belongs to that record until the next title.
coln <- c("title", "authors", "year", "revue", "id_paper", "id_ref", "abstract")

# Work on individual lines. `text` may be one multi-line string or already a
# vector of lines; splitting on "\n" handles both. Surrounding whitespace
# and a trailing comma (present in comma-terminated listings) are removed so
# the "^" anchored tag patterns below match and the values come out clean.
text_lines <- trimws(unlist(strsplit(text, "\n", fixed = TRUE)))
text_lines <- sub(",$", "", text_lines)

# Line numbers (into `text_lines`) of every tagged line. The patterns are
# anchored so that a tag appearing inside a title or abstract is not counted.
title_index <- grep("^#[*]", text_lines)
authors_index <- grep("^#@", text_lines)
year_index <- grep("^#t", text_lines)
revue_index <- grep("^#c", text_lines)
id_paper_index <- grep("^#index", text_lines)
id_ref_index <- grep("^#%", text_lines)
abstract_index <- grep("^#!", text_lines)
stoc_index <- grep("^#cSTOC", text_lines)
sigir_index <- grep("^#cSIGIR", text_lines)

df <- matrix(
  NA_character_,
  nrow = length(title_index),
  ncol = length(coln),
  dimnames = list(NULL, coln)
)

# Record number of every line: the count of title lines seen so far.
# Lines located before the first title get 0 and are ignored below.
record_of_line <- cumsum(grepl("^#[*]", text_lines))

field_index <- list(
  title = title_index,
  authors = authors_index,
  year = year_index,
  revue = revue_index,
  id_paper = id_paper_index,
  id_ref = id_ref_index,
  abstract = abstract_index
)

for (field in coln) {
  idx <- field_index[[field]]
  idx <- idx[record_of_line[idx] > 0L]
  if (length(idx) == 0L) {
    next # no record has this field: the column stays NA
  }
  # Join all lines of this field that belong to the same record, e.g. the
  # repeated "#%" citations become "#%a, #%b, ...". Records without the
  # field do not appear in `joined`, so their cell stays NA.
  joined <- tapply(text_lines[idx], record_of_line[idx], paste, collapse = ", ")
  df[as.integer(names(joined)), field] <- as.vector(joined)
}
所以,我想把每篇文章的引用(以 `#%` 开头的行)提取到同一行中。
如果您有办法把一篇文章的多个引用用逗号连接后放进同一列,请不吝赐教,先行致谢。
谢谢:)
新的改进版本:
# Split the single input string into one chunk per record. The lookahead
# (?=#\*) breaks at a newline only when the next line starts with the "#*"
# title tag and leaves that tag in the following chunk. The backslash has to
# be doubled inside an R string literal ("\*" alone does not parse).
text.n <- strsplit(text, "\n(?=#\\*)", perl = TRUE)[[1]]
text.n

# Break every record into its individual lines.
text.s <- lapply(text.n, function(x) strsplit(x, "\n")[[1]])

# Tag prefix of each field, anchored at the start of the line.
patterns <- list(
  title = "^#\\*",
  autors = "^#@",
  year = "^#t",
  revue = "^#c",
  id_paper = "^#index",
  id_ref = "^#%",
  abstract = "^#!"
)

# For every record and every field: keep the matching lines, strip the tag
# prefix and join several hits with ",". A field without any matching line
# yields "" (paste() of an empty vector with `collapse`).
tex.l <- lapply(text.s, function(x) {
  lapply(patterns, function(y) {
    paste(sub(y, "", grep(y, x, value = TRUE)), collapse = ",")
  })
})

# One row per record, one column per field. The column count comes from
# `patterns` and the data is coerced to character so that an input without
# any record gives a 0-row matrix instead of an error.
tex.m <- matrix(
  as.character(unlist(tex.l)),
  ncol = length(patterns),
  byrow = TRUE
)
tex.df <- as.data.frame(tex.m, stringsAsFactors = FALSE)
colnames(tex.df) <- names(patterns)
str(tex.df)
# 'data.frame': 2 obs. of 7 variables:
# $ title : chr "TeX: The Program" "Foundations of Databases."
# $ autors : chr "Donald E. Knuth" "Serge Abiteboul,Richard Hull,Victor Vianu"
# $ year : chr "1986" "1995"
# $ revue : chr "" ""
# $ id_paper: chr "68" "69"
# $ id_ref : chr "" "1118192,189,1088975,971271,832272"
# $ abstract: chr "" "From the Book: This book will teach you how to write
# specifications of computer systems, using the language TLA+."
下面是基于 @AkselA 的回答的解决方案。这个问题在评论里无法说清楚,所以我另写了一个答案(我知道它的格式还可以更好……)
# Split the input into one string per record: break at a newline only when
# the next line starts with the "#*" title tag (the lookahead keeps the tag
# in the following record). The backslash must be doubled in an R string.
text.s <- strsplit(text, "\n(?=#\\*)", perl = TRUE)[[1]]
#' Extract the tagged fields of a single record.
#'
#' @param x Character vector holding the lines of one record.
#' @param patterns Named list of regular expressions, one per field. Every
#'   line of `x` matching a pattern is assigned to that field. The defaults
#'   are anchored with a plain `^` so a tag only counts at the start of a
#'   line (a quantifier placed right after `^` would not anchor reliably).
#' @param collapse `NULL` (default) keeps several matches of one field as a
#'   character vector wrapped in a list. A string joins them into one value
#'   using that string as separator, e.g. `collapse = ", "`.
#' @return A list named like `patterns`. Each element is `NA` when no line
#'   matched, the matching line when exactly one matched, and for several
#'   matches either a one-element list holding all of them or, when
#'   `collapse` is given, a single joined string.
extract_info <- function(x,
                         patterns = list(
                           title = "^#\\*",
                           autors = "^#@",
                           year = "^#t",
                           revue = "^#c",
                           id_paper = "^#index",
                           id_ref = "^#%",
                           abstract = "^#!"
                         ),
                         collapse = NULL) {
  lapply(patterns, function(p) {
    extract <- grep(p, x, value = TRUE)
    if (length(extract) == 0L) {
      # Field absent from this record.
      return(NA)
    }
    if (length(extract) == 1L) {
      return(extract)
    }
    if (is.null(collapse)) {
      # Wrapped in a list so the field still occupies exactly one cell when
      # the per-record results are bound together row-wise.
      list(extract)
    } else {
      paste(extract, collapse = collapse)
    }
  })
}
# Split every record into its lines, run extract_info() on each record and
# stack the per-record results row-wise: one row per record, one column per
# field (the result is a matrix of list cells, printed below).
do.call(
  rbind,
  lapply(strsplit(text.s, "\n"), extract_info)
)
# title autors year revue id_paper id_ref
# [1,] "#*TeX: The Program" "#@Donald E. Knuth" "#t1986" "#c" "#index68" NA
# [2,] "#*Foundations of Databases." "#@Serge Abiteboul,Richard Hull,Victor Vianu" "#t1995" "#c" "#index69" List,1
# abstract
# [1,] NA
# [2,] "#!From the Book: This book will teach you how to write specifications of computer systems, using th" [truncated]