识别列中的模式,并将它们添加到数据框中的列
Recognize patterns in column, and add them to column in Data frame
我有一个包含 50 个关键字的列:
Keyword1
Keyword2
Keyword3
KeywordN=50
此外,我得到了一个包含两列的数据框:标题和摘要。
Title Abstract
Rstudio Keyword1 A interesting program language keyword2
Python Keyword3 A interesting program keyword3 language
我想要一个额外的列(我们称之为关键字),如果关键字名称出现在标题或摘要中,它将出现在该列中,如下所示:
Title Abstract Keywords
Rstudio Keyword1 A interesting program language keyword2 Keyword1, keyword2
Python Keyword2 A interesting program keyword3 language Keyword2, Keyword3
我目前唯一能做到的“解决办法”是用 grepl 函数生成一个二元(TRUE/FALSE)列,表示模式是否匹配,但这并不是我想要的结果...
基础 R 解法:
- 该方法可以处理标点符号、空格以及行首/行尾的情况。
- 关键字可以包含空格和一些标点符号(但不是全部)
- 新列中的关键字保持原始关键字向量的大小写:
代码
# Tag every row of `df` with the keywords found in its Title or Abstract.
# Matching is case-insensitive, but the new column keeps each keyword's
# original spelling from `keywords`.
# NOTE(review): keywords are pasted into the regex unescaped, so a keyword
# containing regex metacharacters is interpreted as a pattern.
text <- tolower(paste(df$Title, df$Abstract))
# A keyword only counts when it is delimited by the start/end of the text,
# a space, or a punctuation character.
patterns <- paste0("(^|[ [:punct:]])", tolower(keywords), "($|[ [:punct:]])")
# lapply() always returns a list; sapply() may simplify to a matrix when every
# keyword matches the same number of rows, which misaligns keywords and rows.
hits <- lapply(patterns, grepl, x = text)
# Build the whole column in one step so that rows without any keyword get ""
# and the result always has exactly one entry per row of `df`.
df$keywords <- vapply(seq_along(text), function(row) {
  matched <- vapply(hits, `[`, logical(1), row)
  paste(keywords[matched], collapse = ", ")
}, character(1))
# Title Abstract keywords
# 1 Rstudio Keyword1 A interesting program language keyword2 Keyword1, Keyword2
# 2 Python Keyword2 A interesting program keyword3 language Keyword2, Keyword3
数据
# Example input: the keyword vector and a two-row data frame of titles and
# abstracts. Arguments are spelled out in full (no partial matching) and use
# TRUE/FALSE, because T and F are ordinary variables that can be reassigned.
keywords <- c("Keyword1", "Keyword2", "Keyword3")
df <- read.table(text="Title Abstract
'Rstudio Keyword1' 'A interesting program language keyword2'
'Python Keyword2' 'A interesting program keyword3 language'",
  header = TRUE, stringsAsFactors = FALSE)
另一种使用 strsplit 的方法(同样只用 base R):
# Tokenise each row's lower-cased Title + Abstract. The split pattern matches
# runs of whitespace, or the position in front of a punctuation character
# other than an apostrophe. The backslash is doubled ("\\s") because "\s" is
# not a valid escape sequence in an R string literal.
tokens <- strsplit(tolower(paste(df$Title, df$Abstract)),
                   "(\\s+)|(?!')(?=[[:punct:]])", perl = TRUE)
# Keep the distinct tokens that equal a lower-cased keyword and join them.
# vapply() stores a plain character vector in the column (one string per row)
# rather than the one-column matrix produced by do.call("rbind", ...).
lowered_keywords <- tolower(keywords)
df$Keywords <- vapply(tokens, function(words) {
  paste(unique(words[words %in% lowered_keywords]), collapse = ", ")
}, character(1))
# Title Abstract Keywords
#1 Rstudio Keyword1 A interesting program language keyword2 keyword1, keyword2
#2 Python Keyword2 A interesting program keyword3 language keyword2, keyword3
示例数据
# Sample data: two articles (title + abstract) and four candidate keywords.
# stringsAsFactors is given as FALSE rather than F, since F is an ordinary
# variable that can be reassigned.
df <- data.frame(
  Title = c("Rstudio Keyword1", "Python Keyword2"),
  Abstract = c(
    "A interesting program language keyword2",
    "A interesting program keyword3 language"
  ),
  stringsAsFactors = FALSE
)
keywords <- paste0("Keyword", 1:4)
# Pull one keyword out of every column of `dat` (case-insensitively) and join
# the per-column results with commas into a new `Keywords` column.
# "\\1" (escaped backslash) is the back-reference to the captured keyword; a
# bare "\1" in an R string is the control character U+0001, not a reference.
# Only one keyword per cell is kept, and a cell without any keyword is
# returned unchanged, since sub() leaves non-matching input as is.
cbind(
  dat,
  Keywords = do.call(
    paste,
    c(
      unname(lapply(
        dat, sub,
        pattern = paste0(".*(", paste(keywords, collapse = "|"), ").*"),
        replacement = "\\1",
        ignore.case = TRUE
      )),
      sep = ","
    )
  )
)
Title Abstract Keywords
1 Rstudio Keyword1 A interesting program language keyword2 Keyword1,keyword2
2 Python Keyword3 A interesting program keyword3 language Keyword3,keyword3
其中 keywords=paste0("Keyword",1:3)
和
# Example input for the sub()-based answer: titles and abstracts read from an
# inline table. Uses `<-`, full argument names and TRUE/FALSE instead of `=`,
# partially matched names (h, strin) and the reassignable T/F.
dat <- read.table(text="Title Abstract
'Rstudio Keyword1' 'A interesting program language keyword2'
'Python Keyword3' 'A interesting program keyword3 language'",
  header = TRUE, stringsAsFactors = FALSE)
这一行看起来可能很长,下面把它拆开来看:
# Step 1: one regex that captures whichever keyword occurs in a cell.
a <- paste0(".*(", paste(keywords, collapse = "|"), ").*")
# Step 2: replace every cell of every column by its captured keyword and join
# the columns with commas. "\\1" is the back-reference to the capture group;
# a bare "\1" would be the control character U+0001. A cell without a keyword
# is returned unchanged by sub().
b <- do.call(
  paste,
  c(
    unname(lapply(dat, sub, pattern = a, replacement = "\\1",
                  ignore.case = TRUE)),
    sep = ","
  )
)
# Step 3: append the result to the original data.
cbind(dat, keywords = b)
Title Abstract keywords
1 Rstudio Keyword1 A interesting program language keyword2 Keyword1,keyword2
2 Python Keyword3 A interesting program keyword3 language Keyword3,keyword3
# Vectorised rewrite of the "loop answer": for every row, take the first
# keyword found in the title and the first one found in the abstract and join
# them with ", ".
Title <- c("Rstudio Keyword1", "Python Keyword3")
Abstract <- c(
  "A interesting program language keyword2",
  " A interesting program keyword3 language"
)
example1.data <- data.frame(Title, Abstract, stringsAsFactors = FALSE)

# Return the first "keyword<digits>" token (any letter case) of each element
# of `text`, or NA where an element contains none. One or more digits are
# accepted so that e.g. "Keyword50" is not cut down to "Keyword5".
extract_first_keyword <- function(text) {
  text <- as.character(text)
  hit <- regexpr("keyword[0-9]+", text, ignore.case = TRUE)
  found <- rep(NA_character_, length(text))
  # regexpr() reports -1 for "no match"; regmatches() only returns the
  # matched elements, so assign them to exactly those positions.
  has_hit <- !is.na(hit) & hit > 0
  found[has_hit] <- regmatches(text, hit)
  found
}

title_keyword <- extract_first_keyword(example1.data$Title)
abstract_keyword <- extract_first_keyword(example1.data$Abstract)
# Join the per-row hits, skipping a side that had no keyword instead of
# failing on a zero-length replacement.
example1.data$Keyword <- vapply(
  seq_len(nrow(example1.data)),
  function(i) {
    found <- c(title_keyword[i], abstract_keyword[i])
    paste(found[!is.na(found)], collapse = ", ")
  },
  character(1)
)
我有一个包含 50 个关键字的列:
Keyword1
Keyword2
Keyword3
KeywordN=50
此外,我得到了一个包含两列的数据框:标题和摘要。
Title Abstract
Rstudio Keyword1 A interesting program language keyword2
Python Keyword3 A interesting program keyword3 language
我想要一个额外的列(我们称之为关键字),如果关键字名称出现在标题或摘要中,它将出现在该列中,如下所示:
Title Abstract Keywords
Rstudio Keyword1 A interesting program language keyword2 Keyword1, keyword2
Python Keyword2 A interesting program keyword3 language Keyword2, Keyword3
我目前唯一能做到的“解决办法”是用 grepl 函数生成一个二元(TRUE/FALSE)列,表示模式是否匹配,但这并不是我想要的结果...
基础 R 解法:
- 该方法可以处理标点符号、空格以及行首/行尾的情况。
- 关键字可以包含空格和一些标点符号(但不是全部)
- 新列中的关键字保持原始关键字向量的大小写:
代码
# Tag every row of `df` with the keywords found in its Title or Abstract.
# Matching is case-insensitive, but the new column keeps each keyword's
# original spelling from `keywords`.
# NOTE(review): keywords are pasted into the regex unescaped, so a keyword
# containing regex metacharacters is interpreted as a pattern.
text <- tolower(paste(df$Title, df$Abstract))
# A keyword only counts when it is delimited by the start/end of the text,
# a space, or a punctuation character.
patterns <- paste0("(^|[ [:punct:]])", tolower(keywords), "($|[ [:punct:]])")
# lapply() always returns a list; sapply() may simplify to a matrix when every
# keyword matches the same number of rows, which misaligns keywords and rows.
hits <- lapply(patterns, grepl, x = text)
# Build the whole column in one step so that rows without any keyword get ""
# and the result always has exactly one entry per row of `df`.
df$keywords <- vapply(seq_along(text), function(row) {
  matched <- vapply(hits, `[`, logical(1), row)
  paste(keywords[matched], collapse = ", ")
}, character(1))
# Title Abstract keywords
# 1 Rstudio Keyword1 A interesting program language keyword2 Keyword1, Keyword2
# 2 Python Keyword2 A interesting program keyword3 language Keyword2, Keyword3
数据
# Example input: the keyword vector and a two-row data frame of titles and
# abstracts. Arguments are spelled out in full (no partial matching) and use
# TRUE/FALSE, because T and F are ordinary variables that can be reassigned.
keywords <- c("Keyword1", "Keyword2", "Keyword3")
df <- read.table(text="Title Abstract
'Rstudio Keyword1' 'A interesting program language keyword2'
'Python Keyword2' 'A interesting program keyword3 language'",
  header = TRUE, stringsAsFactors = FALSE)
另一种使用 strsplit 的方法(同样只用 base R):
# Tokenise each row's lower-cased Title + Abstract. The split pattern matches
# runs of whitespace, or the position in front of a punctuation character
# other than an apostrophe. The backslash is doubled ("\\s") because "\s" is
# not a valid escape sequence in an R string literal.
tokens <- strsplit(tolower(paste(df$Title, df$Abstract)),
                   "(\\s+)|(?!')(?=[[:punct:]])", perl = TRUE)
# Keep the distinct tokens that equal a lower-cased keyword and join them.
# vapply() stores a plain character vector in the column (one string per row)
# rather than the one-column matrix produced by do.call("rbind", ...).
lowered_keywords <- tolower(keywords)
df$Keywords <- vapply(tokens, function(words) {
  paste(unique(words[words %in% lowered_keywords]), collapse = ", ")
}, character(1))
# Title Abstract Keywords
#1 Rstudio Keyword1 A interesting program language keyword2 keyword1, keyword2
#2 Python Keyword2 A interesting program keyword3 language keyword2, keyword3
示例数据
# Sample data: two articles (title + abstract) and four candidate keywords.
# stringsAsFactors is given as FALSE rather than F, since F is an ordinary
# variable that can be reassigned.
df <- data.frame(
  Title = c("Rstudio Keyword1", "Python Keyword2"),
  Abstract = c(
    "A interesting program language keyword2",
    "A interesting program keyword3 language"
  ),
  stringsAsFactors = FALSE
)
keywords <- paste0("Keyword", 1:4)
# Pull one keyword out of every column of `dat` (case-insensitively) and join
# the per-column results with commas into a new `Keywords` column.
# "\\1" (escaped backslash) is the back-reference to the captured keyword; a
# bare "\1" in an R string is the control character U+0001, not a reference.
# Only one keyword per cell is kept, and a cell without any keyword is
# returned unchanged, since sub() leaves non-matching input as is.
cbind(
  dat,
  Keywords = do.call(
    paste,
    c(
      unname(lapply(
        dat, sub,
        pattern = paste0(".*(", paste(keywords, collapse = "|"), ").*"),
        replacement = "\\1",
        ignore.case = TRUE
      )),
      sep = ","
    )
  )
)
Title Abstract Keywords
1 Rstudio Keyword1 A interesting program language keyword2 Keyword1,keyword2
2 Python Keyword3 A interesting program keyword3 language Keyword3,keyword3
其中 keywords=paste0("Keyword",1:3)
和
# Example input for the sub()-based answer: titles and abstracts read from an
# inline table. Uses `<-`, full argument names and TRUE/FALSE instead of `=`,
# partially matched names (h, strin) and the reassignable T/F.
dat <- read.table(text="Title Abstract
'Rstudio Keyword1' 'A interesting program language keyword2'
'Python Keyword3' 'A interesting program keyword3 language'",
  header = TRUE, stringsAsFactors = FALSE)
这一行看起来可能很长,下面把它拆开来看:
# Step 1: one regex that captures whichever keyword occurs in a cell.
a <- paste0(".*(", paste(keywords, collapse = "|"), ").*")
# Step 2: replace every cell of every column by its captured keyword and join
# the columns with commas. "\\1" is the back-reference to the capture group;
# a bare "\1" would be the control character U+0001. A cell without a keyword
# is returned unchanged by sub().
b <- do.call(
  paste,
  c(
    unname(lapply(dat, sub, pattern = a, replacement = "\\1",
                  ignore.case = TRUE)),
    sep = ","
  )
)
# Step 3: append the result to the original data.
cbind(dat, keywords = b)
Title Abstract keywords
1 Rstudio Keyword1 A interesting program language keyword2 Keyword1,keyword2
2 Python Keyword3 A interesting program keyword3 language Keyword3,keyword3
# Vectorised rewrite of the "loop answer": for every row, take the first
# keyword found in the title and the first one found in the abstract and join
# them with ", ".
Title <- c("Rstudio Keyword1", "Python Keyword3")
Abstract <- c(
  "A interesting program language keyword2",
  " A interesting program keyword3 language"
)
example1.data <- data.frame(Title, Abstract, stringsAsFactors = FALSE)

# Return the first "keyword<digits>" token (any letter case) of each element
# of `text`, or NA where an element contains none. One or more digits are
# accepted so that e.g. "Keyword50" is not cut down to "Keyword5".
extract_first_keyword <- function(text) {
  text <- as.character(text)
  hit <- regexpr("keyword[0-9]+", text, ignore.case = TRUE)
  found <- rep(NA_character_, length(text))
  # regexpr() reports -1 for "no match"; regmatches() only returns the
  # matched elements, so assign them to exactly those positions.
  has_hit <- !is.na(hit) & hit > 0
  found[has_hit] <- regmatches(text, hit)
  found
}

title_keyword <- extract_first_keyword(example1.data$Title)
abstract_keyword <- extract_first_keyword(example1.data$Abstract)
# Join the per-row hits, skipping a side that had no keyword instead of
# failing on a zero-length replacement.
example1.data$Keyword <- vapply(
  seq_len(nrow(example1.data)),
  function(i) {
    found <- c(title_keyword[i], abstract_keyword[i])
    paste(found[!is.na(found)], collapse = ", ")
  },
  character(1)
)