计算 R 数据框中每段演讲的动词数
Count number of verbs for each speech in data frame R
我有如下数据框:
str(data)
'data.frame': 255 obs. of 3 variables:
$ Group : Factor w/ 255 levels "AlzGroup1","AlzGroup10",..: 1 112 179 190 201 212 223 234 245 2 ...
$ Gender : int 1 1 0 0 0 0 0 1 0 0 ...
$ Description: Factor w/ 255 levels "A boy's on the uh falling off the stool picking up cookies . The girl's reaching up for it . The girl the lady "| __truncated__,..: 63 69 38 134 111 242 196 85 84 233 ...
在“描述”(Description)列中有 255 段演讲。我想在数据框中添加一列,记录每段演讲中的动词数量。我知道如何统计动词数量,但下面的代码给出的是整个“描述”列的动词总数:
> library(NLP);
> library(tm);
> library(openNLP);
# Split the tagged text right after every verb tag (/VB, /VBD, ...) and keep
# the word that precedes each split point.
# NOTE: tagPOS() is given the whole Description column here, so the result
# covers all speeches together instead of one result per speech.
# Backslashes in the patterns are doubled: inside an R string literal "\s" and
# "\w" are invalid escapes, and "\2" would be read as an octal character
# instead of a backreference.
NumOfVerbs <- sapply(
  strsplit(as.character(tagPOS(data$Description)), "[[:punct:]]*/VB.?"),
  function(x) {
    res <- sub("(^.*\\s)(\\w+$)", "\\2", x)
    res[!grepl("\\s", res)]
  }
)
有谁知道我怎样才能知道每次演讲中动词的数量?
感谢您的帮助!
以拉河
假设您正在使用与下面类似的函数(原文此处附有来源链接,现已缺失):
# Tag every word of a text with its part of speech using the openNLP Maxent
# annotators.
#
# Args:
#   x:   Text to tag. It is converted with as.String(), so the whole input is
#        annotated as a single text (presumably a vector is joined into one
#        string -- confirm against the NLP documentation).
#   ...: Unused; kept so existing calls that pass extra arguments still work.
#
# Returns:
#   A list with two elements:
#     POStagged: one string of space-separated "word/TAG" pairs.
#     POStags:   the tags alone, one per word.
tagPOS <- function(x, ...) {
  # Nothing to tag: return an empty result instead of running the annotators.
  # NOTE(review): the Maxent annotators are expected to fail when the text
  # yields no word tokens -- confirm against openNLP.
  if (!any(nzchar(trimws(as.character(x))))) {
    return(list(POStagged = "", POStags = character(0)))
  }

  s <- as.String(x)
  word_token_annotator <- Maxent_Word_Token_Annotator()
  # Treat the entire text as one "sentence" so no sentence detector is needed.
  a2 <- Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- annotate(s, word_token_annotator, a2)
  a3 <- annotate(s, Maxent_POS_Tag_Annotator(), a2)
  # Keep only the word annotations; these carry the POS feature.
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  list(POStagged = POStagged, POStags = POStags)
}
创建一个函数来计算包含字母 'VB' 的 POS 标签的数量:
# Count the verbs in a text.
#
# Args:
#   x: Text handed to tagPOS() for tagging.
#
# Returns:
#   The number of POS tags that contain "VB".
count_verbs <- function(x) {
  tagged <- tagPOS(x)
  verb_positions <- grep("VB", tagged$POStags)
  length(verb_positions)
}
然后使用 dplyr 按 Group 分组,并用 count_verbs() 汇总:
library(dplyr)
# One row per Group: apply count_verbs() to the Description values of each
# group and store the result as num_verbs.
summarise(
  group_by(data, Group),
  num_verbs = count_verbs(Description)
)
我有如下数据框:
str(data)
'data.frame': 255 obs. of 3 variables:
$ Group : Factor w/ 255 levels "AlzGroup1","AlzGroup10",..: 1 112 179 190 201 212 223 234 245 2 ...
$ Gender : int 1 1 0 0 0 0 0 1 0 0 ...
$ Description: Factor w/ 255 levels "A boy's on the uh falling off the stool picking up cookies . The girl's reaching up for it . The girl the lady "| __truncated__,..: 63 69 38 134 111 242 196 85 84 233 ...
在“描述”(Description)列中有 255 段演讲。我想在数据框中添加一列,记录每段演讲中的动词数量。我知道如何统计动词数量,但下面的代码给出的是整个“描述”列的动词总数:
> library(NLP);
> library(tm);
> library(openNLP);
# Split the tagged text right after every verb tag (/VB, /VBD, ...) and keep
# the word that precedes each split point.
# NOTE: tagPOS() is given the whole Description column here, so the result
# covers all speeches together instead of one result per speech.
# Backslashes in the patterns are doubled: inside an R string literal "\s" and
# "\w" are invalid escapes, and "\2" would be read as an octal character
# instead of a backreference.
NumOfVerbs <- sapply(
  strsplit(as.character(tagPOS(data$Description)), "[[:punct:]]*/VB.?"),
  function(x) {
    res <- sub("(^.*\\s)(\\w+$)", "\\2", x)
    res[!grepl("\\s", res)]
  }
)
有谁知道我怎样才能知道每次演讲中动词的数量?
感谢您的帮助!
以拉河
假设您正在使用与下面类似的函数(原文此处附有来源链接,现已缺失):
# Tag every word of a text with its part of speech using the openNLP Maxent
# annotators.
#
# Args:
#   x:   Text to tag. It is converted with as.String(), so the whole input is
#        annotated as a single text (presumably a vector is joined into one
#        string -- confirm against the NLP documentation).
#   ...: Unused; kept so existing calls that pass extra arguments still work.
#
# Returns:
#   A list with two elements:
#     POStagged: one string of space-separated "word/TAG" pairs.
#     POStags:   the tags alone, one per word.
tagPOS <- function(x, ...) {
  # Nothing to tag: return an empty result instead of running the annotators.
  # NOTE(review): the Maxent annotators are expected to fail when the text
  # yields no word tokens -- confirm against openNLP.
  if (!any(nzchar(trimws(as.character(x))))) {
    return(list(POStagged = "", POStags = character(0)))
  }

  s <- as.String(x)
  word_token_annotator <- Maxent_Word_Token_Annotator()
  # Treat the entire text as one "sentence" so no sentence detector is needed.
  a2 <- Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- annotate(s, word_token_annotator, a2)
  a3 <- annotate(s, Maxent_POS_Tag_Annotator(), a2)
  # Keep only the word annotations; these carry the POS feature.
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  list(POStagged = POStagged, POStags = POStags)
}
创建一个函数来计算包含字母 'VB' 的 POS 标签的数量:
# Count the verbs in a text.
#
# Args:
#   x: Text handed to tagPOS() for tagging.
#
# Returns:
#   The number of POS tags that contain "VB".
count_verbs <- function(x) {
  tagged <- tagPOS(x)
  verb_positions <- grep("VB", tagged$POStags)
  length(verb_positions)
}
然后使用 dplyr 按 Group 分组,并用 count_verbs() 汇总:
library(dplyr)
# One row per Group: apply count_verbs() to the Description values of each
# group and store the result as num_verbs.
summarise(
  group_by(data, Group),
  num_verbs = count_verbs(Description)
)