在 R 中统计两个数据框之间特定单词的出现次数(需要 group_by)
Counting specific word occurrences between 2 data frames in R with a group_by needed
我在 R 中有两个数据框,第一个(名为 Words)由单列单词组成:
| Words |
|---|
| Hello |
| Building |
| School |
| Hospital |
| Doctors |
第二个是这样呈现的大数据集:
| id | description |
|---|---|
| 382 | Building a school |
| 787 | Hiring doctors for the new hospital and teachers for the school |
然后,我想按ID分组,得到如下结果
| id | description | Match |
|---|---|---|
| 382 | Building a school | 2 |
| 787 | Hiring doctors for the new hospital and teachers for the school | 3 |
这是我试过的
# Attempt 1: count keyword occurrences in each description with stringr.
library(stringr)
# NOTE(review): group_by() is not provided by stringr; presumably dplyr is
# attached elsewhere in the session — confirm.
df <- df %>% group_by(df$id)
# getCount(data, keyword): counts matches of `keyword` in df$description and
# returns them next to `data` in a data frame.
getCount <- function(data,keyword)
{
# Reads the global df$description rather than the `data` argument.
wcount <- str_count(df$description, keyword)
return(data.frame(data,wcount))
}
# NOTE(review): the function above is named getCount, but gCount is called
# here, and `Words` is passed as a whole data frame rather than a column.
gCount(df$description,Words)
(我也尝试过将 Words 数据集转换为列表)
以及:
# Attempt 2: flag descriptions that appear in the word list.
df <- df %>% group_by(df$id)
# Frequency table of the distinct description strings.
table(df$description)
# NOTE(review): %in% tests whether the entire description string equals one of
# the words, not whether it contains one. The right-hand side is a row subset
# of df (a data frame), which is then assigned to a single column.
df$match <- df[df$description %in% Words$Words,]
table(df$match)
最后
# Attempt 3: split both data frames into per-row lists and count matches with
# base R regmatches()/gregexpr().
# One list element per row of Words, named by row name.
Words.list <- setNames(split(Words, seq(nrow(Words))), rownames(Words))
# Keep only the description and id columns.
description <- subset(df, select = c("description","id"))
description <- description %>% group_by(description$id)
# One list element per row of description, named by row name.
description.list <- setNames(split(description, seq(nrow(description))), rownames(description))
# NOTE(review): the roles look swapped — the word list is used as the text to
# search and the descriptions as the pattern. Both are lists of data frames,
# whereas gregexpr() is documented to take character input — confirm.
str_to_search = Words.list
str_to_count = description.list
lengths(regmatches(str_to_search, gregexpr(str_to_count, str_to_search, fixed = TRUE)))
但我只得到了一些我看不懂的奇怪错误消息。
# Count how many keyword occurrences each description contains.
library(dplyr) # provides mutate(), which was used below without being attached
library(stringr)
library(purrr)

# Lower-case the keywords once so that matching is case-insensitive.
words <- c("Hello", "Building", "School", "Hospital", "Doctors") %>%
  str_to_lower()
descriptions <- c(
  "Building a school",
  "Hiring doctors for the new hospital and teachers for the school"
)

# For each lower-cased description, str_count() returns one count per keyword;
# summing them gives the total for that row. This is substring matching, so a
# keyword embedded in a longer word (e.g. "school" in "schools") also counts.
df_descriptions <- data.frame(description = descriptions) %>%
  mutate(
    Match = map_int(str_to_lower(description), ~ sum(str_count(.x, words)))
  )
编辑
# Whole-word variant: tokenise each lower-cased description and count the
# tokens that appear in `words`.
# dplyr::mutate is called with its namespace so the snippet does not depend on
# dplyr being attached.
df_descriptions <- data.frame(description = descriptions) %>%
  dplyr::mutate(
    Match = str_to_lower(description) %>%
      # Split on word boundaries rather than on " " so that punctuation
      # attached to a word ("school," or "school.") does not block a match.
      str_split(boundary("word")) %>%
      map_int(~ sum(.x %in% words))
  )
我在 R 中有两个数据框,第一个(名为 Words)由单列单词组成:
Words |
---|
Hello |
Building |
School |
Hospital |
Doctors |
第二个是这样呈现的大数据集:
id | description |
---|---|
382 | Building a school |
787 | Hiring doctors for the new hospital and teachers for the school |
然后,我想按ID分组,得到如下结果
id | description | Match |
---|---|---|
382 | Building a school | 2 |
787 | Hiring doctors for the new hospital and teachers for the school | 3 |
这是我试过的
# Attempt 1: count keyword occurrences in each description with stringr.
library(stringr)
# NOTE(review): group_by() is not provided by stringr; presumably dplyr is
# attached elsewhere in the session — confirm.
df <- df %>% group_by(df$id)
# getCount(data, keyword): counts matches of `keyword` in df$description and
# returns them next to `data` in a data frame.
getCount <- function(data,keyword)
{
# Reads the global df$description rather than the `data` argument.
wcount <- str_count(df$description, keyword)
return(data.frame(data,wcount))
}
# NOTE(review): the function above is named getCount, but gCount is called
# here, and `Words` is passed as a whole data frame rather than a column.
gCount(df$description,Words)
(我也尝试过将 Words 数据集转换为列表)
以及:
# Attempt 2: flag descriptions that appear in the word list.
df <- df %>% group_by(df$id)
# Frequency table of the distinct description strings.
table(df$description)
# NOTE(review): %in% tests whether the entire description string equals one of
# the words, not whether it contains one. The right-hand side is a row subset
# of df (a data frame), which is then assigned to a single column.
df$match <- df[df$description %in% Words$Words,]
table(df$match)
最后
# Attempt 3: split both data frames into per-row lists and count matches with
# base R regmatches()/gregexpr().
# One list element per row of Words, named by row name.
Words.list <- setNames(split(Words, seq(nrow(Words))), rownames(Words))
# Keep only the description and id columns.
description <- subset(df, select = c("description","id"))
description <- description %>% group_by(description$id)
# One list element per row of description, named by row name.
description.list <- setNames(split(description, seq(nrow(description))), rownames(description))
# NOTE(review): the roles look swapped — the word list is used as the text to
# search and the descriptions as the pattern. Both are lists of data frames,
# whereas gregexpr() is documented to take character input — confirm.
str_to_search = Words.list
str_to_count = description.list
lengths(regmatches(str_to_search, gregexpr(str_to_count, str_to_search, fixed = TRUE)))
但我只得到了一些我看不懂的奇怪错误消息。
# Count how many keyword occurrences each description contains.
library(dplyr) # provides mutate(), which was used below without being attached
library(stringr)
library(purrr)

# Lower-case the keywords once so that matching is case-insensitive.
words <- c("Hello", "Building", "School", "Hospital", "Doctors") %>%
  str_to_lower()
descriptions <- c(
  "Building a school",
  "Hiring doctors for the new hospital and teachers for the school"
)

# For each lower-cased description, str_count() returns one count per keyword;
# summing them gives the total for that row. This is substring matching, so a
# keyword embedded in a longer word (e.g. "school" in "schools") also counts.
df_descriptions <- data.frame(description = descriptions) %>%
  mutate(
    Match = map_int(str_to_lower(description), ~ sum(str_count(.x, words)))
  )
编辑
# Whole-word variant: tokenise each lower-cased description and count the
# tokens that appear in `words`.
# dplyr::mutate is called with its namespace so the snippet does not depend on
# dplyr being attached.
df_descriptions <- data.frame(description = descriptions) %>%
  dplyr::mutate(
    Match = str_to_lower(description) %>%
      # Split on word boundaries rather than on " " so that punctuation
      # attached to a word ("school," or "school.") does not block a match.
      str_split(boundary("word")) %>%
      map_int(~ sum(.x %in% words))
  )