将 user-defined 函数应用于一个 df,使用另一个 df 中的单个列
Apply a user-defined function to one df, using a single column in another df
df1(1,500 行)显示问题、正确回答的百分比和问题尝试次数:
qtitle avg_correct attempts
"Asthma and exercise, question 1" 54.32 893
"COVID-19 and ventilators, q 3" 23.60 143
"Pedestrian vs. car MVCs" 74.19 227
"Hemophilia and monoclonal Abs" 34.56 78
"COVID-19 and droplets" 83.21 234
使用 tidytext 库,识别 qtitle 列中最频繁出现的单词并按频率计算以创建第二个数据帧(df2,有 320 行)。
word n
COVID-19 68
Trauma 57
Hemophilia 46
我想用 df2 的 word 列中的每个条目去匹配 df1 中问题标题(qtitle)里的单词,求出 avg_correct 的平均值和 attempts 的总和,并附上该搜索词的频率(df2 中的 n)。[即,通过用户自定义(user-defined)函数将 df2 映射到 df1]
word avg_correct attempts count(n)
COVID-19 55.23 456 68
Hemophilia 45.92 123 46
这行不通(显然)
# Asker's attempt (reported as not working): for one search term `x`,
# summarise the rows of df1 whose title contains that term.
correct_by_terms <- function(x) {
# NOTE(review): the filtered data frame is neither assigned nor piped onward,
# so the result of this line is discarded.
filter(df1, str_detect(title, x))
# NOTE(review): this summarises the full `df1`, not the filtered rows. Also,
# `title` and `average` are not among the df1 columns shown in the question
# (qtitle, avg_correct, attempts).
result <- summarise(df1, mean = mean(average), n = n(), x = x)
return (result)
}
# Call the function once per word in df2 and row-bind the per-word results.
frequent_terms_by_correct_percent<- map_df(df2$word, correct_by_terms)
下面是用基础 R(base R)计算你所需结果的一种方法。
# Aggregate df1 per search word in df2 using base R only.
# Adds to df1: `correct`, the number of correct answers per question.
# Adds to df2: `attempts` and `correct` (totals over the matching titles) and
# `avg_correct` (attempt-weighted percent correct).

# get total number of correct answers per question
df1$correct <- df1$avg_correct * df1$attempts / 100
# initialize attempts and correct to 0
df2$attempts <- 0
df2$correct <- 0
# loop over df2; seq_len() iterates zero times when df2 has no rows, whereas
# 1:nrow(df2) would yield c(1, 0)
for (df2_index in seq_len(nrow(df2))) {
  # flag every df1 title that contains this word as a literal (non-regex)
  # substring; grepl() is vectorized over the titles, so no inner loop is needed
  hits <- grepl(df2$word[df2_index], df1$qtitle, fixed = TRUE)
  df2$attempts[df2_index] <- sum(df1$attempts[hits])
  df2$correct[df2_index] <- sum(df1$correct[hits])
}
# weighted percent correct; NaN for words that match no title (0 / 0)
df2$avg_correct <- (df2$correct / df2$attempts) * 100
您可以尝试下面这种基础 R 方法。我们使用 sapply 遍历 df2 中的每个 word,用 grepl 将其与 df1 中的问题标题进行匹配,并针对匹配到的行返回 avg_correct 的 mean(平均值)和 attempts 的 sum(总和)。
# For every word in df2, flag the df1 titles that contain it as a whole word,
# then compute the mean avg_correct and the total attempts over those titles.
# The per-word results are transposed to one row per word and bound onto df2.
cbind(df2, t(sapply(df2$word, function(x) {
  # "\\b" is the regex word boundary. A single-backslash "\b" is parsed by R
  # as a backspace character, so the pattern would never match a title.
  # Note: `x` is used as a regular expression, so metacharacters in a word
  # (e.g. ".") are interpreted, not matched literally.
  inds <- grepl(paste0("\\b", x, "\\b"), df1$qtitle)
  # mean() over zero matching titles gives NaN; sum() gives 0
  c(avg_correct = mean(df1$avg_correct[inds]),
    attempts = sum(df1$attempts[inds]))
})))
如果您要匹配的词都是可以通过分词(tokenization)识别出来的单词(就像您给出的示例那样),我会:
- 分词(tokenize),
- 内连接(inner join),然后
- 先 group_by() 再 summarise() 汇总。
library(tidyverse)
library(tidytext)
# Example data mirroring the question's df1: one row per question title with
# its percent-correct and its number of attempts.
# NOTE(review): titles here spell the term "COVID19" (no hyphen), unlike the
# question's "COVID-19" — presumably so it survives word tokenization as a
# single token; confirm.
df1 <- tribble(~qtitle, ~avg_correct, ~attempts,
"Asthma and exercise, question 1", 54.32, 893,
"COVID19 and ventilators, q 3", 23.60, 143,
"Pedestrian vs. car MVCs", 74.19, 227,
"Hemophilia and monoclonal Abs", 34.56, 78,
"COVID19 and droplets", 83.21, 234
)
# Word-frequency table standing in for the question's df2. Words are
# lower-cased so they can be joined against the tokens produced below, which
# appear lower-cased in the printed result (e.g. "covid19").
df2 <- tribble(~word, ~n,
"COVID19", 68,
"Trauma", 57,
"Hemophilia", 46) %>%
mutate(word = tolower(word))
# Tokenize the titles, keep only tokens listed in df2, then aggregate per word.
df1 %>%
# split each qtitle into one row per word, stored in a column named `word`
unnest_tokens(word, qtitle) %>%
# no `by` is supplied, so the join uses the shared column (`word`), as the
# "Joining, by" message in the output reports
inner_join(df2) %>%
group_by(word) %>%
# each word occurs once in df2 here, so n is constant within a group and
# first(n) simply carries it through
summarise(avg_correct = mean(avg_correct),
attempts = sum(attempts),
n = first(n))
#> Joining, by = "word"
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 4
#> word avg_correct attempts n
#> <chr> <dbl> <dbl> <dbl>
#> 1 covid19 53.4 377 68
#> 2 hemophilia 34.6 78 46
由 reprex package (v0.3.0)
于 2020-07-18 创建
df1(1,500 行)显示问题、正确回答的百分比和问题尝试次数:
qtitle avg_correct attempts
"Asthma and exercise, question 1" 54.32 893
"COVID-19 and ventilators, q 3" 23.60 143
"Pedestrian vs. car MVCs" 74.19 227
"Hemophilia and monoclonal Abs" 34.56 78
"COVID-19 and droplets" 83.21 234
使用 tidytext 库,识别 qtitle 列中最频繁出现的单词并按频率计算以创建第二个数据帧(df2,有 320 行)。
word n
COVID-19 68
Trauma 57
Hemophilia 46
我想用 df2 的 word 列中的每个条目去匹配 df1 中问题标题(qtitle)里的单词,求出 avg_correct 的平均值和 attempts 的总和,并附上该搜索词的频率(df2 中的 n)。[即,通过用户自定义(user-defined)函数将 df2 映射到 df1]
word avg_correct attempts count(n)
COVID-19 55.23 456 68
Hemophilia 45.92 123 46
这行不通(显然)
# Asker's attempt (reported as not working): for one search term `x`,
# summarise the rows of df1 whose title contains that term.
correct_by_terms <- function(x) {
# NOTE(review): the filtered data frame is neither assigned nor piped onward,
# so the result of this line is discarded.
filter(df1, str_detect(title, x))
# NOTE(review): this summarises the full `df1`, not the filtered rows. Also,
# `title` and `average` are not among the df1 columns shown in the question
# (qtitle, avg_correct, attempts).
result <- summarise(df1, mean = mean(average), n = n(), x = x)
return (result)
}
# Call the function once per word in df2 and row-bind the per-word results.
frequent_terms_by_correct_percent<- map_df(df2$word, correct_by_terms)
下面是用基础 R(base R)计算你所需结果的一种方法。
# Aggregate df1 per search word in df2 using base R only.
# Adds to df1: `correct`, the number of correct answers per question.
# Adds to df2: `attempts` and `correct` (totals over the matching titles) and
# `avg_correct` (attempt-weighted percent correct).

# get total number of correct answers per question
df1$correct <- df1$avg_correct * df1$attempts / 100
# initialize attempts and correct to 0
df2$attempts <- 0
df2$correct <- 0
# loop over df2; seq_len() iterates zero times when df2 has no rows, whereas
# 1:nrow(df2) would yield c(1, 0)
for (df2_index in seq_len(nrow(df2))) {
  # flag every df1 title that contains this word as a literal (non-regex)
  # substring; grepl() is vectorized over the titles, so no inner loop is needed
  hits <- grepl(df2$word[df2_index], df1$qtitle, fixed = TRUE)
  df2$attempts[df2_index] <- sum(df1$attempts[hits])
  df2$correct[df2_index] <- sum(df1$correct[hits])
}
# weighted percent correct; NaN for words that match no title (0 / 0)
df2$avg_correct <- (df2$correct / df2$attempts) * 100
您可以尝试下面这种基础 R 方法。我们使用 sapply 遍历 df2 中的每个 word,用 grepl 将其与 df1 中的问题标题进行匹配,并针对匹配到的行返回 avg_correct 的 mean(平均值)和 attempts 的 sum(总和)。
# For every word in df2, flag the df1 titles that contain it as a whole word,
# then compute the mean avg_correct and the total attempts over those titles.
# The per-word results are transposed to one row per word and bound onto df2.
cbind(df2, t(sapply(df2$word, function(x) {
  # "\\b" is the regex word boundary. A single-backslash "\b" is parsed by R
  # as a backspace character, so the pattern would never match a title.
  # Note: `x` is used as a regular expression, so metacharacters in a word
  # (e.g. ".") are interpreted, not matched literally.
  inds <- grepl(paste0("\\b", x, "\\b"), df1$qtitle)
  # mean() over zero matching titles gives NaN; sum() gives 0
  c(avg_correct = mean(df1$avg_correct[inds]),
    attempts = sum(df1$attempts[inds]))
})))
如果您要匹配的词都是可以通过分词(tokenization)识别出来的单词(就像您给出的示例那样),我会:
- 分词(tokenize),
- 内连接(inner join),然后
- 先 group_by() 再 summarise() 汇总。
library(tidyverse)
library(tidytext)
# Example data mirroring the question's df1: one row per question title with
# its percent-correct and its number of attempts.
# NOTE(review): titles here spell the term "COVID19" (no hyphen), unlike the
# question's "COVID-19" — presumably so it survives word tokenization as a
# single token; confirm.
df1 <- tribble(~qtitle, ~avg_correct, ~attempts,
"Asthma and exercise, question 1", 54.32, 893,
"COVID19 and ventilators, q 3", 23.60, 143,
"Pedestrian vs. car MVCs", 74.19, 227,
"Hemophilia and monoclonal Abs", 34.56, 78,
"COVID19 and droplets", 83.21, 234
)
# Word-frequency table standing in for the question's df2. Words are
# lower-cased so they can be joined against the tokens produced below, which
# appear lower-cased in the printed result (e.g. "covid19").
df2 <- tribble(~word, ~n,
"COVID19", 68,
"Trauma", 57,
"Hemophilia", 46) %>%
mutate(word = tolower(word))
# Tokenize the titles, keep only tokens listed in df2, then aggregate per word.
df1 %>%
# split each qtitle into one row per word, stored in a column named `word`
unnest_tokens(word, qtitle) %>%
# no `by` is supplied, so the join uses the shared column (`word`), as the
# "Joining, by" message in the output reports
inner_join(df2) %>%
group_by(word) %>%
# each word occurs once in df2 here, so n is constant within a group and
# first(n) simply carries it through
summarise(avg_correct = mean(avg_correct),
attempts = sum(attempts),
n = first(n))
#> Joining, by = "word"
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 4
#> word avg_correct attempts n
#> <chr> <dbl> <dbl> <dbl>
#> 1 covid19 53.4 377 68
#> 2 hemophilia 34.6 78 46
由 reprex package (v0.3.0)
于 2020-07-18 创建