Can this for-loop be vectorized?
I implemented a chi-square test in R with a for-loop in order to compute the test statistic for every cell. However, I wonder whether this can be optimized. Also, does R's built-in chi-square do what my code does?
eval_preds <- function(df, observations, predictions, groups) {
# determine cells
cells <- unique(df[,groups])
my_sum = 0
# compute test statistic for every cell
for (i in 1:length(cells)) {
# get cell means
m_obs <- mean(df[df[,groups] == cells[i], observations])
m_pred <- mean(df[df[,groups] == cells[i], predictions])
# get cell variance
var_obs <- var(df[df[,groups] == cells[i], observations])
var_pred <- var(df[df[,groups] == cells[i], predictions])
# get cell's number of observations
N_obs <- length(df[df[,groups] == cells[i], observations])
N_pred <- length(df[df[,groups] == cells[i], predictions])
# sum up
my_sum = my_sum + (m_obs-m_pred) / ((var_obs/N_obs) + (var_pred/N_pred))
}
return(my_sum)
}
And a toy example:
# save this as df_head.csv
"RT","RTmodel_s","groups"
899,975.978308710825,"pl_sgDom"
1115,1095.61538562629,"sg_sgDom"
1077,1158.19266217845,"pl_sgDom"
850,996.410033287249,"sg_plDom"
854,894.862587823602,"pl_sgDom"
1720,1046.34200941684,"sg_sgDom"
# load data into R
df <- read.csv('./df_head.csv')
my_chi <- eval_preds(df, 'RT', 'RTmodel_s', 'groups')
EDIT
The function eval_preds is called inside a for-loop in which different predictions are computed depending on a free parameter t_parsing.
p_grid = seq(0,1,0.1)
# tune p
for (i in 1:length(p_grid)) {
# set t_parsing
t_parsing = p_grid[i]
# compute model-time RTs
df$RTmodel <- ifelse(df$number == 'sing',
RT_lookup(df$sg_t_act, df$epsilon),
RT_decompose(df$sg_t_act, df$pl_t_act, df$epsilon))
# scale into real time
df$RTmodel_s <- scale_RTmodel(df$RT, df$RTmodel)
# compare model output to measured RT
# my_chi <- eval_preds(my_nouns, 'RT', 'RTmodel_s', 'groups')
my_chi <- eval_preds1(df, RT, RTmodel_s, groups) #function written by DaveArmstrong
print(paste(p_grid[i], ': ', my_chi))
}
RT and RTmodel_s are numeric variables; groups is a character variable.
Sure, you can use dplyr and rlang:
eval_preds <- function(df, observations, predictions, groups) {
# determine cells
require(dplyr)
require(rlang)
groups <- enquo(groups)
observations <- enquo(observations)
predictions <- enquo(predictions)
out <- df %>%
group_by(!!groups) %>%
summarise(m_obs = mean(!!observations),
m_pred = mean(!!predictions),
var_obs = var(!!observations),
var_pred = var(!!predictions),
N_obs = sum(!is.na(!!observations)),
N_pred = sum(!is.na(!!predictions))) %>%
mutate(my_sum = (m_obs - m_pred)/((var_obs/N_obs) + (var_pred/N_pred)))
sum(out$my_sum)
}
my_chi <- eval_preds(DF, RT, Pred, Group)
my_chi
# [1] 1.6
Or, a bit more parsimoniously:
eval_preds <- function(df, observations, predictions, groups) {
# determine cells
require(dplyr)
require(rlang)
groups <- enquo(groups)
observations <- enquo(observations)
predictions <- enquo(predictions)
out <- df %>%
group_by(!!groups) %>%
summarise(diff= mean(!!observations) - mean(!!predictions),
sum_v = (var(!!observations)/n()) + (var(!!predictions)/n())) %>%
mutate(my_sum = diff/sum_v)
sum(out$my_sum)
}
my_chi <- eval_preds(DF, RT, Pred, Group)
my_chi
# [1] 1.6
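As an aside, the enquo()/!! pairs are what let callers pass bare column names (RT rather than 'RT'). Here is a minimal sketch of the same function using rlang's curly-curly shorthand, which wraps that pattern in one step (assumes rlang >= 0.4.0; eval_preds_cc is a hypothetical name, not from the answer above):

eval_preds_cc <- function(df, observations, predictions, groups) {
  require(dplyr)
  out <- df %>%
    group_by({{ groups }}) %>%
    summarise(diff  = mean({{ observations }}) - mean({{ predictions }}),
              sum_v = var({{ observations }}) / n() + var({{ predictions }}) / n())
  sum(out$diff / out$sum_v)
}
my_chi <- eval_preds_cc(DF, RT, Pred, Group)  # same call pattern as above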
EDIT - Added benchmarks
So, I think the question of which is better depends on the size of the dataset. I also want to add one more function, one that uses by() from base R:
eval_preds <- function(df, observations, predictions, groups) {
# determine cells
cells <- unique(df[,groups])
my_sum = 0
# compute test statistic for every cell
for (i in 1:length(cells)) {
# get cell means
m_obs <- mean(df[df[,groups] == cells[i], observations])
m_pred <- mean(df[df[,groups] == cells[i], predictions])
# get cell variance
var_obs <- var(df[df[,groups] == cells[i], observations])
var_pred <- var(df[df[,groups] == cells[i], predictions])
# get cell's number of observations
N_obs <- length(df[df[,groups] == cells[i], observations])
N_pred <- length(df[df[,groups] == cells[i], predictions])
# sum up
my_sum = my_sum + (m_obs-m_pred) / ((var_obs/N_obs) + (var_pred/N_pred))
}
return(my_sum)
}
eval_preds1 <- function(df, observations, predictions, groups) {
# determine cells
require(dplyr)
require(rlang)
groups <- enquo(groups)
observations <- enquo(observations)
predictions <- enquo(predictions)
out <- df %>%
group_by(!!groups) %>%
summarise(m_obs = mean(!!observations),
m_pred = mean(!!predictions),
var_obs = var(!!observations),
var_pred = var(!!predictions),
N_obs = sum(!is.na(!!observations)),
N_pred = sum(!is.na(!!predictions))) %>%
ungroup %>%
mutate(my_sum = (m_obs - m_pred)/((var_obs/N_obs) + (var_pred/N_pred)))
sum(out$my_sum)
}
eval_preds2 <- function(df, observations, predictions, groups) {
# determine cells
require(dplyr)
require(rlang)
groups <- enquo(groups)
observations <- enquo(observations)
predictions <- enquo(predictions)
out <- df %>%
group_by(!!groups) %>%
summarise(diff= mean(!!observations) - mean(!!predictions),
sum_v = (var(!!observations)/n()) + (var(!!predictions)/n())) %>%
ungroup %>%
mutate(my_sum = diff/sum_v)
sum(out$my_sum)
}
eval_preds3 <- function(df, observations, predictions, groups) {
  # per-group mean difference: diff(-colMeans(x)) equals mean(obs) - mean(pred)
  m <- by(df[, c(observations, predictions)], list(df[[groups]]), function(x) diff(-colMeans(x)))
  # per-group variance term: var(obs)/n + var(pred)/n
  v <- by(df[, c(observations, predictions)], list(df[[groups]]), function(x) sum(apply(x, 2, var) / nrow(x)))
  sum(m / v)
}
So, eval_preds() is the original, eval_preds1() is the first set of dplyr code, eval_preds2() the second, and eval_preds3() is base R with by(). On the original dataset, here is the output of microbenchmark():
microbenchmark(eval_preds(DF, 'RT', 'Pred', 'Group'),
eval_preds1(DF, RT, Pred, Group),
eval_preds2(DF, RT, Pred, Group),
eval_preds3(DF, 'RT', 'Pred', 'Group'), times=100)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# eval_preds(DF, "RT", "Pred", "Group") 236.513 279.4920 324.9760 295.190 321.0125 774.213 100 a
# eval_preds1(DF, RT, Pred, Group) 5236.850 5747.5095 6503.0251 6089.343 6937.8670 12950.677 100 d
# eval_preds2(DF, RT, Pred, Group) 4871.812 5372.2365 6070.7297 5697.686 6548.8935 14577.786 100 c
# eval_preds3(DF, "RT", "Pred", "Group") 651.013 739.9405 839.1706 773.610 923.9870 1582.218 100 b
In this case, the original code is the fastest. However, if we create a bigger dataset, here with 1,000 groups and 10 observations each:
DF2 <- data.frame(
Group = rep(1:1000, each=10),
RT = rpois(10000, 3),
Pred = rpois(10000, 4)
)
The output of microbenchmark() here tells a completely different story.
microbenchmark(eval_preds(DF2, 'RT', 'Pred', 'Group'),
eval_preds1(DF2, RT, Pred, Group),
eval_preds2(DF2, RT, Pred, Group),
eval_preds3(DF2, 'RT', 'Pred', 'Group'), times=25)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# eval_preds(DF2, "RT", "Pred", "Group") 245.67280 267.96445 353.6489 324.29416 419.4193 565.4494 25 c
# eval_preds1(DF2, RT, Pred, Group) 74.56522 88.15003 102.6583 92.24103 106.6766 211.2368 25 a
# eval_preds2(DF2, RT, Pred, Group) 79.00919 89.03754 125.8202 94.71703 114.5176 606.8830 25 a
# eval_preds3(DF2, "RT", "Pred", "Group") 193.94042 240.35447 272.0004 254.85557 316.5098 420.5394 25 b
Both dplyr-based functions are much faster than their base R competitors. So, the question of which approach is "optimal" depends on the size of the problem being solved.
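For completeness, here is a sketch of the same statistic using data.table, whose grouped aggregation tends to scale well on large inputs (assumes the data.table package is installed; eval_preds_dt is a hypothetical name, not part of the benchmarks above):

library(data.table)
eval_preds_dt <- function(df, observations, predictions, groups) {
  dt <- as.data.table(df)
  # one grouped pass: per-group mean difference and the summed variance term
  out <- dt[, .(diff  = mean(.SD[[observations]]) - mean(.SD[[predictions]]),
                sum_v = var(.SD[[observations]]) / .N + var(.SD[[predictions]]) / .N),
            by = c(groups), .SDcols = c(observations, predictions)]
  sum(out$diff / out$sum_v)
}
my_chi <- eval_preds_dt(DF2, 'RT', 'Pred', 'Group')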
Here is a base R approach that looks pretty similar to your loop. Note that the loop is inefficient because every iteration scans the whole data frame again to filter out one group's rows. A grouped operation such as by() helps because we only need to scan the grouping variable once to form the groups.
DF <- read.table(header=T, text="Group Pred RT
cond1 2 3
cond1 4 2
cond2 2 2
cond2 1 2")
stats = by(data = DF[c("Pred", "RT")],
INDICES = DF["Group"],
simplify = FALSE,
FUN = function(DF_grp) {
RT = DF_grp[["RT"]]
Pred = DF_grp[["Pred"]]
m_obs = mean(RT)
m_pred = mean(Pred)
var_obs = var(RT)
var_pred = var(Pred)
N_obs = N_pred = length(Pred) ##simplification
return((m_obs - m_pred) / ((var_obs / N_obs) + (var_pred / N_pred)))
}
)
stats
#> Group: cond1
#> [1] -0.4
#> ------------------------------------------------------------
#> Group: cond2
#> [1] 2
sum(unlist(stats, use.names = FALSE))
#> [1] 1.6
Finally, if you know your groups are ordered, you could look into Rcpp for additional performance.
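A minimal sketch of that idea (my own illustration, with the hypothetical name eval_preds_cpp; assumes the Rcpp package, rows already sorted by an integer group column, and at least two rows per group): a single C++ pass accumulates each group's means and sample variances.

Rcpp::cppFunction('
double eval_preds_cpp(NumericVector obs, NumericVector pred, IntegerVector grp) {
  double total = 0.0;
  int n = obs.size(), i = 0;
  while (i < n) {
    // advance j to the end of the current (sorted) group
    int j = i;
    double s_obs = 0.0, s_pred = 0.0;
    while (j < n && grp[j] == grp[i]) { s_obs += obs[j]; s_pred += pred[j]; ++j; }
    int N = j - i;
    double m_obs = s_obs / N, m_pred = s_pred / N;
    // sample variances, matching var() in R (requires N >= 2)
    double v_obs = 0.0, v_pred = 0.0;
    for (int k = i; k < j; ++k) {
      v_obs  += (obs[k]  - m_obs)  * (obs[k]  - m_obs);
      v_pred += (pred[k] - m_pred) * (pred[k] - m_pred);
    }
    v_obs /= (N - 1); v_pred /= (N - 1);
    total += (m_obs - m_pred) / (v_obs / N + v_pred / N);
    i = j;
  }
  return total;
}')
my_chi <- eval_preds_cpp(DF2$RT, DF2$Pred, DF2$Group)  # DF2$Group is a sorted integer column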