Removing for loop using apply with custom function in R
I am trying to run automated t-tests on all variables for each group.
However, using a for-loop on large data seems to overload my computer, and it stops working.
Is there a way to remove the for loop and make the code run faster?
The example code uses a for-loop and combn to compute every possible t-test combination for the given data
(i.e. Sepal.Width, Sepal.Length, Petal.Length, Petal.Width for setosa_vs_versicolor, setosa_vs_virginica, versicolor_vs_virginica),
and then saves each t-test result into an empty matrix (4 rows, one per variable; 3 columns, one per comparison).
The dataset I am trying to use this code on has 36 groups and 103 variables.
The aim is to completely overhaul the following code, which is a mess of multiple for loops and seems to take forever on data of that size:
https://github.com/CHKim5/LMSstat/blob/master/R/Allstats.R
system.time({
  data(iris)
  Test <- as.data.frame(matrix(data = NA, nrow = 4, ncol = 3))
  for (t in 1:(ncol(iris) - 1)) {
    Test[t, ] <- combn(as.character(unique(iris$Species)), 2,
                       function(x) t.test(
                         x = iris[iris$Species == x[1], ][, t],
                         y = iris[iris$Species == x[2], ][, t])[["p.value"]])
  }
})
One option is:
library(tidyverse)
# Either the next 3:
# library(purrr)
# library(dplyr)
# library(tidyr)
combn(unique(iris$Species), 2, simplify = FALSE) %>%
  structure(names = sapply(., function(.x) paste0(.x, collapse = "-"))) %>%
  map_dfr(~iris %>%
            pivot_longer(cols = -Species) %>%
            pivot_wider(id_cols = name, names_from = Species, values_from = value) %>%
            unnest(cols = c(-name)) %>%
            nest_by(name) %>%
            mutate(tt = list(t.test(data[[.x[[1]]]], data[[.x[[2]]]]))) %>%
            summarise(across(-data, broom::tidy), .groups = "drop") %>%
            mutate(across(c(-name), ~.$p.value)),
          .id = "comb") %>%
  pivot_wider(names_from = comb, values_from = tt)
# A tibble: 4 × 4
name `setosa-versicolor` `setosa-virginica` `versicolor-virginica`
<chr> <dbl> <dbl> <dbl>
1 Petal.Length 9.93e-46 9.27e-50 4.90e-22
2 Petal.Width 2.72e-47 2.44e-48 2.11e-25
3 Sepal.Length 3.75e-17 3.97e-25 1.87e- 7
4 Sepal.Width 2.48e-15 4.57e- 9 1.82e- 3
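For comparison, essentially the same table can be produced with a shorter purrr sketch (my own illustration, not part of the pipeline above): an outer map over the numeric variable names and an inner map over the species pairs, each applying a small custom t-test function.
library(purrr)  # already attached via library(tidyverse) above
library(dplyr)
# all pairwise species combinations and their labels
pairs <- combn(levels(iris$Species), 2, simplify = FALSE)
pair_names <- map_chr(pairs, paste, collapse = "-")
vars <- names(iris)[sapply(iris, is.numeric)]
# one row per variable, one column per species pair
map_dfr(set_names(vars),
        function(v) map_dbl(set_names(pairs, pair_names),
                            function(p) t.test(iris[iris$Species == p[1], v],
                                               iris[iris$Species == p[2], v])$p.value),
        .id = "name")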
If you want the fastest possible timing, use a benchmark (below I use bench::mark()).
There are quite a few improvements that can be made.
When speed is the goal, the data.table package is usually a good place to start.
You also want to make sure to take any redundant computation out of the hot code (the code inside the loop).
The fastest code I can come up with is this:
library(data.table)
iris <- as.data.table(iris)
# split the dataset by your target group
species_split <- split(iris, iris$Species)
# create a wrapper for the t-test
split_t_test <- function(x, i) t.test(species_split[[x[1]]][[i]],
                                      species_split[[x[2]]][[i]])[["p.value"]]
# iterate over the columns and compute the t-tests
res <- lapply(seq_len(ncol(iris) - 1),
              function(i) as.list(combn(names(species_split), 2, split_t_test, i = i)))
# combine the results
df <- rbindlist(res)
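rbindlist() leaves the columns with generic names (V1, V2, V3). A small follow-up sketch (my own addition, not part of the answer itself) labels the columns with the species pairs and adds the variable names:
# label the output; assumes the code above has just been run
setnames(df, combn(names(species_split), 2, paste, collapse = "-"))
df[, variable := names(iris)[seq_len(ncol(iris) - 1)]]
df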
This is roughly 10x faster than your original code.
Detailed benchmarks
1) Define the functions
original_function <- function(data) {
  Test <- as.data.frame(matrix(data = NA, nrow = 4, ncol = 3))
  for (t in 1:(ncol(data) - 1)) {
    Test[t, ] <- combn(
      as.character(unique(data$Species)),
      2,
      function(x) {
        t.test(
          x = data[data$Species == x[1], ][, t],
          y = data[data$Species == x[2], ][, t]
        )[["p.value"]]
      }
    )
  }
  return(Test)
}

# take out as much as possible from the loop
base_function <- function(data) {
  unique_species <- as.character(unique(data$Species))
  t_test_function <- function(x, i)
    t.test(data[data$Species == x[1], ][, i],
           data[data$Species == x[2], ][, i])[["p.value"]]
  res <- lapply(seq_len(ncol(data) - 1),
                function(i) {
                  combn(unique_species, 2, t_test_function, i = i)
                })
  return(do.call(rbind, res))
}

# split the dataset first to avoid the lookup for the Species in the loop
split_function <- function(data) {
  species_split <- base::split(data, data$Species)
  split_t_test <- function(x, i)
    t.test(species_split[[x[1]]][, i],
           species_split[[x[2]]][, i])[["p.value"]]
  res <- lapply(seq_len(ncol(data) - 1),
                function(i) combn(names(species_split), 2, split_t_test, i = i))
  return(do.call(rbind, res))
}

# use data.table
datatable_version <- function(data) {
  unique_species <- as.character(data[, unique(Species)])
  dt_t_test <- function(x, i)
    t.test(data[Species == x[1]][[i]], data[Species == x[2]][[i]])[["p.value"]]
  rbindlist(lapply(seq_len(ncol(data) - 1),
                   function(i) as.list(combn(unique_species, 2, dt_t_test, i = i))))
}

# combine the split and data.table
dt_split <- function(data) {
  species_split <- split(data, data$Species)
  split_t_test <- function(x, i)
    t.test(species_split[[x[1]]][[i]],
           species_split[[x[2]]][[i]])[["p.value"]]
  res <- lapply(seq_len(ncol(data) - 1),
                function(i) as.list(combn(names(species_split), 2, split_t_test, i = i)))
  return(rbindlist(res))
}
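Since the benchmarks below are run with check = FALSE, a quick sanity check (my own addition, not part of the original answer) confirms that the refactored variants still reproduce the p-values of the original:
# compare values only; the original returns a data.frame,
# the refactored version a plain matrix without dimnames
all.equal(as.matrix(original_function(iris)),
          split_function(iris),
          check.attributes = FALSE)
# expected to be TRUE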
2) Run the benchmark
library(data.table)
iris_dt <- as.data.table(iris)
bench::mark(
  original = original_function(iris),
  base = base_function(iris),
  split = split_function(iris),
  datatable = datatable_version(iris_dt),
  dt_split = dt_split(iris_dt),
  check = FALSE # datatable returns a data.table not a data.frame
)
#> # A tibble: 5 x 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
#> 1 original 7.52ms 11.1ms 51.4 245.5KB 0 26 0 505ms <NULL> <Rprofmem [605~ <bench_tm [~ <tibble [26 ~
#> 2 base 6.75ms 9.38ms 102. 243.6KB 0 51 0 500ms <NULL> <Rprofmem [603~ <bench_tm [~ <tibble [51 ~
#> 3 split 2.56ms 3.48ms 216. 44.3KB 2.57 84 1 389ms <NULL> <Rprofmem [167~ <bench_tm [~ <tibble [85 ~
#> 4 datatable 7.9ms 9.99ms 83.7 562.2KB 2.15 39 1 466ms <NULL> <Rprofmem [439~ <bench_tm [~ <tibble [40 ~
#> 5 dt_split 2.74ms 3.32ms 277. 161.6KB 0 139 0 502ms <NULL> <Rprofmem [660~ <bench_tm [~ <tibble [139~
Run the benchmark on a larger dataset with 100,000 observations.
set.seed(15212)
idx <- sample.int(nrow(iris), 1e5, replace = TRUE)
large_iris <- iris[idx, ]
large_iris_dt <- iris_dt[idx, ]
bench::mark(
  original = original_function(large_iris),
  base = base_function(large_iris),
  split = split_function(large_iris),
  datatable = datatable_version(large_iris_dt),
  dt_split = dt_split(large_iris_dt),
  check = FALSE, # datatable returns a data.table not a data.frame
  min_time = 2
)
#> # A tibble: 5 x 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
#> 1 original 158.4ms 179.2ms 5.49 147.8MB 9.99 11 20 2s <NULL> <Rprofmem [617 ~ <bench_tm ~ <tibble [11 ~
#> 2 base 145.3ms 167.9ms 5.84 146.8MB 10.2 12 21 2.05s <NULL> <Rprofmem [793 ~ <bench_tm ~ <tibble [12 ~
#> 3 split 19.8ms 23.7ms 31.9 25.2MB 11.0 64 22 2.01s <NULL> <Rprofmem [146 ~ <bench_tm ~ <tibble [64 ~
#> 4 datatable 49.9ms 72.4ms 12.7 68MB 10.3 26 21 2.04s <NULL> <Rprofmem [392 ~ <bench_tm ~ <tibble [26 ~
#> 5 dt_split 17.4ms 18.7ms 38.8 23MB 10.9 78 22 2.01s <NULL> <Rprofmem [174 ~ <bench_tm ~ <tibble [78 ~
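Finally, for the 36-group / 103-variable dataset from the question, a generalized version of dt_split is straightforward. The sketch below is my own addition: group_col is a hypothetical argument naming the grouping column, and every other column is assumed to be numeric.
library(data.table)

dt_split_generic <- function(data, group_col = "Species") {
  # split once by the grouping column to avoid repeated lookups in the loop
  groups <- split(data, data[[group_col]])
  value_cols <- setdiff(names(data), group_col)
  pair_t_test <- function(x, col)
    t.test(groups[[x[1]]][[col]], groups[[x[2]]][[col]])[["p.value"]]
  # one row of p-values per variable, one entry per pair of groups
  res <- lapply(value_cols,
                function(col) as.list(combn(names(groups), 2, pair_t_test, col = col)))
  out <- rbindlist(res)
  setnames(out, combn(names(groups), 2, paste, collapse = "-"))
  out[, variable := value_cols]
  out[]
}

# dt_split_generic(as.data.table(iris))  # 4 variables x 3 pairwise comparisons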