R multidplyr:summarise_at 有没有变通办法?
R multidplyr: is there a workaround for summarise_at?
我想使用 multidplyr,但它目前还不支持 summarise_at。我有数百甚至数千个变量需要汇总,所以 summarise_at 必不可少,可惜它在 multidplyr 中不可用。
我正在寻找解决这个问题的替代方案。
# Load the tidyverse (tibble, dplyr, stringr, ...) used by the examples below.
library(tidyverse)

# Toy data: ten rows over six IDs, with four numeric columns (var1..var4)
# filled with random values floored to whole numbers.
df <- tibble(
  ID = c("a", "a", "b", "c", "c", "e", "e", "f", "g", "g"),
  var1 = floor(runif(10, min = 0, max = 100)),
  var2 = floor(runif(10, min = 0, max = 100)),
  var3 = floor(runif(10, min = 0, max = 100)),
  var4 = floor(runif(10, min = 0, max = 100))
)

# multidplyr supplies new_cluster()/partition(); create a cluster of 5 workers.
library(multidplyr)
cluster <- new_cluster(5)
# Works: ordinary dplyr, run locally. The partition(cluster) and collect()
# steps are deliberately left out, so summarise_at() sees a regular grouped
# tibble and sums every column whose name starts with "var".
df %>%
  group_by(ID) %>%
  summarise_at(vars(starts_with("var")), sum)
# Works: on the partitioned data, summarise() with each output column
# written out by hand (only var1..var3 are listed in this snippet).
df %>%
  group_by(ID) %>%
  partition(cluster) %>%
  summarise(
    var1 = sum(var1),
    var2 = sum(var2),
    var3 = sum(var3)
  ) %>%
  collect()
# Does not work: the same summarise_at() call as the local example, but
# applied after partition(cluster). Kept as-is to illustrate the problem.
df %>%
  group_by(ID) %>%
  partition(cluster) %>%
  summarise_at(vars(starts_with("var")), sum) %>%
  collect()
我甚至还试过这个:
# Names of every column in df that starts with "var".
sum_var <- names(select(df, starts_with("var")))

# Text of the summarise() arguments: one "<col> = sum(<col>)" piece per
# column, joined into a single comma-separated string.
sum_var_str <- str_c(
  paste0(sum_var, " = sum(", sum_var, ")"),
  collapse = ", "
)
> sum_var
[1] "var1" "var2" "var3" "var4"
> sum_var_str
[1] "var1 = sum(var1), var2 = sum(var2), var3 = sum(var3), var4 = sum(var4)"
# Works (local, no partition): the summarise() call is assembled as text
# from sum_var_str, then parsed and evaluated inside the brace block, where
# `.` refers to the grouped data coming down the pipe.
# NOTE(review): eval(parse(text = ...)) is generally discouraged; building
# code from strings is fragile compared with column-wise helpers.
df %>%
group_by(ID) %>%
{ eval(parse(text = sprintf("summarise(., %s, .groups = 'drop')", sum_var_str))) }
# Does not work: the same text-built summarise() call, but evaluated after
# partition(cluster), so `.` is the partitioned data rather than a tibble.
# NOTE(review): the exact cause of the failure is not shown in this
# snippet - confirm against the error message before relying on it.
df %>%
group_by(ID) %>%
partition(cluster) %>%
{ eval(parse(text = sprintf("summarise(., %s, .groups = 'drop')", sum_var_str))) } %>%
collect()
已找到解决方案:
library(dplyr)
library(multidplyr)
library(parallel)

# Size the cluster from the value reported by detectCores().
cluster <- new_cluster(detectCores())

# Same toy data as in the question: six IDs, four random numeric columns.
df <- tibble(
  ID = c("a", "a", "b", "c", "c", "e", "e", "f", "g", "g"),
  var1 = floor(runif(10, min = 0, max = 100)),
  var2 = floor(runif(10, min = 0, max = 100)),
  var3 = floor(runif(10, min = 0, max = 100)),
  var4 = floor(runif(10, min = 0, max = 100))
)

# Names of the columns to total (everything starting with "var").
sum_var <- names(select(df, starts_with("var")))

# Send sum_var to the cluster and load dplyr there, so that the
# across(all_of(sum_var), ...) expression below can be evaluated remotely.
cluster_assign(cluster, sum_var = sum_var)
cluster_library(cluster, "dplyr")

# Per-ID sums of all selected columns, computed on the cluster and then
# gathered back into a local tibble.
df %>%
  group_by(ID) %>%
  partition(cluster) %>%
  summarise(across(all_of(sum_var), sum)) %>%
  collect()
# A tibble: 6 x 5
ID var1 var2 var3 var4
<chr> <dbl> <dbl> <dbl> <dbl>
1 a 57 72 85 118
2 b 46 50 80 33
3 c 82 156 96 154
4 e 122 107 93 120
5 f 33 7 49 36
6 g 99 79 83 56
我想使用 multidplyr,但它目前还不支持 summarise_at。我有数百甚至数千个变量需要汇总,所以 summarise_at 必不可少,可惜它在 multidplyr 中不可用。
我正在寻找解决这个问题的替代方案。
# Load the tidyverse (tibble, dplyr, stringr, ...) used by the examples below.
library(tidyverse)

# Toy data: ten rows over six IDs, with four numeric columns (var1..var4)
# filled with random values floored to whole numbers.
df <- tibble(
  ID = c("a", "a", "b", "c", "c", "e", "e", "f", "g", "g"),
  var1 = floor(runif(10, min = 0, max = 100)),
  var2 = floor(runif(10, min = 0, max = 100)),
  var3 = floor(runif(10, min = 0, max = 100)),
  var4 = floor(runif(10, min = 0, max = 100))
)

# multidplyr supplies new_cluster()/partition(); create a cluster of 5 workers.
library(multidplyr)
cluster <- new_cluster(5)
# Works: ordinary dplyr, run locally. The partition(cluster) and collect()
# steps are deliberately left out, so summarise_at() sees a regular grouped
# tibble and sums every column whose name starts with "var".
df %>%
  group_by(ID) %>%
  summarise_at(vars(starts_with("var")), sum)
# Works: on the partitioned data, summarise() with each output column
# written out by hand (only var1..var3 are listed in this snippet).
df %>%
  group_by(ID) %>%
  partition(cluster) %>%
  summarise(
    var1 = sum(var1),
    var2 = sum(var2),
    var3 = sum(var3)
  ) %>%
  collect()
# Does not work: the same summarise_at() call as the local example, but
# applied after partition(cluster). Kept as-is to illustrate the problem.
df %>%
  group_by(ID) %>%
  partition(cluster) %>%
  summarise_at(vars(starts_with("var")), sum) %>%
  collect()
我甚至还试过这个:
# Names of every column in df that starts with "var".
sum_var <- names(select(df, starts_with("var")))

# Text of the summarise() arguments: one "<col> = sum(<col>)" piece per
# column, joined into a single comma-separated string.
sum_var_str <- str_c(
  paste0(sum_var, " = sum(", sum_var, ")"),
  collapse = ", "
)
> sum_var
[1] "var1" "var2" "var3" "var4"
> sum_var_str
[1] "var1 = sum(var1), var2 = sum(var2), var3 = sum(var3), var4 = sum(var4)"
# Works (local, no partition): the summarise() call is assembled as text
# from sum_var_str, then parsed and evaluated inside the brace block, where
# `.` refers to the grouped data coming down the pipe.
# NOTE(review): eval(parse(text = ...)) is generally discouraged; building
# code from strings is fragile compared with column-wise helpers.
df %>%
group_by(ID) %>%
{ eval(parse(text = sprintf("summarise(., %s, .groups = 'drop')", sum_var_str))) }
# Does not work: the same text-built summarise() call, but evaluated after
# partition(cluster), so `.` is the partitioned data rather than a tibble.
# NOTE(review): the exact cause of the failure is not shown in this
# snippet - confirm against the error message before relying on it.
df %>%
group_by(ID) %>%
partition(cluster) %>%
{ eval(parse(text = sprintf("summarise(., %s, .groups = 'drop')", sum_var_str))) } %>%
collect()
已找到解决方案:
library(dplyr)
library(multidplyr)
library(parallel)

# Size the cluster from the value reported by detectCores().
cluster <- new_cluster(detectCores())

# Same toy data as in the question: six IDs, four random numeric columns.
df <- tibble(
  ID = c("a", "a", "b", "c", "c", "e", "e", "f", "g", "g"),
  var1 = floor(runif(10, min = 0, max = 100)),
  var2 = floor(runif(10, min = 0, max = 100)),
  var3 = floor(runif(10, min = 0, max = 100)),
  var4 = floor(runif(10, min = 0, max = 100))
)

# Names of the columns to total (everything starting with "var").
sum_var <- names(select(df, starts_with("var")))

# Send sum_var to the cluster and load dplyr there, so that the
# across(all_of(sum_var), ...) expression below can be evaluated remotely.
cluster_assign(cluster, sum_var = sum_var)
cluster_library(cluster, "dplyr")

# Per-ID sums of all selected columns, computed on the cluster and then
# gathered back into a local tibble.
df %>%
  group_by(ID) %>%
  partition(cluster) %>%
  summarise(across(all_of(sum_var), sum)) %>%
  collect()
# A tibble: 6 x 5
ID var1 var2 var3 var4
<chr> <dbl> <dbl> <dbl> <dbl>
1 a 57 72 85 118
2 b 46 50 80 33
3 c 82 156 96 154
4 e 122 107 93 120
5 f 33 7 49 36
6 g 99 79 83 56