基于 sparklyr 和标准评估 (SE) 的函数
sparklyr and standard evaluation (SE) based functions
我正在尝试编写一个函数,用 sdf_pivot() 创建一个 Spark DataFrame,使其列名包含原始变量(列)的名称。
# Reproducible toy data: five ids plus two randomly drawn letter columns.
# The two sample() calls stay in this order so the seeded draws are unchanged.
set.seed(80)
df <- data.frame(
  id = 1:5,
  var1 = sample(LETTERS[1:12], 5, replace = TRUE),
  var2 = sample(LETTERS[13:16], 5, replace = TRUE)
)

# Copy the local data frame to Spark under the table name "mytbl".
# NOTE(review): `sc` is assumed to be an already-open Spark connection.
ref <- copy_to(sc, df, "mytbl")
glimpse(ref)
Observations: 5
Variables: 3
$ id <int> 1, 2, 3, 4, 5
$ var1 <chr> "F", "G", "J", "A", "H"
$ var2 <chr> "M", "O", "O", "O", "O"
这是 var1 的预期结果(不使用函数的写法):
# Expected result for var1, written out by hand (no helper function):
# label each value as "var1_<value>", count occurrences per id, then
# replace the NA cells left by the pivot with 0.
ref %>%
  dplyr::select(id, var1) %>%
  dplyr::mutate(newvar1 = paste0("var1_", var1)) %>%
  sparklyr::sdf_pivot(
    formula = id ~ newvar1,
    fun.aggregate = "count"
  ) %>%
  sparklyr::na.replace(0)
# Source: table<sparklyr_tmp_56f96ab7d507> [?? x 6]
# Database: spark_connection
id var1_A var1_F var1_G var1_H var1_J
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 1 0 0 0
2 3 0 0 0 0 1
3 5 0 0 0 1 0
4 4 1 0 0 0 0
5 2 0 0 1 0 0
下面是我写的一个函数版本,它当然不起作用。我也尝试过 quote 和 deparse,但在 mutate_ 和 sdf_pivot 这两处都卡住了。
# Asker's non-working attempt, kept exactly as posted.
# The comments below mark the places where it goes wrong.
myPivotFunction <- function(sdf, varname, newvarname){
# Builds the text "<newvarname> = var1_<varname>": the "var1_" prefix is glued
# directly onto the column name instead of being a quoted string in paste0().
mutate_op <- paste0(newvarname," = ", "var1_", varname)
sdf %>%
dplyr::select_(.dots = list('id', varname)) %>%
# setNames(object, nm): the new column name is passed as the object and the
# expression text as its name, i.e. the two roles are swapped.
mutate_(.dots = setNames(newvarname, mutate_op)) %>%
# The formula is hard-coded to newvar1 and does not use `newvarname`.
sparklyr::sdf_pivot(formula = id ~ newvar1, fun.aggregate = "count") %>%
sparklyr::na.replace(0)
}
用一点 rlang 应该就可以解决问题:
library(rlang)
library(glue)
#' Pivot one column of a Spark DataFrame into per-value count columns.
#'
#' @param sdf A Spark DataFrame containing an `id` column and the column
#'   named by `varname`.
#' @param varname A single string: name of the column to pivot.
#' @param newvarname A single string: name of the intermediate column that
#'   holds the "<varname>_<value>" labels used as pivot column names.
#' @return The pivoted Spark DataFrame (one row per `id`, one count column per
#'   label) with missing cells replaced by 0.
myPivotFunction <- function(sdf, varname, newvarname) {
  stopifnot(
    is.character(varname), length(varname) == 1L,
    is.character(newvarname), length(newvarname) == 1L
  )

  # Derive the prefix from `varname` so that pivoting e.g. "var2" yields
  # var2_* columns rather than the previously hard-coded var1_*.
  prefix <- paste0(varname, "_")

  # Build the expressions directly instead of parsing text: `id` is kept as
  # is, and the new column is paste0("<varname>_", <varname column>).
  exprs <- setNames(
    list(quote(id), expr(paste0(!!prefix, !!sym(varname)))),
    c("id", newvarname)
  )

  sdf %>%
    transmute(!!!exprs) %>%
    sdf_pivot(
      formula = as.formula(glue("id ~ {newvarname}")),
      fun.aggregate = "count"
    ) %>%
    na.replace(0)
}
我正在尝试编写一个函数,用 sdf_pivot() 创建一个 Spark DataFrame,使其列名包含原始变量(列)的名称。
# Reproducible toy data: five ids plus two randomly drawn letter columns.
# The two sample() calls stay in this order so the seeded draws are unchanged.
set.seed(80)
df <- data.frame(
  id = 1:5,
  var1 = sample(LETTERS[1:12], 5, replace = TRUE),
  var2 = sample(LETTERS[13:16], 5, replace = TRUE)
)

# Copy the local data frame to Spark under the table name "mytbl".
# NOTE(review): `sc` is assumed to be an already-open Spark connection.
ref <- copy_to(sc, df, "mytbl")
glimpse(ref)
Observations: 5
Variables: 3
$ id <int> 1, 2, 3, 4, 5
$ var1 <chr> "F", "G", "J", "A", "H"
$ var2 <chr> "M", "O", "O", "O", "O"
这是 var1 的预期结果(不使用函数的写法):
# Expected result for var1, written out by hand (no helper function):
# label each value as "var1_<value>", count occurrences per id, then
# replace the NA cells left by the pivot with 0.
ref %>%
  dplyr::select(id, var1) %>%
  dplyr::mutate(newvar1 = paste0("var1_", var1)) %>%
  sparklyr::sdf_pivot(
    formula = id ~ newvar1,
    fun.aggregate = "count"
  ) %>%
  sparklyr::na.replace(0)
# Source: table<sparklyr_tmp_56f96ab7d507> [?? x 6]
# Database: spark_connection
id var1_A var1_F var1_G var1_H var1_J
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 1 0 0 0
2 3 0 0 0 0 1
3 5 0 0 0 1 0
4 4 1 0 0 0 0
5 2 0 0 1 0 0
下面是我写的一个函数版本,它当然不起作用。我也尝试过 quote 和 deparse,但在 mutate_ 和 sdf_pivot 这两处都卡住了。
# Asker's non-working attempt, kept exactly as posted.
# The comments below mark the places where it goes wrong.
myPivotFunction <- function(sdf, varname, newvarname){
# Builds the text "<newvarname> = var1_<varname>": the "var1_" prefix is glued
# directly onto the column name instead of being a quoted string in paste0().
mutate_op <- paste0(newvarname," = ", "var1_", varname)
sdf %>%
dplyr::select_(.dots = list('id', varname)) %>%
# setNames(object, nm): the new column name is passed as the object and the
# expression text as its name, i.e. the two roles are swapped.
mutate_(.dots = setNames(newvarname, mutate_op)) %>%
# The formula is hard-coded to newvar1 and does not use `newvarname`.
sparklyr::sdf_pivot(formula = id ~ newvar1, fun.aggregate = "count") %>%
sparklyr::na.replace(0)
}
用一点 rlang 应该就可以解决问题:
library(rlang)
library(glue)
#' Pivot one column of a Spark DataFrame into per-value count columns.
#'
#' @param sdf A Spark DataFrame containing an `id` column and the column
#'   named by `varname`.
#' @param varname A single string: name of the column to pivot.
#' @param newvarname A single string: name of the intermediate column that
#'   holds the "<varname>_<value>" labels used as pivot column names.
#' @return The pivoted Spark DataFrame (one row per `id`, one count column per
#'   label) with missing cells replaced by 0.
myPivotFunction <- function(sdf, varname, newvarname) {
  stopifnot(
    is.character(varname), length(varname) == 1L,
    is.character(newvarname), length(newvarname) == 1L
  )

  # Derive the prefix from `varname` so that pivoting e.g. "var2" yields
  # var2_* columns rather than the previously hard-coded var1_*.
  prefix <- paste0(varname, "_")

  # Build the expressions directly instead of parsing text: `id` is kept as
  # is, and the new column is paste0("<varname>_", <varname column>).
  exprs <- setNames(
    list(quote(id), expr(paste0(!!prefix, !!sym(varname)))),
    c("id", newvarname)
  )

  sdf %>%
    transmute(!!!exprs) %>%
    sdf_pivot(
      formula = as.formula(glue("id ~ {newvarname}")),
      fun.aggregate = "count"
    ) %>%
    na.replace(0)
}