使用 "Expand.Grid" 和 "mapply" 计算函数
Using "Expand.Grid" and "mapply" to evaluate functions
我正在使用 R 编程语言。我正在尝试在我定义的两个网格中包含的点上评估我定义的函数。
首先,我为这个问题创建了一些示例数据:
#load library
library(dplyr)
library(data.table)
# create some data for this example
set.seed(123)
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)
然后,我定义了两个网格:
#grid_1 - does not work for some reason
random_1 <- seq(80,100,5)
random_2 <- seq(random_1,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(random_3,120,5)
split_1 = seq(0,1,0.5)
split_2 = seq(0,1,0.5)
split_3 = seq(0,1,0.5)
DF_2 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)
#grid_2
random_1 <- seq(80,100,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(90,120,5)
split_1 = seq(0,1,0.5)
split_2 = seq(0,1,0.5)
split_3 = seq(0,1,0.5)
DF_1 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)
接下来,我定义了我要在这两个网格中包含的点上计算的函数(“grid_function”):
#### define function
results_table <- data.frame()
grid_function <- function(random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
#calculate random quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
}
问题: 现在,我正在尝试计算“DF_1”和“DF_2”中包含的每个点的“grid_function” ”。我发现了两个相关的 Whosebug 帖子,其中提出了类似的问题,但提供的答案似乎对我不起作用。
方法一:
group_by(DF_2, random_1, random_2, random_3, random_4, split_1, split_2, split_3) %>% dplyr::mutate(z = grid_function(random_1, random_2, random_3, random_4, split_1, split_2, split_3))
Error: Must group by variables found in `.data`.
* Column `random_1` is not found.
* Column `random_2` is not found.
* Column `random_3` is not found.
* Column `random_4` is not found.
* Column `split_1` is not found.
* Column `split_2` is not found.
* Column `split_3` is not found.
Run `rlang::last_error()` to see where the error occurred.
方法二:apply function using expand.grid in R
do.call(mapply, c(function(random_1, random_2, random_3, random_4, split_1, split_2, split_3)) , unname(DF_2)))
Error: unexpected ')' in "do.call(mapply, c(function(random_1, random_2, random_3, random_4, split_1, split_2, split_3))"
有人可以告诉我如何解决这个问题吗?
我认为 grid_function
中存在错误。当我尝试手动触发它时它确实会产生错误:
> unlist(DF_1[1,])
random_1 random_2 random_3 random_4 split_1 split_2 split_3
80 85 85 90 0 0 0
> grid_function(80,85,85,90,0,0,0)
`summarise()` ungrouping output (override with `.groups` argument)
Error in grid_function(80, 85, 85, 90, 0, 0, 0) :
object 'i' not found
或者我用错了。为了测试目的,我制作了自己的简单网格函数。它生成中位数。
grid_function2 <- function(random_1 , random_2, random_3, random_4, split_1, split_2, split_3){
return(median(c(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)))
}
为了在网格上应用该函数,您应该将参数名称定义为 colnames
:
# DF_1 is the expanded grid from your question
colnames(DF_1) <- c("random_1" , "random_2", "random_3",
"random_4", "split_1", "split_2", "split_3")
然后你可以在其上应用一个带有命名参数的函数:
resultdf1 <- apply(DF_1,1, # 1 means rows
FUN=function(x){
do.call(
# Call Function grid_function2 with the arguments in
# a list
grid_function2,
# force list type for the arguments
as.list(
# make the row to a named vector
unlist(x)
)
)
}
)
我正在使用 R 编程语言。我正在尝试在我定义的两个网格中包含的点上评估我定义的函数。
首先,我为这个问题创建了一些示例数据:
#load library
library(dplyr)
library(data.table)
# create some data for this example
set.seed(123)
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)
然后,我定义了两个网格:
#grid_1 - does not work for some reason
random_1 <- seq(80,100,5)
random_2 <- seq(random_1,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(random_3,120,5)
split_1 = seq(0,1,0.5)
split_2 = seq(0,1,0.5)
split_3 = seq(0,1,0.5)
DF_2 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)
#grid_2
random_1 <- seq(80,100,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,120,5)
random_4 <- seq(90,120,5)
split_1 = seq(0,1,0.5)
split_2 = seq(0,1,0.5)
split_3 = seq(0,1,0.5)
DF_1 <- expand.grid(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)
接下来,我定义了我要在这两个网格中包含的点上计算的函数(“grid_function”):
#### define function
results_table <- data.frame()
grid_function <- function(random_1, random_2, random_3, random_4, split_1, split_2, split_3) {
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
#calculate random quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
}
问题: 现在,我正在尝试计算“DF_1”和“DF_2”中包含的每个点的“grid_function” ”。我发现了两个相关的 Whosebug 帖子,其中提出了类似的问题,但提供的答案似乎对我不起作用。
方法一:
group_by(DF_2, random_1, random_2, random_3, random_4, split_1, split_2, split_3) %>% dplyr::mutate(z = grid_function(random_1, random_2, random_3, random_4, split_1, split_2, split_3))
Error: Must group by variables found in `.data`.
* Column `random_1` is not found.
* Column `random_2` is not found.
* Column `random_3` is not found.
* Column `random_4` is not found.
* Column `split_1` is not found.
* Column `split_2` is not found.
* Column `split_3` is not found.
Run `rlang::last_error()` to see where the error occurred.
方法二:apply function using expand.grid in R
do.call(mapply, c(function(random_1, random_2, random_3, random_4, split_1, split_2, split_3)) , unname(DF_2)))
Error: unexpected ')' in "do.call(mapply, c(function(random_1, random_2, random_3, random_4, split_1, split_2, split_3))"
有人可以告诉我如何解决这个问题吗?
我认为 grid_function
中存在错误。当我尝试手动触发它时它确实会产生错误:
> unlist(DF_1[1,])
random_1 random_2 random_3 random_4 split_1 split_2 split_3
80 85 85 90 0 0 0
> grid_function(80,85,85,90,0,0,0)
`summarise()` ungrouping output (override with `.groups` argument)
Error in grid_function(80, 85, 85, 90, 0, 0, 0) :
object 'i' not found
或者我用错了。为了测试目的,我制作了自己的简单网格函数。它生成中位数。
grid_function2 <- function(random_1 , random_2, random_3, random_4, split_1, split_2, split_3){
return(median(c(random_1 , random_2, random_3, random_4, split_1, split_2, split_3)))
}
为了在网格上应用该函数,您应该将参数名称定义为 colnames
:
# DF_1 is the expanded grid from your question
colnames(DF_1) <- c("random_1" , "random_2", "random_3",
"random_4", "split_1", "split_2", "split_3")
然后你可以在其上应用一个带有命名参数的函数:
resultdf1 <- apply(DF_1,1, # 1 means rows
FUN=function(x){
do.call(
# Call Function grid_function2 with the arguments in
# a list
grid_function2,
# force list type for the arguments
as.list(
# make the row to a named vector
unlist(x)
)
)
}
)