循环 if else 和 next R
Loop with if else and next R
我在 R 中将循环与 if else 和 next 结合起来。
为了重现我的问题的复杂性,我不能提供最少的示例,而是提供一大段代码。目的是在 rmse_1
和 rmse_2
列中用 min
、max
、50percentile
填充列表 df
。
您需要在# !!! change path
标记的位置根据您想要的路径更改路径。如果您更改了路径,您可以 运行 代码:
# create lists
mse_samp <- list("mse_A" = list("P10" = data.frame(number = seq(1,3,1),
mse_1 = c(2.5, 4.6, 7.8),
mse_2 = c(6.7, 8.9, 4.1)),
"P30" = data.frame(number = seq(1,3,1),
mse_1 = c(22.5, 74.6, 97.8),
mse_2 = c(56.7, 78.9, 14.1))),
"mse_B" = list("P10" = data.frame(number = seq(1,3,1),
mse_1 = c(122.5, 124.6, 127.8),
mse_2 = c(126.7, 128.9, 124.1)),
"P30" = data.frame(number = seq(1,3,1),
mse_1 = c(3422.5, 3474.6, 3497.8),
mse_2 = c(3456.7, 3478.9, 3414.1))))
# !!! change path
save(mse_samp,
file="H:\R\Forum_data\dat1.RData")
mse_samp <- list("mse_A" = list("P70" = data.frame(number = seq(1,3,1),
mse_1 = c(22.2, 77.6, 97.8, 21.2, 73.9),
mse_2 = c(26.7, 78.9, 17.1, 23.2, 82.2)),
"P80" = data.frame(number = seq(1,3,1),
mse_1 = c(1022.2, 3077.6, 9097.8, 1221.2, 7373.9),
mse_2 = c(7626.7, 2278.9, 7317.1, 7623.2, 8982.2))),
"mse_B" = list("P70" = data.frame(number = seq(1,3,1),
mse_1 = c(3722.2, 3777.6, 3797.8),
mse_2 = c(3726.7, 3778.9, 3717.1)),
"P80" = data.frame(number = seq(1,3,1),
mse_1 = c(1022.2, 3077.6, 9097.8),
mse_2 = c(7626.7, 2278.9, 7317.1))))
save(mse_samp,
file="H:\R\Forum_data\dat2.RData")
# create table for min max for different perc and runs for each paramter (loop)
n_measure <- 3 # number of different measures
npr1 <- 2 # number of different percs run1
npr2 <- 2 # number of different percs run2
targets <- c("A", "B")
for (i in 1:length(targets)) {
df <- data.frame(run = c(rep("run1", n_measure * npr1),
rep("run2", n_measure * npr2)),
perc_train = c(rep(c(0.1, 0.3), times = 1, each = n_measure), # percs run 1
rep(c(0.7, 0.8), times = 1, each = n_measure)), # percs run 2
measure = c(rep(c("min", "max", "50percentile"),
times = npr1 + npr2, each = 1)),
rmse_1 = rep(NA, n_measure * (npr1 + npr2)),
rmse_2 = rep(NA, n_measure * (npr1 + npr2))
)
assign(paste0('df_', targets[i]), df)
}
df <- list("A" = df_A, "B" = df_B)
# convert column which are factors to characters
for (i in 1:length(targets)) {
df[[i]][sapply(df[[i]], is.factor)] <- lapply(df[[i]][sapply(df[[i]], is.factor)],
as.character)
}
rm(list = c("df_A", "df_B", "df_C"))
# !!! change path
path <- c("H:\R\Forum_data\dat1.RData", # run1
# !!! change path
"H:\R\Forum_data\dat2.RData") # run2
percs_names <- c("P10", "P30", "P70", "P80")
percs <- c(0.1, 0.3, 0.7, 0.8)
targets <- c("A", "B")
run_name <- c("run1", "run2")
measure_name <- c("min", "max", "50percentile")
fill_names <- c("rmse_min_1", "rmse_min_2", "rmse_max_1", "rmse_max_2",
"percentile_50_1", "percentile_50_2")
var_name <- c("rmse_1", "rmse_2")
a_or_b <- c("a","b")
# read in data
for (i in 1:length(path)) {
load(path[i])
dat <- mse_samp
for (j in 1:length(targets)) {
for (k in 1:length(percs_names)) {
# if statement
if(percs_names[k] == names(dat[[j]][k])){
dat1 <- dat[[paste0("mse_", targets[j])]][k][[1]]
rmse_min_1 <- sqrt(min(dat1$mse_1))
rmse_min_2 <- sqrt(min(dat1$mse_2))
rmse_max_1 <- sqrt(max(dat1$mse_1))
rmse_max_2 <- sqrt(max(dat1$mse_2))
percentile_50_1 <- quantile(sqrt(dat1$mse_1), probs = 0.5)
percentile_50_2 <- quantile(sqrt(dat1$mse_2), probs = 0.5)
for (fi in 1:length(fill_names)) {
for (m in 1:length(measure_name)) {
a <- which(df[[targets[j]]]$run == run_name[i] &
df[[targets[j]]]$measure == measure_name[m] &
df[[targets[j]]]$perc_train == percs[k] &
is.na(df[[targets[j]]]$rmse_1)
)
b <- which(df[[targets[j]]]$run == run_name[i] &
df[[targets[j]]]$measure == measure_name[m] &
df[[targets[j]]]$perc_train == percs[k] &
is.na(df[[targets[j]]]$rmse_2)
)
for (v in 1:length(var_name)) {
df[[targets[j]]][eval(parse(text = a_or_b[v])), which(names(df[[targets[j]]]) == var_name[v])] <- eval(parse(text = fill_names[fi]))
}
}
}
}
else { next }
}
}
}
1. issue 运行ning 代码后,出现如下错误信息:
Error in if (percs_names[k] == names(dat[[j]][k])) { :
missing value where TRUE/FALSE needed
我猜错误可能是由if else
语句引起的。我怎样才能 运行 没有错误的代码?
2。 issue 目前只有 run1
的行被填充。 rmse_1
和 rmse_2
在行 min
、max
、50percentile
中填充了相同的值。他们应该是不同的。如何填写其他 运行 并正确填写行?最后应该没有NA
了。
虽然您坚持使用 for
-loops 这里是解决您的问题的方法 map
(类似于 lapply)和一些 tidyverse
-magic。
我有一个假设:您处理的所有数据集都存储在一个名为 data_runs_list
的列表中。在 Data 部分的答案末尾给出了一个示例(使用您的示例数据)。
所以让我们首先以更易读的格式引入这个嵌套结构:
library(tidyverse)
library(stringr)
data_runs_df <-
map(data_runs_list, ~ map(.x, bind_rows, .id = "perc") %>%
bind_rows(.id = "target")) %>%
bind_rows(.id = "run")
data_runs_df
# A tibble: 24 x 6
# run target perc number mse_1 mse_2
# <chr> <chr> <chr> <int> <dbl> <dbl>
# 1 run1 mse_A P10 1 2.5 6.7
# 2 run1 mse_A P10 2 4.6 8.9
# 3 run1 mse_A P10 3 7.8 4.1
# 4 run1 mse_A P30 1 22.5 56.7
# 5 run1 mse_A P30 2 74.6 78.9
# 6 run1 mse_A P30 3 97.8 14.1
# 7 run1 mse_B P10 1 122. 127.
# 8 run1 mse_B P10 2 125. 129.
# 9 run1 mse_B P10 3 128. 124.
# 10 run1 mse_B P30 1 3422. 3457.
# # ... with 14 more rows
为了更好地理解 bind_rows()
的作用,只取列表第一个条目的第一个条目,看看会发生什么:
bind_rows(data_runs_list[[1]][[1]], .id = "perc")
# perc number mse_1 mse_2
# 1 P10 1 2.5 6.7
# 2 P10 2 4.6 8.9
# 3 P10 3 7.8 4.1
# 4 P30 1 22.5 56.7
# 5 P30 2 74.6 78.9
# 6 P30 3 97.8 14.1
两个数据框堆叠在一起,id-column perc
保留原始列表名称。 map
然后依次应用于列表的每个级别 bind_row
,在每个级别上都有不同的 id 列。
所以这是一件可以很好地工作的东西。您想要 min
、max
和每个 运行、百分比、目标的 max
和 50% 分位数(即 median
),并分别针对两个测量值 mse_1
和 mse_2
。 group_by
与 summarize
完美结合。为了更好地处理两种不同的测量,首先将数据转换为长格式。如果您有更多测量值,您可以在 gather
-调用结束时指定它们:
data_runs_df <- data_runs_df %>%
gather(mse, value, mse_1, mse_2)
data_runs_df
# A tibble: 48 x 6
# run target perc number mse value
# <chr> <chr> <chr> <int> <chr> <dbl>
# 1 run1 mse_A P10 1 mse_1 2.5
# 2 run1 mse_A P10 2 mse_1 4.6
# 3 run1 mse_A P10 3 mse_1 7.8
# 4 run1 mse_A P30 1 mse_1 22.5
# 5 run1 mse_A P30 2 mse_1 74.6
# 6 run1 mse_A P30 3 mse_1 97.8
# 7 run1 mse_B P10 1 mse_1 122.
# 8 run1 mse_B P10 2 mse_1 125.
# 9 run1 mse_B P10 3 mse_1 128.
# 10 run1 mse_B P30 1 mse_1 3422.
# ... with 38 more rows
现在,在计算测量值之前,我们快速重命名目标和 mse 列,然后将 group_by
与 summarize
结合使用:
data_info <- data_runs_df %>%
mutate(mse = str_c("r", mse),
target = str_remove(target, "mse_")) %>%
group_by(run, target, perc, mse) %>%
summarize(min = min(sqrt(value)),
max = max(sqrt(value)),
median = median(sqrt(value)))
data_info
# A tibble: 16 x 7
# Groups: run, target, perc [?]
# run target perc mse min max median
# <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
# 1 run1 A P10 rmse_1 1.58 2.79 2.14
# 2 run1 A P10 rmse_2 2.02 2.98 2.59
# 3 run1 A P30 rmse_1 4.74 9.89 8.64
# 4 run1 A P30 rmse_2 3.75 8.88 7.53
# 5 run1 B P10 rmse_1 11.1 11.3 11.2
# 6 run1 B P10 rmse_2 11.1 11.4 11.3
# 7 run1 B P30 rmse_1 58.5 59.1 58.9
# 8 run1 B P30 rmse_2 58.4 59.0 58.8
# 9 run2 A P70 rmse_1 4.71 9.89 8.81
# 10 run2 A P70 rmse_2 4.14 8.88 5.17
# 11 run2 A P80 rmse_1 32.0 95.4 55.5
# 12 run2 A P80 rmse_2 47.7 87.3 85.5
# 13 run2 B P70 rmse_1 61.0 61.6 61.5
# 14 run2 B P70 rmse_2 61.0 61.5 61.0
# 15 run2 B P80 rmse_1 32.0 95.4 55.5
# 16 run2 B P80 rmse_2 47.7 87.3 85.5
现在最后一步,让一切都完全符合您的要求,我们需要 gather
及其对应物 spread
:
data_info <- data_info %>%
gather(measure, value, min, max, median) %>%
spread(mse, value)
data_info
# A tibble: 24 x 6
# Groups: run, target, perc [8]
# run target perc measure rmse_1 rmse_2
# <chr> <chr> <chr> <chr> <dbl> <dbl>
# 1 run1 A P10 max 2.79 2.98
# 2 run1 A P10 median 2.14 2.59
# 3 run1 A P10 min 1.58 2.02
# 4 run1 A P30 max 9.89 8.88
# 5 run1 A P30 median 8.64 7.53
# 6 run1 A P30 min 4.74 3.75
# 7 run1 B P10 max 11.3 11.4
# 8 run1 B P10 median 11.2 11.3
# 9 run1 B P10 min 11.1 11.1
# 10 run1 B P30 max 59.1 59.0
# ... with 14 more rows
每次调用两次:
data_runs_df <-
map(data_runs_list, ~ map(.x, bind_rows, .id = "perc") %>%
bind_rows(.id = "target")) %>%
bind_rows(.id = "run")
data_info <- data_runs_df %>%
gather(mse, value, mse_1, mse_2) %>%
mutate(mse = str_c("r", mse),
target = str_remove(target, "mse_")) %>%
group_by(run, target, perc, mse) %>%
summarize(min = min(sqrt(value)),
max = max(sqrt(value)),
median = median(sqrt(value))) %>%
gather(measure, value, min, max, median) %>%
spread(mse, value)
如果您坚持使用您使用的列表格式,您可以这样做:
data_info_list <- map(c("A", "B"), function(x) filter(data_info, target == x))
names(data_info_list) <- c("A", "B")
数据
mse_samp1 <-
list("mse_A" = list("P10" = data.frame(number = 1:3, mse_1 = c(2.5, 4.6, 7.8), mse_2 = c(6.7, 8.9, 4.1)),
"P30" = data.frame(number = 1:3, mse_1 = c(22.5, 74.6, 97.8), mse_2 = c(56.7, 78.9, 14.1))),
"mse_B" = list("P10" = data.frame(number = 1:3, mse_1 = c(122.5, 124.6, 127.8), mse_2 = c(126.7, 128.9, 124.1)),
"P30" = data.frame(number = 1:3, mse_1 = c(3422.5, 3474.6, 3497.8), mse_2 = c(3456.7, 3478.9, 3414.1))))
mse_samp2 <-
list("mse_A" = list("P70" = data.frame(number = 1:3, mse_1 = c(22.2, 77.6, 97.8), mse_2 = c(26.7, 78.9, 17.1)),
"P80" = data.frame(number = 1:3, mse_1 = c(1022.2, 3077.6, 9097.8), mse_2 = c(7626.7, 2278.9, 7317.1))),
"mse_B" = list("P70" = data.frame(number = 1:3, mse_1 = c(3722.2, 3777.6, 3797.8), mse_2 = c(3726.7, 3778.9, 3717.1)),
"P80" = data.frame(number = 1:3, mse_1 = c(1022.2, 3077.6, 9097.8), mse_2 = c(7626.7, 2278.9, 7317.1))))
data_runs_list <- list(run1 = mse_samp1, run2 = mse_samp2)
我在 R 中将循环与 if else 和 next 结合起来。
为了重现我的问题的复杂性,我不能提供最少的示例,而是提供一大段代码。目的是在 rmse_1
和 rmse_2
列中用 min
、max
、50percentile
填充列表 df
。
您需要在# !!! change path
标记的位置根据您想要的路径更改路径。如果您更改了路径,您可以 运行 代码:
# create lists
mse_samp <- list("mse_A" = list("P10" = data.frame(number = seq(1,3,1),
mse_1 = c(2.5, 4.6, 7.8),
mse_2 = c(6.7, 8.9, 4.1)),
"P30" = data.frame(number = seq(1,3,1),
mse_1 = c(22.5, 74.6, 97.8),
mse_2 = c(56.7, 78.9, 14.1))),
"mse_B" = list("P10" = data.frame(number = seq(1,3,1),
mse_1 = c(122.5, 124.6, 127.8),
mse_2 = c(126.7, 128.9, 124.1)),
"P30" = data.frame(number = seq(1,3,1),
mse_1 = c(3422.5, 3474.6, 3497.8),
mse_2 = c(3456.7, 3478.9, 3414.1))))
# !!! change path
save(mse_samp,
file="H:\R\Forum_data\dat1.RData")
mse_samp <- list("mse_A" = list("P70" = data.frame(number = seq(1,3,1),
mse_1 = c(22.2, 77.6, 97.8, 21.2, 73.9),
mse_2 = c(26.7, 78.9, 17.1, 23.2, 82.2)),
"P80" = data.frame(number = seq(1,3,1),
mse_1 = c(1022.2, 3077.6, 9097.8, 1221.2, 7373.9),
mse_2 = c(7626.7, 2278.9, 7317.1, 7623.2, 8982.2))),
"mse_B" = list("P70" = data.frame(number = seq(1,3,1),
mse_1 = c(3722.2, 3777.6, 3797.8),
mse_2 = c(3726.7, 3778.9, 3717.1)),
"P80" = data.frame(number = seq(1,3,1),
mse_1 = c(1022.2, 3077.6, 9097.8),
mse_2 = c(7626.7, 2278.9, 7317.1))))
save(mse_samp,
file="H:\R\Forum_data\dat2.RData")
# create table for min max for different perc and runs for each paramter (loop)
n_measure <- 3 # number of different measures
npr1 <- 2 # number of different percs run1
npr2 <- 2 # number of different percs run2
targets <- c("A", "B")
for (i in 1:length(targets)) {
df <- data.frame(run = c(rep("run1", n_measure * npr1),
rep("run2", n_measure * npr2)),
perc_train = c(rep(c(0.1, 0.3), times = 1, each = n_measure), # percs run 1
rep(c(0.7, 0.8), times = 1, each = n_measure)), # percs run 2
measure = c(rep(c("min", "max", "50percentile"),
times = npr1 + npr2, each = 1)),
rmse_1 = rep(NA, n_measure * (npr1 + npr2)),
rmse_2 = rep(NA, n_measure * (npr1 + npr2))
)
assign(paste0('df_', targets[i]), df)
}
df <- list("A" = df_A, "B" = df_B)
# convert column which are factors to characters
for (i in 1:length(targets)) {
df[[i]][sapply(df[[i]], is.factor)] <- lapply(df[[i]][sapply(df[[i]], is.factor)],
as.character)
}
rm(list = c("df_A", "df_B", "df_C"))
# !!! change path
path <- c("H:\R\Forum_data\dat1.RData", # run1
# !!! change path
"H:\R\Forum_data\dat2.RData") # run2
percs_names <- c("P10", "P30", "P70", "P80")
percs <- c(0.1, 0.3, 0.7, 0.8)
targets <- c("A", "B")
run_name <- c("run1", "run2")
measure_name <- c("min", "max", "50percentile")
fill_names <- c("rmse_min_1", "rmse_min_2", "rmse_max_1", "rmse_max_2",
"percentile_50_1", "percentile_50_2")
var_name <- c("rmse_1", "rmse_2")
a_or_b <- c("a","b")
# read in data
for (i in 1:length(path)) {
load(path[i])
dat <- mse_samp
for (j in 1:length(targets)) {
for (k in 1:length(percs_names)) {
# if statement
if(percs_names[k] == names(dat[[j]][k])){
dat1 <- dat[[paste0("mse_", targets[j])]][k][[1]]
rmse_min_1 <- sqrt(min(dat1$mse_1))
rmse_min_2 <- sqrt(min(dat1$mse_2))
rmse_max_1 <- sqrt(max(dat1$mse_1))
rmse_max_2 <- sqrt(max(dat1$mse_2))
percentile_50_1 <- quantile(sqrt(dat1$mse_1), probs = 0.5)
percentile_50_2 <- quantile(sqrt(dat1$mse_2), probs = 0.5)
for (fi in 1:length(fill_names)) {
for (m in 1:length(measure_name)) {
a <- which(df[[targets[j]]]$run == run_name[i] &
df[[targets[j]]]$measure == measure_name[m] &
df[[targets[j]]]$perc_train == percs[k] &
is.na(df[[targets[j]]]$rmse_1)
)
b <- which(df[[targets[j]]]$run == run_name[i] &
df[[targets[j]]]$measure == measure_name[m] &
df[[targets[j]]]$perc_train == percs[k] &
is.na(df[[targets[j]]]$rmse_2)
)
for (v in 1:length(var_name)) {
df[[targets[j]]][eval(parse(text = a_or_b[v])), which(names(df[[targets[j]]]) == var_name[v])] <- eval(parse(text = fill_names[fi]))
}
}
}
}
else { next }
}
}
}
1. issue 运行ning 代码后,出现如下错误信息:
Error in if (percs_names[k] == names(dat[[j]][k])) { :
missing value where TRUE/FALSE needed
我猜错误可能是由if else
语句引起的。我怎样才能 运行 没有错误的代码?
2。 issue 目前只有 run1
的行被填充。 rmse_1
和 rmse_2
在行 min
、max
、50percentile
中填充了相同的值。他们应该是不同的。如何填写其他 运行 并正确填写行?最后应该没有NA
了。
虽然您坚持使用 for
-loops 这里是解决您的问题的方法 map
(类似于 lapply)和一些 tidyverse
-magic。
我有一个假设:您处理的所有数据集都存储在一个名为 data_runs_list
的列表中。在 Data 部分的答案末尾给出了一个示例(使用您的示例数据)。
所以让我们首先以更易读的格式引入这个嵌套结构:
library(tidyverse)
library(stringr)
data_runs_df <-
map(data_runs_list, ~ map(.x, bind_rows, .id = "perc") %>%
bind_rows(.id = "target")) %>%
bind_rows(.id = "run")
data_runs_df
# A tibble: 24 x 6
# run target perc number mse_1 mse_2
# <chr> <chr> <chr> <int> <dbl> <dbl>
# 1 run1 mse_A P10 1 2.5 6.7
# 2 run1 mse_A P10 2 4.6 8.9
# 3 run1 mse_A P10 3 7.8 4.1
# 4 run1 mse_A P30 1 22.5 56.7
# 5 run1 mse_A P30 2 74.6 78.9
# 6 run1 mse_A P30 3 97.8 14.1
# 7 run1 mse_B P10 1 122. 127.
# 8 run1 mse_B P10 2 125. 129.
# 9 run1 mse_B P10 3 128. 124.
# 10 run1 mse_B P30 1 3422. 3457.
# # ... with 14 more rows
为了更好地理解 bind_rows()
的作用,只取列表第一个条目的第一个条目,看看会发生什么:
bind_rows(data_runs_list[[1]][[1]], .id = "perc")
# perc number mse_1 mse_2
# 1 P10 1 2.5 6.7
# 2 P10 2 4.6 8.9
# 3 P10 3 7.8 4.1
# 4 P30 1 22.5 56.7
# 5 P30 2 74.6 78.9
# 6 P30 3 97.8 14.1
两个数据框堆叠在一起,id-column perc
保留原始列表名称。 map
然后依次应用于列表的每个级别 bind_row
,在每个级别上都有不同的 id 列。
所以这是一件可以很好地工作的东西。您想要 min
、max
和每个 运行、百分比、目标的 max
和 50% 分位数(即 median
),并分别针对两个测量值 mse_1
和 mse_2
。 group_by
与 summarize
完美结合。为了更好地处理两种不同的测量,首先将数据转换为长格式。如果您有更多测量值,您可以在 gather
-调用结束时指定它们:
data_runs_df <- data_runs_df %>%
gather(mse, value, mse_1, mse_2)
data_runs_df
# A tibble: 48 x 6
# run target perc number mse value
# <chr> <chr> <chr> <int> <chr> <dbl>
# 1 run1 mse_A P10 1 mse_1 2.5
# 2 run1 mse_A P10 2 mse_1 4.6
# 3 run1 mse_A P10 3 mse_1 7.8
# 4 run1 mse_A P30 1 mse_1 22.5
# 5 run1 mse_A P30 2 mse_1 74.6
# 6 run1 mse_A P30 3 mse_1 97.8
# 7 run1 mse_B P10 1 mse_1 122.
# 8 run1 mse_B P10 2 mse_1 125.
# 9 run1 mse_B P10 3 mse_1 128.
# 10 run1 mse_B P30 1 mse_1 3422.
# ... with 38 more rows
现在,在计算测量值之前,我们快速重命名目标和 mse 列,然后将 group_by
与 summarize
结合使用:
data_info <- data_runs_df %>%
mutate(mse = str_c("r", mse),
target = str_remove(target, "mse_")) %>%
group_by(run, target, perc, mse) %>%
summarize(min = min(sqrt(value)),
max = max(sqrt(value)),
median = median(sqrt(value)))
data_info
# A tibble: 16 x 7
# Groups: run, target, perc [?]
# run target perc mse min max median
# <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
# 1 run1 A P10 rmse_1 1.58 2.79 2.14
# 2 run1 A P10 rmse_2 2.02 2.98 2.59
# 3 run1 A P30 rmse_1 4.74 9.89 8.64
# 4 run1 A P30 rmse_2 3.75 8.88 7.53
# 5 run1 B P10 rmse_1 11.1 11.3 11.2
# 6 run1 B P10 rmse_2 11.1 11.4 11.3
# 7 run1 B P30 rmse_1 58.5 59.1 58.9
# 8 run1 B P30 rmse_2 58.4 59.0 58.8
# 9 run2 A P70 rmse_1 4.71 9.89 8.81
# 10 run2 A P70 rmse_2 4.14 8.88 5.17
# 11 run2 A P80 rmse_1 32.0 95.4 55.5
# 12 run2 A P80 rmse_2 47.7 87.3 85.5
# 13 run2 B P70 rmse_1 61.0 61.6 61.5
# 14 run2 B P70 rmse_2 61.0 61.5 61.0
# 15 run2 B P80 rmse_1 32.0 95.4 55.5
# 16 run2 B P80 rmse_2 47.7 87.3 85.5
现在最后一步,让一切都完全符合您的要求,我们需要 gather
及其对应物 spread
:
data_info <- data_info %>%
gather(measure, value, min, max, median) %>%
spread(mse, value)
data_info
# A tibble: 24 x 6
# Groups: run, target, perc [8]
# run target perc measure rmse_1 rmse_2
# <chr> <chr> <chr> <chr> <dbl> <dbl>
# 1 run1 A P10 max 2.79 2.98
# 2 run1 A P10 median 2.14 2.59
# 3 run1 A P10 min 1.58 2.02
# 4 run1 A P30 max 9.89 8.88
# 5 run1 A P30 median 8.64 7.53
# 6 run1 A P30 min 4.74 3.75
# 7 run1 B P10 max 11.3 11.4
# 8 run1 B P10 median 11.2 11.3
# 9 run1 B P10 min 11.1 11.1
# 10 run1 B P30 max 59.1 59.0
# ... with 14 more rows
每次调用两次:
data_runs_df <-
map(data_runs_list, ~ map(.x, bind_rows, .id = "perc") %>%
bind_rows(.id = "target")) %>%
bind_rows(.id = "run")
data_info <- data_runs_df %>%
gather(mse, value, mse_1, mse_2) %>%
mutate(mse = str_c("r", mse),
target = str_remove(target, "mse_")) %>%
group_by(run, target, perc, mse) %>%
summarize(min = min(sqrt(value)),
max = max(sqrt(value)),
median = median(sqrt(value))) %>%
gather(measure, value, min, max, median) %>%
spread(mse, value)
如果您坚持使用您使用的列表格式,您可以这样做:
data_info_list <- map(c("A", "B"), function(x) filter(data_info, target == x))
names(data_info_list) <- c("A", "B")
数据
mse_samp1 <-
list("mse_A" = list("P10" = data.frame(number = 1:3, mse_1 = c(2.5, 4.6, 7.8), mse_2 = c(6.7, 8.9, 4.1)),
"P30" = data.frame(number = 1:3, mse_1 = c(22.5, 74.6, 97.8), mse_2 = c(56.7, 78.9, 14.1))),
"mse_B" = list("P10" = data.frame(number = 1:3, mse_1 = c(122.5, 124.6, 127.8), mse_2 = c(126.7, 128.9, 124.1)),
"P30" = data.frame(number = 1:3, mse_1 = c(3422.5, 3474.6, 3497.8), mse_2 = c(3456.7, 3478.9, 3414.1))))
mse_samp2 <-
list("mse_A" = list("P70" = data.frame(number = 1:3, mse_1 = c(22.2, 77.6, 97.8), mse_2 = c(26.7, 78.9, 17.1)),
"P80" = data.frame(number = 1:3, mse_1 = c(1022.2, 3077.6, 9097.8), mse_2 = c(7626.7, 2278.9, 7317.1))),
"mse_B" = list("P70" = data.frame(number = 1:3, mse_1 = c(3722.2, 3777.6, 3797.8), mse_2 = c(3726.7, 3778.9, 3717.1)),
"P80" = data.frame(number = 1:3, mse_1 = c(1022.2, 3077.6, 9097.8), mse_2 = c(7626.7, 2278.9, 7317.1))))
data_runs_list <- list(run1 = mse_samp1, run2 = mse_samp2)