使用 purrr 标记许多模型的回归样本
Tag regression samples for many models using purrr
我想用 purrr
函数标记多元回归模型中使用的样本。借鉴 ,我可以临时实现此目标如下:
library(dplyr)
df <- mtcars %>%
mutate(disp = replace(hp, c(2, 3), NA)) %>%
mutate(wt = replace(wt, c(3, 4, 5), NA))
s1 <- lm(mpg ~ disp, data = df)
df$samp1 <- TRUE
df$samp1[na.action(s1)] <- FALSE
s2 <- lm(mpg ~ wt, data = df)
df$samp2 <- TRUE
df$samp2[na.action(s2)] <- FALSE
如何使用 purrr
将 samp1
和 samp2
添加到 df
?
这似乎太复杂了,但这是我能想到的。 (如果没有 运行 线性模型本身作为管道的一部分,这样做会更有效率,即只识别使用了哪些样本——这可能可以用 model.frame()
和一些适当的加入...
library(dplyr)
library(purrr)
library(broom)
library(tibble)
## same as before, but also convert rownames to a column
df <- mtcars %>%
mutate(disp = replace(hp, c(2, 3), NA),
wt = replace(wt, c(3, 4, 5), NA)) %>%
rownames_to_column("model")
## (1) set up vector of vars and give it names (for later .id=)
dd <- c("disp", "wt") %>%
setNames(c("samp1", "samp2")) %>%
## (2) construct formulas for lm
map(reformulate, response = "mpg") %>%
## (3) fit the lm
map(lm, data = df) %>%
## (4) generate fitted values
map_dfr(augment, newdata=df, .id="samp") %>%
select(samp, model, .fitted) %>%
## (5) identify which observations were *not* used
mutate(val = !is.na(.fitted)) %>%
## (6) pivot from one long column to two half-length columns
pivot_wider(names_from=samp, values_from=val, id_cols= model) %>%
## (7) add to df
full_join(df, by = "model")
此版本没有 运行 模型。
## helper function: returns logical vector of whether observation
## was included in model frame or not
drop_vec <- function(mf) {
nn <- attr(mf, "na.action")
incl <- rep(TRUE, nrow(mf) + length(nn))
incl[nn] <- FALSE
incl
}
## first few bits are the same as above
dd <- c("disp", "wt") %>%
setNames(c("samp1", "samp2")) %>%
map(reformulate, response = "mpg") %>%
## only construct model frames - don't run lm()
map(model.frame, data = df) %>%
## apply helper function
map(drop_vec) %>%
## stick them together
bind_cols(df)
我唯一不喜欢这个解决方案的是 samp
列在开头;将不得不大惊小怪才能将它们作为数据框中的 last 列。
across
应该有更简洁的方法来做到这一点,但最终可能会比它的价值更丑陋或更复杂。一种足够简单的方法是使用您想要的新列名称制作模型列表,为每个创建一个 samp*
列,然后减少连接到一个数据框中。最后一点有效,因为您知道您有所有相同的列可以加入。
library(dplyr)
mods <- list(samp1 = s1, samp2 = s2)
df_out <- purrr::imap(mods, function(mod, col) {
df %>%
tibble::rownames_to_column("id") %>%
mutate({{ col }} := id %in% names(na.action(mod)))
}) %>%
purrr::reduce(inner_join)
#> Joining, by = c("id", "mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")
head(df_out)
#> id mpg cyl disp hp drat wt qsec vs am gear carb samp1
#> 1 Mazda RX4 21.0 6 110 110 3.90 2.620 16.46 0 1 4 4 FALSE
#> 2 Mazda RX4 Wag 21.0 6 NA 110 3.90 2.875 17.02 0 1 4 4 TRUE
#> 3 Datsun 710 22.8 4 NA 93 3.85 NA 18.61 1 1 4 1 TRUE
#> 4 Hornet 4 Drive 21.4 6 110 110 3.08 NA 19.44 1 0 3 1 FALSE
#> 5 Hornet Sportabout 18.7 8 175 175 3.15 NA 17.02 0 0 3 2 FALSE
#> 6 Valiant 18.1 6 105 105 2.76 3.460 20.22 1 0 3 1 FALSE
#> samp2
#> 1 FALSE
#> 2 FALSE
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
#> 6 FALSE
如果您想走更重的潮汐路线,您可能会找到一些线索的帖子是 and
我还不太了解,但这里有一个使用自定义函数的简洁方法:
flag_use <- function(df, model, name) {
mutate(df, {{name}} := !row_number() %in% na.action( {{model}} ))
}
df %>%
flag_use(s1, "samp1") %>%
flag_use(s2, "samp2")
我想用 purrr
函数标记多元回归模型中使用的样本。借鉴
library(dplyr)
df <- mtcars %>%
mutate(disp = replace(hp, c(2, 3), NA)) %>%
mutate(wt = replace(wt, c(3, 4, 5), NA))
s1 <- lm(mpg ~ disp, data = df)
df$samp1 <- TRUE
df$samp1[na.action(s1)] <- FALSE
s2 <- lm(mpg ~ wt, data = df)
df$samp2 <- TRUE
df$samp2[na.action(s2)] <- FALSE
如何使用 purrr
将 samp1
和 samp2
添加到 df
?
这似乎太复杂了,但这是我能想到的。 (如果没有 运行 线性模型本身作为管道的一部分,这样做会更有效率,即只识别使用了哪些样本——这可能可以用 model.frame()
和一些适当的加入...
library(dplyr)
library(purrr)
library(broom)
library(tibble)
## same as before, but also convert rownames to a column
df <- mtcars %>%
mutate(disp = replace(hp, c(2, 3), NA),
wt = replace(wt, c(3, 4, 5), NA)) %>%
rownames_to_column("model")
## (1) set up vector of vars and give it names (for later .id=)
dd <- c("disp", "wt") %>%
setNames(c("samp1", "samp2")) %>%
## (2) construct formulas for lm
map(reformulate, response = "mpg") %>%
## (3) fit the lm
map(lm, data = df) %>%
## (4) generate fitted values
map_dfr(augment, newdata=df, .id="samp") %>%
select(samp, model, .fitted) %>%
## (5) identify which observations were *not* used
mutate(val = !is.na(.fitted)) %>%
## (6) pivot from one long column to two half-length columns
pivot_wider(names_from=samp, values_from=val, id_cols= model) %>%
## (7) add to df
full_join(df, by = "model")
此版本没有 运行 模型。
## helper function: returns logical vector of whether observation
## was included in model frame or not
drop_vec <- function(mf) {
nn <- attr(mf, "na.action")
incl <- rep(TRUE, nrow(mf) + length(nn))
incl[nn] <- FALSE
incl
}
## first few bits are the same as above
dd <- c("disp", "wt") %>%
setNames(c("samp1", "samp2")) %>%
map(reformulate, response = "mpg") %>%
## only construct model frames - don't run lm()
map(model.frame, data = df) %>%
## apply helper function
map(drop_vec) %>%
## stick them together
bind_cols(df)
我唯一不喜欢这个解决方案的是 samp
列在开头;将不得不大惊小怪才能将它们作为数据框中的 last 列。
across
应该有更简洁的方法来做到这一点,但最终可能会比它的价值更丑陋或更复杂。一种足够简单的方法是使用您想要的新列名称制作模型列表,为每个创建一个 samp*
列,然后减少连接到一个数据框中。最后一点有效,因为您知道您有所有相同的列可以加入。
library(dplyr)
mods <- list(samp1 = s1, samp2 = s2)
df_out <- purrr::imap(mods, function(mod, col) {
df %>%
tibble::rownames_to_column("id") %>%
mutate({{ col }} := id %in% names(na.action(mod)))
}) %>%
purrr::reduce(inner_join)
#> Joining, by = c("id", "mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")
head(df_out)
#> id mpg cyl disp hp drat wt qsec vs am gear carb samp1
#> 1 Mazda RX4 21.0 6 110 110 3.90 2.620 16.46 0 1 4 4 FALSE
#> 2 Mazda RX4 Wag 21.0 6 NA 110 3.90 2.875 17.02 0 1 4 4 TRUE
#> 3 Datsun 710 22.8 4 NA 93 3.85 NA 18.61 1 1 4 1 TRUE
#> 4 Hornet 4 Drive 21.4 6 110 110 3.08 NA 19.44 1 0 3 1 FALSE
#> 5 Hornet Sportabout 18.7 8 175 175 3.15 NA 17.02 0 0 3 2 FALSE
#> 6 Valiant 18.1 6 105 105 2.76 3.460 20.22 1 0 3 1 FALSE
#> samp2
#> 1 FALSE
#> 2 FALSE
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
#> 6 FALSE
如果您想走更重的潮汐路线,您可能会找到一些线索的帖子是
我还不太了解,但这里有一个使用自定义函数的简洁方法:
flag_use <- function(df, model, name) {
mutate(df, {{name}} := !row_number() %in% na.action( {{model}} ))
}
df %>%
flag_use(s1, "samp1") %>%
flag_use(s2, "samp2")