您如何在不编写单独代码的情况下重复过滤数据集然后运行回归?
How do you repeat on filtering datasets and then running regressions without writing out individual code?
如何在不编写单独代码的情况下重复过滤数据集然后运行回归?
我想对 mtcars 数据运行线性回归:使用全部 mtcars 数据,自变量(IV)是 mtcars$am,因变量(DV)是 mtcars$mpg。
然后我想使用分组变量 mtcars$gear 创建 3 个数据集,其中 mtcars$gear 分别为 3、4 或 5,然后分别对这 3 个数据集运行回归。
我目前使用的长流程如下。
感兴趣变量的唯一值:
## variables of interest
unique(mtcars$mpg)
# ---- NOTE: DV (dependent variable) is mpg
unique(mtcars$am)
# ---- NOTE: IV (independent variable) is am
unique(mtcars$gear)
# ---- NOTE: grouping variable is gear
这是我用于回归的基线代码:
## Baseline model: regress mpg (DV) on am (IV) using every row of mtcars
lm__am_on_mpg__mtcars <- lm(formula = mpg ~ am, data = mtcars)
summary(lm__am_on_mpg__mtcars)
然后我使用 tidyverse
包中的 filter()
命令创建了 3 个数据集,其中 mtcars$gear
是 3、4 或 5
### Print the structure of each gear-specific dataset (gear 3, 4, 5, in that order)
invisible(lapply(
  list(mtcars__gear_is_3, mtcars__gear_is_4, mtcars__gear_is_5),
  str
))
然后我创建了 3 个回归,其基本结构与上面的基本回归相同,但不同的数据集与不同的 mtcars$gear
水平相连。
#### Model fitted on the gear == 3 subset
lm__am_on_mpg__mtcars__gear_is_3 <- lm(formula = mpg ~ am, data = mtcars__gear_is_3)
summary(lm__am_on_mpg__mtcars__gear_is_3)
#### Model fitted on the gear == 4 subset
lm__am_on_mpg__mtcars__gear_is_4 <- lm(formula = mpg ~ am, data = mtcars__gear_is_4)
summary(lm__am_on_mpg__mtcars__gear_is_4)
#### Model fitted on the gear == 5 subset
lm__am_on_mpg__mtcars__gear_is_5 <- lm(formula = mpg ~ am, data = mtcars__gear_is_5)
summary(lm__am_on_mpg__mtcars__gear_is_5)
这似乎可行,但代码量也很大。我觉得这可以用更简洁的代码来完成。我想知道能否通过以下方式加快这个过程:
(A) 使用 tidyverse 的过滤方法,以更简短的方式创建不同的数据集
(B) 以更简短的方式创建不同的回归,只在需要的地方替换数据集名称
而无需把所有代码逐条手写出来。
这是我的问题:
(1) 这在 R 中一般可以做到吗?
(2) 这对数据集来说可能吗?
(2.1) 如果是,怎么办?
(3) 回归可能吗?
(3.1) 如果是,怎么做?
====================
这是我目前用冗长方式完成此任务的 R 代码:
# How do you repeat on filtering datasets and then running regressions in R without writing out individual code?
## dataset of interest
mtcars
### info about dataset
head(mtcars)
str(mtcars)
# colnames() lists the column names; columns() is not a base R function
colnames(mtcars)
## variables of interest
unique(mtcars$mpg)
# ---- NOTE: DV (dependent variable) is mpg
unique(mtcars$am)
# ---- NOTE: IV (independent variable) is am
unique(mtcars$gear)
# ---- NOTE: grouping variable is gear
## linear regression with all data: mpg (DV) regressed on am (IV)
lm__am_on_mpg__mtcars <- lm(mpg ~ am, data = mtcars)
summary(lm__am_on_mpg__mtcars)
## filter data based on mtcars$gear
### loads tidyverse
library(tidyverse)
### when mtcars$gear == 3
#### creates filtered dataset
# ---- NOTE: starting dataset - mtcars
# ---- NOTE: ending dataset - mtcars__gear_is_3
# ---- NOTE: filter variable - gear
# ---- NOTE: filter variable value(s) - 3
##### starting dataset
str(mtcars)
##### unique values of starting dataset$filter
unique(mtcars$gear)
##### filters data into post-filter dataset
# gear is numeric, so compare against the number 3 rather than the string "3"
# (the string form only matched through implicit type coercion).
# dplyr::filter is namespaced so stats::filter can never be picked up by mistake.
mtcars__gear_is_3 <- dplyr::filter(mtcars, gear == 3)
##### turns post-filter dataset into data frame
mtcars__gear_is_3 <- data.frame(mtcars__gear_is_3)
##### post-filter dataset
str(mtcars__gear_is_3)
##### unique values of post-filter dataset$filter
unique(mtcars__gear_is_3$gear)
### when mtcars$gear == 4
#### creates filtered dataset
# ---- NOTE: starting dataset - mtcars
# ---- NOTE: ending dataset - mtcars__gear_is_4
# ---- NOTE: filter variable - gear
# ---- NOTE: filter variable value(s) - 4
##### starting dataset
str(mtcars)
##### unique values of starting dataset$filter
unique(mtcars$gear)
##### filters data into post-filter dataset
# gear is numeric, so compare against the number 4 rather than the string "4"
# (the string form only matched through implicit type coercion).
# dplyr::filter is namespaced so stats::filter can never be picked up by mistake.
mtcars__gear_is_4 <- dplyr::filter(mtcars, gear == 4)
##### turns post-filter dataset into data frame
mtcars__gear_is_4 <- data.frame(mtcars__gear_is_4)
##### post-filter dataset
str(mtcars__gear_is_4)
##### unique values of post-filter dataset$filter
unique(mtcars__gear_is_4$gear)
### when mtcars$gear == 5
#### creates filtered dataset
# ---- NOTE: starting dataset - mtcars
# ---- NOTE: ending dataset - mtcars__gear_is_5
# ---- NOTE: filter variable - gear
# ---- NOTE: filter variable value(s) - 5
##### starting dataset
str(mtcars)
##### unique values of starting dataset$filter
unique(mtcars$gear)
##### filters data into post-filter dataset
# gear is numeric, so compare against the number 5 rather than the string "5"
# (the string form only matched through implicit type coercion).
# dplyr::filter is namespaced so stats::filter can never be picked up by mistake.
mtcars__gear_is_5 <- dplyr::filter(mtcars, gear == 5)
##### turns post-filter dataset into data frame
mtcars__gear_is_5 <- data.frame(mtcars__gear_is_5)
##### post-filter dataset
str(mtcars__gear_is_5)
##### unique values of post-filter dataset$filter
unique(mtcars__gear_is_5$gear)
## Per-gear regressions: the same mpg ~ am model, fitted on each filtered dataset
### Show the structure of the three filtered datasets (gear 3, 4, 5, in that order)
invisible(lapply(
  list(mtcars__gear_is_3, mtcars__gear_is_4, mtcars__gear_is_5),
  str
))
#### Model fitted on the gear == 3 subset
lm__am_on_mpg__mtcars__gear_is_3 <- lm(formula = mpg ~ am, data = mtcars__gear_is_3)
summary(lm__am_on_mpg__mtcars__gear_is_3)
#### Model fitted on the gear == 4 subset
lm__am_on_mpg__mtcars__gear_is_4 <- lm(formula = mpg ~ am, data = mtcars__gear_is_4)
summary(lm__am_on_mpg__mtcars__gear_is_4)
#### Model fitted on the gear == 5 subset
lm__am_on_mpg__mtcars__gear_is_5 <- lm(formula = mpg ~ am, data = mtcars__gear_is_5)
summary(lm__am_on_mpg__mtcars__gear_is_5)
我会使用 tidyr 包的 nest() 函数(随 tidyverse 一起加载,并与 dplyr 的 group_by() 搭配使用)。
查看更多信息 here。
# Load the tidyverse. The argument name is spelled out in full (no partial
# matching) and FALSE is used instead of the reassignable shorthand F.
library(tidyverse, warn.conflicts = FALSE)
# Nest data by gear: one row per gear value, with that gear's rows stored
# as a tibble in the list-column `data`
df <- mtcars %>%
  group_by(gear) %>%
  nest()
# Inspect the nested per-gear tibbles
df$data
#> [[1]]
#> # A tibble: 12 x 10
#> mpg cyl disp hp drat wt qsec vs am carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 1
#> 4 24.4 4 147. 62 3.69 3.19 20 1 0 2
#> 5 22.8 4 141. 95 3.92 3.15 22.9 1 0 2
#> 6 19.2 6 168. 123 3.92 3.44 18.3 1 0 4
#> 7 17.8 6 168. 123 3.92 3.44 18.9 1 0 4
#> 8 32.4 4 78.7 66 4.08 2.2 19.5 1 1 1
#> 9 30.4 4 75.7 52 4.93 1.62 18.5 1 1 2
#> 10 33.9 4 71.1 65 4.22 1.84 19.9 1 1 1
#> 11 27.3 4 79 66 4.08 1.94 18.9 1 1 1
#> 12 21.4 4 121 109 4.11 2.78 18.6 1 1 2
#>
#> [[2]]
#> # A tibble: 15 x 10
#> mpg cyl disp hp drat wt qsec vs am carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21.4 6 258 110 3.08 3.22 19.4 1 0 1
#> 2 18.7 8 360 175 3.15 3.44 17.0 0 0 2
#> 3 18.1 6 225 105 2.76 3.46 20.2 1 0 1
#> 4 14.3 8 360 245 3.21 3.57 15.8 0 0 4
#> 5 16.4 8 276. 180 3.07 4.07 17.4 0 0 3
#> 6 17.3 8 276. 180 3.07 3.73 17.6 0 0 3
#> 7 15.2 8 276. 180 3.07 3.78 18 0 0 3
#> 8 10.4 8 472 205 2.93 5.25 18.0 0 0 4
#> 9 10.4 8 460 215 3 5.42 17.8 0 0 4
#> 10 14.7 8 440 230 3.23 5.34 17.4 0 0 4
#> 11 21.5 4 120. 97 3.7 2.46 20.0 1 0 1
#> 12 15.5 8 318 150 2.76 3.52 16.9 0 0 2
#> 13 15.2 8 304 150 3.15 3.44 17.3 0 0 2
#> 14 13.3 8 350 245 3.73 3.84 15.4 0 0 4
#> 15 19.2 8 400 175 3.08 3.84 17.0 0 0 2
#>
#> [[3]]
#> # A tibble: 5 x 10
#> mpg cyl disp hp drat wt qsec vs am carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 26 4 120. 91 4.43 2.14 16.7 0 1 2
#> 2 30.4 4 95.1 113 3.77 1.51 16.9 1 1 2
#> 3 15.8 8 351 264 4.22 3.17 14.5 0 1 4
#> 4 19.7 6 145 175 3.62 2.77 15.5 0 1 6
#> 5 15 8 301 335 3.54 3.57 14.6 0 1 8
#' Fit the regression of mpg on am for one data set
#'
#' @param x A data frame containing `mpg` and `am` columns.
#' @return The fitted `lm` object.
mod <- function(x) lm(formula = mpg ~ am, data = x)
# Apply mod() to every nested data frame in df$data; the result is a list
# holding one fitted model per gear group
map(df$data, mod)
#> [[1]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 21.050 5.225
#>
#>
#> [[2]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 16.11 NA
#>
#>
#> [[3]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 21.38 NA
# Store each group's fitted model next to its data in a new list-column `model`
df <- df %>%
  mutate(model = map(data, mod))
# Extract the models by column name rather than by position, so the lookup
# keeps working if columns are added or reordered
df$model
#> [[1]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 21.050 5.225
#>
#>
#> [[2]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 16.11 NA
#>
#>
#> [[3]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 21.38 NA
由 reprex package (v0.3.0)
于 2021 年 1 月 15 日创建
也许你可以通过这样的方式实现你的目标:
library(data.table)
dt <- as.data.table(mtcars)
# Fit one model per distinct gear value. Subsetting inside an ordinary
# function replaces building code as strings and running it through
# eval(parse(text = ...)), which is fragile and hard to debug.
gears <- unique(dt[, gear])
l <- lapply(gears, function(g) lm(mpg ~ am, data = dt[gear == g]))
# Label each model with the gear value it was fitted on; positional access
# such as l[[1]] keeps working.
names(l) <- paste0("gear_", gears)
要查看所有模型,只需使用:
l
或查看其中一种模型的摘要:
# The fitted models are stored in the list `l`; `lm` is the fitting function
# itself and cannot be subset with [[ ]]
summary(l[[1]])
如何在不编写单独代码的情况下重复过滤数据集然后运行回归?
我想对 mtcars 数据运行线性回归:使用全部 mtcars 数据,自变量(IV)是 mtcars$am,因变量(DV)是 mtcars$mpg。
然后我想使用分组变量 mtcars$gear 创建 3 个数据集,其中 mtcars$gear 分别为 3、4 或 5,然后分别对这 3 个数据集运行回归。
我目前使用的长流程如下。
感兴趣变量的唯一值:
## variables of interest
unique(mtcars$mpg)
# ---- NOTE: DV (dependent variable) is mpg
unique(mtcars$am)
# ---- NOTE: IV (independent variable) is am
unique(mtcars$gear)
# ---- NOTE: grouping variable is gear
这是我用于回归的基线代码:
## Baseline model: regress mpg (DV) on am (IV) using every row of mtcars
lm__am_on_mpg__mtcars <- lm(formula = mpg ~ am, data = mtcars)
summary(lm__am_on_mpg__mtcars)
然后我使用 tidyverse
包中的 filter()
命令创建了 3 个数据集,其中 mtcars$gear
是 3、4 或 5
### Print the structure of each gear-specific dataset (gear 3, 4, 5, in that order)
invisible(lapply(
  list(mtcars__gear_is_3, mtcars__gear_is_4, mtcars__gear_is_5),
  str
))
然后我创建了 3 个回归,其基本结构与上面的基本回归相同,但不同的数据集与不同的 mtcars$gear
水平相连。
#### Model fitted on the gear == 3 subset
lm__am_on_mpg__mtcars__gear_is_3 <- lm(formula = mpg ~ am, data = mtcars__gear_is_3)
summary(lm__am_on_mpg__mtcars__gear_is_3)
#### Model fitted on the gear == 4 subset
lm__am_on_mpg__mtcars__gear_is_4 <- lm(formula = mpg ~ am, data = mtcars__gear_is_4)
summary(lm__am_on_mpg__mtcars__gear_is_4)
#### Model fitted on the gear == 5 subset
lm__am_on_mpg__mtcars__gear_is_5 <- lm(formula = mpg ~ am, data = mtcars__gear_is_5)
summary(lm__am_on_mpg__mtcars__gear_is_5)
这似乎可行,但代码量也很大。我觉得这可以用更简洁的代码来完成。我想知道能否通过以下方式加快这个过程:
(A) 使用 tidyverse 的过滤方法,以更简短的方式创建不同的数据集
(B) 以更简短的方式创建不同的回归,只在需要的地方替换数据集名称
而无需把所有代码逐条手写出来。
这是我的问题: (1) 这在 R 中一般可以做到吗? (2) 这对数据集来说可能吗? (2.1) 如果是,怎么办? (3) 回归可能吗? (3.1) 如果是,怎么做?
====================
这是我目前用冗长方式完成此任务的 R 代码:
# How do you repeat on filtering datasets and then running regressions in R without writing out individual code?
## dataset of interest
mtcars
### info about dataset
head(mtcars)
str(mtcars)
# colnames() lists the column names; columns() is not a base R function
colnames(mtcars)
## variables of interest
unique(mtcars$mpg)
# ---- NOTE: DV (dependent variable) is mpg
unique(mtcars$am)
# ---- NOTE: IV (independent variable) is am
unique(mtcars$gear)
# ---- NOTE: grouping variable is gear
## linear regression with all data: mpg (DV) regressed on am (IV)
lm__am_on_mpg__mtcars <- lm(mpg ~ am, data = mtcars)
summary(lm__am_on_mpg__mtcars)
## filter data based on mtcars$gear
### loads tidyverse
library(tidyverse)
### when mtcars$gear == 3
#### creates filtered dataset
# ---- NOTE: starting dataset - mtcars
# ---- NOTE: ending dataset - mtcars__gear_is_3
# ---- NOTE: filter variable - gear
# ---- NOTE: filter variable value(s) - 3
##### starting dataset
str(mtcars)
##### unique values of starting dataset$filter
unique(mtcars$gear)
##### filters data into post-filter dataset
# gear is numeric, so compare against the number 3 rather than the string "3"
# (the string form only matched through implicit type coercion).
# dplyr::filter is namespaced so stats::filter can never be picked up by mistake.
mtcars__gear_is_3 <- dplyr::filter(mtcars, gear == 3)
##### turns post-filter dataset into data frame
mtcars__gear_is_3 <- data.frame(mtcars__gear_is_3)
##### post-filter dataset
str(mtcars__gear_is_3)
##### unique values of post-filter dataset$filter
unique(mtcars__gear_is_3$gear)
### when mtcars$gear == 4
#### creates filtered dataset
# ---- NOTE: starting dataset - mtcars
# ---- NOTE: ending dataset - mtcars__gear_is_4
# ---- NOTE: filter variable - gear
# ---- NOTE: filter variable value(s) - 4
##### starting dataset
str(mtcars)
##### unique values of starting dataset$filter
unique(mtcars$gear)
##### filters data into post-filter dataset
# gear is numeric, so compare against the number 4 rather than the string "4"
# (the string form only matched through implicit type coercion).
# dplyr::filter is namespaced so stats::filter can never be picked up by mistake.
mtcars__gear_is_4 <- dplyr::filter(mtcars, gear == 4)
##### turns post-filter dataset into data frame
mtcars__gear_is_4 <- data.frame(mtcars__gear_is_4)
##### post-filter dataset
str(mtcars__gear_is_4)
##### unique values of post-filter dataset$filter
unique(mtcars__gear_is_4$gear)
### when mtcars$gear == 5
#### creates filtered dataset
# ---- NOTE: starting dataset - mtcars
# ---- NOTE: ending dataset - mtcars__gear_is_5
# ---- NOTE: filter variable - gear
# ---- NOTE: filter variable value(s) - 5
##### starting dataset
str(mtcars)
##### unique values of starting dataset$filter
unique(mtcars$gear)
##### filters data into post-filter dataset
# gear is numeric, so compare against the number 5 rather than the string "5"
# (the string form only matched through implicit type coercion).
# dplyr::filter is namespaced so stats::filter can never be picked up by mistake.
mtcars__gear_is_5 <- dplyr::filter(mtcars, gear == 5)
##### turns post-filter dataset into data frame
mtcars__gear_is_5 <- data.frame(mtcars__gear_is_5)
##### post-filter dataset
str(mtcars__gear_is_5)
##### unique values of post-filter dataset$filter
unique(mtcars__gear_is_5$gear)
## Per-gear regressions: the same mpg ~ am model, fitted on each filtered dataset
### Show the structure of the three filtered datasets (gear 3, 4, 5, in that order)
invisible(lapply(
  list(mtcars__gear_is_3, mtcars__gear_is_4, mtcars__gear_is_5),
  str
))
#### Model fitted on the gear == 3 subset
lm__am_on_mpg__mtcars__gear_is_3 <- lm(formula = mpg ~ am, data = mtcars__gear_is_3)
summary(lm__am_on_mpg__mtcars__gear_is_3)
#### Model fitted on the gear == 4 subset
lm__am_on_mpg__mtcars__gear_is_4 <- lm(formula = mpg ~ am, data = mtcars__gear_is_4)
summary(lm__am_on_mpg__mtcars__gear_is_4)
#### Model fitted on the gear == 5 subset
lm__am_on_mpg__mtcars__gear_is_5 <- lm(formula = mpg ~ am, data = mtcars__gear_is_5)
summary(lm__am_on_mpg__mtcars__gear_is_5)
我会使用 tidyr 包的 nest() 函数(随 tidyverse 一起加载,并与 dplyr 的 group_by() 搭配使用)。
查看更多信息 here。
# Load the tidyverse. The argument name is spelled out in full (no partial
# matching) and FALSE is used instead of the reassignable shorthand F.
library(tidyverse, warn.conflicts = FALSE)
# Nest data by gear: one row per gear value, with that gear's rows stored
# as a tibble in the list-column `data`
df <- mtcars %>%
  group_by(gear) %>%
  nest()
# Inspect the nested per-gear tibbles
df$data
#> [[1]]
#> # A tibble: 12 x 10
#> mpg cyl disp hp drat wt qsec vs am carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 1
#> 4 24.4 4 147. 62 3.69 3.19 20 1 0 2
#> 5 22.8 4 141. 95 3.92 3.15 22.9 1 0 2
#> 6 19.2 6 168. 123 3.92 3.44 18.3 1 0 4
#> 7 17.8 6 168. 123 3.92 3.44 18.9 1 0 4
#> 8 32.4 4 78.7 66 4.08 2.2 19.5 1 1 1
#> 9 30.4 4 75.7 52 4.93 1.62 18.5 1 1 2
#> 10 33.9 4 71.1 65 4.22 1.84 19.9 1 1 1
#> 11 27.3 4 79 66 4.08 1.94 18.9 1 1 1
#> 12 21.4 4 121 109 4.11 2.78 18.6 1 1 2
#>
#> [[2]]
#> # A tibble: 15 x 10
#> mpg cyl disp hp drat wt qsec vs am carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21.4 6 258 110 3.08 3.22 19.4 1 0 1
#> 2 18.7 8 360 175 3.15 3.44 17.0 0 0 2
#> 3 18.1 6 225 105 2.76 3.46 20.2 1 0 1
#> 4 14.3 8 360 245 3.21 3.57 15.8 0 0 4
#> 5 16.4 8 276. 180 3.07 4.07 17.4 0 0 3
#> 6 17.3 8 276. 180 3.07 3.73 17.6 0 0 3
#> 7 15.2 8 276. 180 3.07 3.78 18 0 0 3
#> 8 10.4 8 472 205 2.93 5.25 18.0 0 0 4
#> 9 10.4 8 460 215 3 5.42 17.8 0 0 4
#> 10 14.7 8 440 230 3.23 5.34 17.4 0 0 4
#> 11 21.5 4 120. 97 3.7 2.46 20.0 1 0 1
#> 12 15.5 8 318 150 2.76 3.52 16.9 0 0 2
#> 13 15.2 8 304 150 3.15 3.44 17.3 0 0 2
#> 14 13.3 8 350 245 3.73 3.84 15.4 0 0 4
#> 15 19.2 8 400 175 3.08 3.84 17.0 0 0 2
#>
#> [[3]]
#> # A tibble: 5 x 10
#> mpg cyl disp hp drat wt qsec vs am carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 26 4 120. 91 4.43 2.14 16.7 0 1 2
#> 2 30.4 4 95.1 113 3.77 1.51 16.9 1 1 2
#> 3 15.8 8 351 264 4.22 3.17 14.5 0 1 4
#> 4 19.7 6 145 175 3.62 2.77 15.5 0 1 6
#> 5 15 8 301 335 3.54 3.57 14.6 0 1 8
#' Fit the regression of mpg on am for one data set
#'
#' @param x A data frame containing `mpg` and `am` columns.
#' @return The fitted `lm` object.
mod <- function(x) lm(formula = mpg ~ am, data = x)
# Apply mod() to every nested data frame in df$data; the result is a list
# holding one fitted model per gear group
map(df$data, mod)
#> [[1]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 21.050 5.225
#>
#>
#> [[2]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 16.11 NA
#>
#>
#> [[3]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 21.38 NA
# Store each group's fitted model next to its data in a new list-column `model`
df <- df %>%
  mutate(model = map(data, mod))
# Extract the models by column name rather than by position, so the lookup
# keeps working if columns are added or reordered
df$model
#> [[1]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 21.050 5.225
#>
#>
#> [[2]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 16.11 NA
#>
#>
#> [[3]]
#>
#> Call:
#> lm(formula = mpg ~ am, data = x)
#>
#> Coefficients:
#> (Intercept) am
#> 21.38 NA
由 reprex package (v0.3.0) 于 2021 年 1 月 15 日创建
也许你可以通过这样的方式实现你的目标:
library(data.table)
dt <- as.data.table(mtcars)
# Fit one model per distinct gear value. Subsetting inside an ordinary
# function replaces building code as strings and running it through
# eval(parse(text = ...)), which is fragile and hard to debug.
gears <- unique(dt[, gear])
l <- lapply(gears, function(g) lm(mpg ~ am, data = dt[gear == g]))
# Label each model with the gear value it was fitted on; positional access
# such as l[[1]] keeps working.
names(l) <- paste0("gear_", gears)
要查看所有模型,只需使用:
l
或查看其中一种模型的摘要:
# The fitted models are stored in the list `l`; `lm` is the fitting function
# itself and cannot be subset with [[ ]]
summary(l[[1]])