如何创建一个 for 循环来为 R 中的 glm 进行多年组合?
How to create a for loop to go through multiple year combinations for a glm in R?
我有一个大型数据集,其中包含 Blue Rockfish 的存在和不存在 (0,1) 以及多个变量(在我的例子中,测深、曲率、东度、精细比例 BPI 和大比例 BPI)。
structure(list(Pres_Abs = c(1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L), CommonName = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "Blue Rockfish", class = "factor"), Survey_Yea = c(2009L,
2014L, 2005L, 2015L, 2006L, 2009L, 2014L, 2015L, 2015L, 2015L,
2005L, 2014L, 2015L, 2015L, 2014L, 2015L, 2015L, 2015L, 2015L,
2006L), ca_10mbath = c(-42.6913986, -36.0038986, -36.5155983,
-44.4014816, -39.3320007, -45.7226982, -47.9375, -51.5976982,
-37.7812996, -14.1093302, -70.5976562, -41.5121307, -48.4246902,
-46.0937996, -38.9961014, -46.375, -42.6913986, -60.96875, -46.375,
-37.6601601), ca10_bpi24 = c(-12L, -2L, -2L, -2L, -2L, -2L, 7L,
37L, -2L, 77L, -2L, -2L, 17L, 7L, -2L, -2L, -12L, -2L, -2L, 67L
), ca_10m_cur = c(-0.0859528, -0.0006409, -0.0068855, -0.5154228,
-0.0390663, -0.0078316, -0.0221901, 0.792961, 0, 4.3303394, 0.0429688,
-0.4405556, -0.1947556, 0.0195274, -0.230453, -0.0093803, -0.0859528,
-0.2148438, -0.0093803, 0.0976486), ca_10m_eas = c(0.727106,
0.887252, 0.565906, 0.9994883, 0.96552, 0.960033, 0.998732, 0.772206,
0.589553, -0.4134142, -0.8266082, -0.3659272, -0.7330094, 0.0329623,
0.998884, 0.271237, 0.727106, -0.5498384, 0.271237, 0.6424425
), ca10_bpi30 = c(-15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L,
-15L, 262L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L,
-15L)), row.names = c(2032L, 3801L, 479L, 4421L, 997L, 1551L,
3079L, 4657L, 5059L, 4104L, 261L, 2849L, 4460L, 4765L, 3535L,
4842L, 4950L, 4323L, 4833L, 752L), class = "data.frame")
此外,我有多年的数据(2005、2006、2007、2009、2014、2015)。我基本上想创建一个 glm
Pres_Abs~bathy+curvature+eastness+broadscale+finescale, data=Blue_allyears, family=binomial(link=logit))
这会经历各个年份的组合。因此,在 1 年的水平上,我使用 2005 年的数据创建了 glms,然后是 2006 年的数据,然后是 2007 年的数据等。在该代码中,我保存了 AIC、残差和零偏差、卡方、p 等数据价值等
这是我用来循环第一年数据的代码(改编自 Whosebug 上的其他人):
results <- data.frame()
for(Survey_Yea in unique(Blue_allyears$Survey_Yea)){
# dynamically generate formula
fmla <- as.formula(Pres_Abs~ca_10mbath+ca_10m_cur+ca_10m_eas+ca10_bpi30+ca10_bpi24)
# fit glm model
fit<-glm(fmla,data=Blue_allyears[Blue_allyears$Survey_Yea == Survey_Yea,],family=binomial(link=logit))
## capture summary stats
AIC <- AIC(fit)
Deviance <- deviance(fit)
NullDeviance <- fit$null.deviance
null_minus_dev<-NullDeviance-Deviance
df.residual<- fit$df.residual
df.null<-fit$df.null
df.null.minus.df.residual<-df.null-df.residual
pvalue<- with(fit, 1-pchisq(null_minus_dev , df.null.minus.df.residual))
Years<-"1"
# get coefficents of fit
cfit <- coef(summary(fit))
# create temporary data frame
df <- data.frame( Survey_Yea = Survey_Yea,
AIC = AIC(fit), Deviance = deviance(fit),NullDeviance = fit$null.deviance, null.minus.dev=NullDeviance-Deviance, df.residual= fit$df.residual , df.null=fit$df.null , df.null.minus.df.residual=df.null-df.residual, pvalue= pvalue, Years="1", stringsAsFactors = F)
# bind rows of temporary data frame to the results data frame
results <- rbind(results, df)
}
results
这段代码很棒,可以根据每年的数据创建 glms。
structure(list(Survey_Yea = c(2005L, 2006L, 2007L, 2009L, 2014L,
2015L), AIC = c(731.84838805646, 480.699964265887, 113.681123536743,
764.359566454308, 1482.05275641814, 1581.2853892652), Deviance = c(719.84838805646,
468.699964265887, 101.681123536743, 752.359566454308, 1470.05275641814,
1569.2853892652), NullDeviance = c(987.041585117362, 690.374591837705,
174.673089501106, 1059.1288918956, 2412.15218834861, 2012.89941234608
), null.minus.dev = c(267.193197060902, 221.674627571818, 72.991965964363,
306.769325441288, 942.099431930472, 443.614023080884), df.residual = c(706L,
492L, 120L, 758L, 1734L, 1446L), df.null = c(711L, 497L, 125L,
763L, 1739L, 1451L), df.null.minus.df.residual = c(5L, 5L, 5L,
5L, 5L, 5L), pvalue = c(0, 0, 2.44249065417534e-14, 0, 0, 0),
Years = c("1", "1", "1", "1", "1", "1")), row.names = c(NA,
-6L), class = "data.frame")
现在,我想查看两年的数据并创建 glms 并提取相关数据。因此,例如年迭代将是:
2005年和2006年
2005年和2007年
2005年和2009年
2005 年和 2014 年
2005 年和 2015 年
2006年和2007年
2006年和2009年
ETC....
2014 年和 2015 年
在用两年的数据完成此操作后,我想用三年的数据进行所有组合,等等,直到我开始使用所有年份的数据。
我试过添加另一个 for 循环或将 combn() 添加到我的代码中,但没有成功。
如有任何帮助,我们将不胜感激!
此外,这是我第一次发帖,如果您需要更多数据,请告诉我。谢谢!
考虑将所有处理封装在定义的方法中,在该方法中您接收年和年数的组合向量作为参数。然后,迭代 lapply
+ combn
.
函数
run_model <- function(vec, yr) {
# subset data by years
sub <- Blue_allyears[Blue_allyears$Survey_Yea %in% vec,]
# dynamically generate formula
fmla <- Pres_Abs ~ ca_10mbath+ca_10m_cur+ca_10m_eas+ca10_bpi30+ca10_bpi24
# fit glm model
fit <- glm(fmla, data=sub, family=binomial(link=logit))
## capture summary stats
AIC <- AIC(fit)
Deviance <- deviance(fit)
NullDeviance <- fit$null.deviance
null_minus_dev <- NullDeviance - Deviance
df.residual <- fit$df.residual
df.null <- fit$df.null
df.null.minus.df.residual <- df.null - df.residual
pvalue <- 1 - pchisq(null_minus_dev, df.null.minus.df.residual)
# get coefficents of fit
cfit <- coef(summary(fit))
# create temporary data frame
df <- data.frame(
Survey_Yea = paste(vec, collapse=", "),
AIC = AIC,
Deviance = Deviance,
NullDeviance = NullDeviance,
null.minus.dev = null_minus_dev,
df.residual = df.residual,
df.null = df.null,
df.null.minus.df.residual = df.null.minus.df.residual,
pvalue = pvalue,
Years = yr,
stringsAsFactors = FALSE # DEFAULT IN R 1.4.0+
)
return(df)
}
通话
years <- sort(unique(Blue_allyears$Survey_Yea))
# RETURN NESTED LIST OF MANY DATA FRAMES
results_df_list <- lapply(1:3, function(i) combn(
years, i, run_model, simplify=FALSE, num_yr=i)
)
# RETURN FLATTENED LIST OF THREE DATA FRAMES AND
# RENAME ELEMENTS
results_df_list <- setNames(
lapply(results_df_list, function(dfs) do.call(rbind, dfs)),
c("years_1", "years_2", "years_3")
)
# REVIEW EMBEDDED DATA FRAMES
View(results_df_list$years_1)
View(results_df_list$years_2)
View(results_df_list$years_3)
演示
用OP截图图片的随机数据匹配结构来演示:
set.seed(52222)
Blue_allyears <- data.frame(
Survey_Yea = sample(2005:2014, 500, replace=TRUE),
Pres_Abs = sample(0:1, 500, replace=TRUE),
ca_10mbath = runif(500),
ca_10m_cur = runif(500),
ca_10m_eas = runif(500),
ca10_bpi30 = runif(500),
ca10_bpi24 = runif(500)
)
#...run above blocks...
head(results_df_list$years_1)
# Survey_Yea AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual pvalue Years
# 1 2005 83.68461 71.68461 81.77442 10.089809 53 58 5 0.07273019 1
# 2 2006 68.09388 56.09388 60.28383 4.189951 41 46 5 0.52240456 1
# 3 2007 69.25363 57.25363 62.18310 4.929472 39 44 5 0.42454811 1
# 4 2008 79.01764 67.01764 70.52444 3.506803 45 50 5 0.62235846 1
# 5 2009 81.57290 69.57290 74.19185 4.618955 48 53 5 0.46412711 1
# 6 2010 85.46602 73.46602 76.88259 3.416573 51 56 5 0.63604708 1
head(results_df_list$years_2)
# Survey_Yea AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual pvalue Years
# 1 2005, 2006 152.5382 140.5382 145.0927 4.554509 100 105 5 0.4726236 2
# 2 2005, 2007 153.2814 141.2814 144.0207 2.739315 98 103 5 0.7400991 2
# 3 2005, 2008 159.2930 147.2930 152.3469 5.053860 104 109 5 0.4093425 2
# 4 2005, 2009 160.5739 148.5739 156.2174 7.643473 107 112 5 0.1770101 2
# 5 2005, 2010 167.3905 155.3905 159.5665 4.176056 110 115 5 0.5243568 2
# 6 2005, 2011 153.0582 141.0582 145.5514 4.493158 99 104 5 0.4807993 2
head(results_df_list$years_3)
# Survey_Yea AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual pvalue Years
# 1 2005, 2006, 2007 219.1731 207.1731 208.5284 1.355302 145 150 5 0.9291396 3
# 2 2005, 2006, 2008 225.7515 213.7515 216.8769 3.125365 151 156 5 0.6806653 3
# 3 2005, 2006, 2009 228.9630 216.9630 221.4069 4.443965 154 159 5 0.4874155 3
# 4 2005, 2006, 2010 235.7721 223.7721 225.9108 2.138620 157 162 5 0.8296509 3
# 5 2005, 2006, 2011 218.5088 206.5088 209.4254 2.916605 146 151 5 0.7128412 3
# 6 2005, 2006, 2012 213.4275 201.4275 210.2102 8.782750 147 152 5 0.1180497 3
我有一个大型数据集,其中包含 Blue Rockfish 的存在和不存在 (0,1) 以及多个变量(在我的例子中,测深、曲率、东度、精细比例 BPI 和大比例 BPI)。
structure(list(Pres_Abs = c(1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L), CommonName = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "Blue Rockfish", class = "factor"), Survey_Yea = c(2009L,
2014L, 2005L, 2015L, 2006L, 2009L, 2014L, 2015L, 2015L, 2015L,
2005L, 2014L, 2015L, 2015L, 2014L, 2015L, 2015L, 2015L, 2015L,
2006L), ca_10mbath = c(-42.6913986, -36.0038986, -36.5155983,
-44.4014816, -39.3320007, -45.7226982, -47.9375, -51.5976982,
-37.7812996, -14.1093302, -70.5976562, -41.5121307, -48.4246902,
-46.0937996, -38.9961014, -46.375, -42.6913986, -60.96875, -46.375,
-37.6601601), ca10_bpi24 = c(-12L, -2L, -2L, -2L, -2L, -2L, 7L,
37L, -2L, 77L, -2L, -2L, 17L, 7L, -2L, -2L, -12L, -2L, -2L, 67L
), ca_10m_cur = c(-0.0859528, -0.0006409, -0.0068855, -0.5154228,
-0.0390663, -0.0078316, -0.0221901, 0.792961, 0, 4.3303394, 0.0429688,
-0.4405556, -0.1947556, 0.0195274, -0.230453, -0.0093803, -0.0859528,
-0.2148438, -0.0093803, 0.0976486), ca_10m_eas = c(0.727106,
0.887252, 0.565906, 0.9994883, 0.96552, 0.960033, 0.998732, 0.772206,
0.589553, -0.4134142, -0.8266082, -0.3659272, -0.7330094, 0.0329623,
0.998884, 0.271237, 0.727106, -0.5498384, 0.271237, 0.6424425
), ca10_bpi30 = c(-15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L,
-15L, 262L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L,
-15L)), row.names = c(2032L, 3801L, 479L, 4421L, 997L, 1551L,
3079L, 4657L, 5059L, 4104L, 261L, 2849L, 4460L, 4765L, 3535L,
4842L, 4950L, 4323L, 4833L, 752L), class = "data.frame")
此外,我有多年的数据(2005、2006、2007、2009、2014、2015)。我基本上想创建一个 glm
Pres_Abs~bathy+curvature+eastness+broadscale+finescale, data=Blue_allyears, family=binomial(link=logit))
这会经历各个年份的组合。因此,在 1 年的水平上,我使用 2005 年的数据创建了 glms,然后是 2006 年的数据,然后是 2007 年的数据等。在该代码中,我保存了 AIC、残差和零偏差、卡方、p 等数据价值等
这是我用来循环第一年数据的代码(改编自 Whosebug 上的其他人):
results <- data.frame()
for(Survey_Yea in unique(Blue_allyears$Survey_Yea)){
# dynamically generate formula
fmla <- as.formula(Pres_Abs~ca_10mbath+ca_10m_cur+ca_10m_eas+ca10_bpi30+ca10_bpi24)
# fit glm model
fit<-glm(fmla,data=Blue_allyears[Blue_allyears$Survey_Yea == Survey_Yea,],family=binomial(link=logit))
## capture summary stats
AIC <- AIC(fit)
Deviance <- deviance(fit)
NullDeviance <- fit$null.deviance
null_minus_dev<-NullDeviance-Deviance
df.residual<- fit$df.residual
df.null<-fit$df.null
df.null.minus.df.residual<-df.null-df.residual
pvalue<- with(fit, 1-pchisq(null_minus_dev , df.null.minus.df.residual))
Years<-"1"
# get coefficents of fit
cfit <- coef(summary(fit))
# create temporary data frame
df <- data.frame( Survey_Yea = Survey_Yea,
AIC = AIC(fit), Deviance = deviance(fit),NullDeviance = fit$null.deviance, null.minus.dev=NullDeviance-Deviance, df.residual= fit$df.residual , df.null=fit$df.null , df.null.minus.df.residual=df.null-df.residual, pvalue= pvalue, Years="1", stringsAsFactors = F)
# bind rows of temporary data frame to the results data frame
results <- rbind(results, df)
}
results
这段代码很棒,可以根据每年的数据创建 glms。
structure(list(Survey_Yea = c(2005L, 2006L, 2007L, 2009L, 2014L,
2015L), AIC = c(731.84838805646, 480.699964265887, 113.681123536743,
764.359566454308, 1482.05275641814, 1581.2853892652), Deviance = c(719.84838805646,
468.699964265887, 101.681123536743, 752.359566454308, 1470.05275641814,
1569.2853892652), NullDeviance = c(987.041585117362, 690.374591837705,
174.673089501106, 1059.1288918956, 2412.15218834861, 2012.89941234608
), null.minus.dev = c(267.193197060902, 221.674627571818, 72.991965964363,
306.769325441288, 942.099431930472, 443.614023080884), df.residual = c(706L,
492L, 120L, 758L, 1734L, 1446L), df.null = c(711L, 497L, 125L,
763L, 1739L, 1451L), df.null.minus.df.residual = c(5L, 5L, 5L,
5L, 5L, 5L), pvalue = c(0, 0, 2.44249065417534e-14, 0, 0, 0),
Years = c("1", "1", "1", "1", "1", "1")), row.names = c(NA,
-6L), class = "data.frame")
现在,我想查看两年的数据并创建 glms 并提取相关数据。因此,例如年迭代将是: 2005年和2006年 2005年和2007年 2005年和2009年 2005 年和 2014 年 2005 年和 2015 年 2006年和2007年 2006年和2009年 ETC.... 2014 年和 2015 年
在用两年的数据完成此操作后,我想用三年的数据进行所有组合,等等,直到我开始使用所有年份的数据。
我试过添加另一个 for 循环或将 combn() 添加到我的代码中,但没有成功。
如有任何帮助,我们将不胜感激!
此外,这是我第一次发帖,如果您需要更多数据,请告诉我。谢谢!
考虑将所有处理封装在定义的方法中,在该方法中您接收年和年数的组合向量作为参数。然后,迭代 lapply
+ combn
.
函数
run_model <- function(vec, yr) {
# subset data by years
sub <- Blue_allyears[Blue_allyears$Survey_Yea %in% vec,]
# dynamically generate formula
fmla <- Pres_Abs ~ ca_10mbath+ca_10m_cur+ca_10m_eas+ca10_bpi30+ca10_bpi24
# fit glm model
fit <- glm(fmla, data=sub, family=binomial(link=logit))
## capture summary stats
AIC <- AIC(fit)
Deviance <- deviance(fit)
NullDeviance <- fit$null.deviance
null_minus_dev <- NullDeviance - Deviance
df.residual <- fit$df.residual
df.null <- fit$df.null
df.null.minus.df.residual <- df.null - df.residual
pvalue <- 1 - pchisq(null_minus_dev, df.null.minus.df.residual)
# get coefficents of fit
cfit <- coef(summary(fit))
# create temporary data frame
df <- data.frame(
Survey_Yea = paste(vec, collapse=", "),
AIC = AIC,
Deviance = Deviance,
NullDeviance = NullDeviance,
null.minus.dev = null_minus_dev,
df.residual = df.residual,
df.null = df.null,
df.null.minus.df.residual = df.null.minus.df.residual,
pvalue = pvalue,
Years = yr,
stringsAsFactors = FALSE # DEFAULT IN R 1.4.0+
)
return(df)
}
通话
years <- sort(unique(Blue_allyears$Survey_Yea))
# RETURN NESTED LIST OF MANY DATA FRAMES
results_df_list <- lapply(1:3, function(i) combn(
years, i, run_model, simplify=FALSE, num_yr=i)
)
# RETURN FLATTENED LIST OF THREE DATA FRAMES AND
# RENAME ELEMENTS
results_df_list <- setNames(
lapply(results_df_list, function(dfs) do.call(rbind, dfs)),
c("years_1", "years_2", "years_3")
)
# REVIEW EMBEDDED DATA FRAMES
View(results_df_list$years_1)
View(results_df_list$years_2)
View(results_df_list$years_3)
演示
用OP截图图片的随机数据匹配结构来演示:
set.seed(52222)
Blue_allyears <- data.frame(
Survey_Yea = sample(2005:2014, 500, replace=TRUE),
Pres_Abs = sample(0:1, 500, replace=TRUE),
ca_10mbath = runif(500),
ca_10m_cur = runif(500),
ca_10m_eas = runif(500),
ca10_bpi30 = runif(500),
ca10_bpi24 = runif(500)
)
#...run above blocks...
head(results_df_list$years_1)
# Survey_Yea AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual pvalue Years
# 1 2005 83.68461 71.68461 81.77442 10.089809 53 58 5 0.07273019 1
# 2 2006 68.09388 56.09388 60.28383 4.189951 41 46 5 0.52240456 1
# 3 2007 69.25363 57.25363 62.18310 4.929472 39 44 5 0.42454811 1
# 4 2008 79.01764 67.01764 70.52444 3.506803 45 50 5 0.62235846 1
# 5 2009 81.57290 69.57290 74.19185 4.618955 48 53 5 0.46412711 1
# 6 2010 85.46602 73.46602 76.88259 3.416573 51 56 5 0.63604708 1
head(results_df_list$years_2)
# Survey_Yea AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual pvalue Years
# 1 2005, 2006 152.5382 140.5382 145.0927 4.554509 100 105 5 0.4726236 2
# 2 2005, 2007 153.2814 141.2814 144.0207 2.739315 98 103 5 0.7400991 2
# 3 2005, 2008 159.2930 147.2930 152.3469 5.053860 104 109 5 0.4093425 2
# 4 2005, 2009 160.5739 148.5739 156.2174 7.643473 107 112 5 0.1770101 2
# 5 2005, 2010 167.3905 155.3905 159.5665 4.176056 110 115 5 0.5243568 2
# 6 2005, 2011 153.0582 141.0582 145.5514 4.493158 99 104 5 0.4807993 2
head(results_df_list$years_3)
# Survey_Yea AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual pvalue Years
# 1 2005, 2006, 2007 219.1731 207.1731 208.5284 1.355302 145 150 5 0.9291396 3
# 2 2005, 2006, 2008 225.7515 213.7515 216.8769 3.125365 151 156 5 0.6806653 3
# 3 2005, 2006, 2009 228.9630 216.9630 221.4069 4.443965 154 159 5 0.4874155 3
# 4 2005, 2006, 2010 235.7721 223.7721 225.9108 2.138620 157 162 5 0.8296509 3
# 5 2005, 2006, 2011 218.5088 206.5088 209.4254 2.916605 146 151 5 0.7128412 3
# 6 2005, 2006, 2012 213.4275 201.4275 210.2102 8.782750 147 152 5 0.1180497 3