如何创建一个 for 循环来为 R 中的 glm 进行多年组合？

Question

我有一个大型数据集，其中包含 Blue Rockfish 的存在和不存在 (0,1) 以及多个变量（在我的例子中，测深、曲率、东度、精细比例 BPI 和大比例 BPI）。

structure(list(Pres_Abs = c(1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L), CommonName = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "Blue Rockfish", class = "factor"), Survey_Yea = c(2009L, 
2014L, 2005L, 2015L, 2006L, 2009L, 2014L, 2015L, 2015L, 2015L, 
2005L, 2014L, 2015L, 2015L, 2014L, 2015L, 2015L, 2015L, 2015L, 
2006L), ca_10mbath = c(-42.6913986, -36.0038986, -36.5155983, 
-44.4014816, -39.3320007, -45.7226982, -47.9375, -51.5976982, 
-37.7812996, -14.1093302, -70.5976562, -41.5121307, -48.4246902, 
-46.0937996, -38.9961014, -46.375, -42.6913986, -60.96875, -46.375, 
-37.6601601), ca10_bpi24 = c(-12L, -2L, -2L, -2L, -2L, -2L, 7L, 
37L, -2L, 77L, -2L, -2L, 17L, 7L, -2L, -2L, -12L, -2L, -2L, 67L
), ca_10m_cur = c(-0.0859528, -0.0006409, -0.0068855, -0.5154228, 
-0.0390663, -0.0078316, -0.0221901, 0.792961, 0, 4.3303394, 0.0429688, 
-0.4405556, -0.1947556, 0.0195274, -0.230453, -0.0093803, -0.0859528, 
-0.2148438, -0.0093803, 0.0976486), ca_10m_eas = c(0.727106, 
0.887252, 0.565906, 0.9994883, 0.96552, 0.960033, 0.998732, 0.772206, 
0.589553, -0.4134142, -0.8266082, -0.3659272, -0.7330094, 0.0329623, 
0.998884, 0.271237, 0.727106, -0.5498384, 0.271237, 0.6424425
), ca10_bpi30 = c(-15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, 
-15L, 262L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, 
-15L)), row.names = c(2032L, 3801L, 479L, 4421L, 997L, 1551L, 
3079L, 4657L, 5059L, 4104L, 261L, 2849L, 4460L, 4765L, 3535L, 
4842L, 4950L, 4323L, 4833L, 752L), class = "data.frame")

此外，我有多年的数据（2005、2006、2007、2009、2014、2015）。我基本上想创建一个 glm

Pres_Abs~bathy+curvature+eastness+broadscale+finescale, data=Blue_allyears, family=binomial(link=logit))

这会经历各个年份的组合。因此，在 1 年的水平上，我使用 2005 年的数据创建了 glms，然后是 2006 年的数据，然后是 2007 年的数据等。在该代码中，我保存了 AIC、残差和零偏差、卡方、p 等数据价值等

这是我用来循环第一年数据的代码（改编自 Whosebug 上的其他人）：

results <- data.frame()


for(Survey_Yea in unique(Blue_allyears$Survey_Yea)){
  
  # dynamically generate formula
        fmla <- as.formula(Pres_Abs~ca_10mbath+ca_10m_cur+ca_10m_eas+ca10_bpi30+ca10_bpi24)

        # fit glm model
        fit<-glm(fmla,data=Blue_allyears[Blue_allyears$Survey_Yea == Survey_Yea,],family=binomial(link=logit))

        ## capture summary stats
        
        AIC <- AIC(fit)
        Deviance <- deviance(fit)
        NullDeviance <- fit$null.deviance
        null_minus_dev<-NullDeviance-Deviance
        df.residual<- fit$df.residual      
        df.null<-fit$df.null 
        df.null.minus.df.residual<-df.null-df.residual
       pvalue<- with(fit, 1-pchisq(null_minus_dev ,  df.null.minus.df.residual))
       Years<-"1"
   

        # get coefficents of fit
        cfit <- coef(summary(fit))

        # create temporary data frame
        df <- data.frame( Survey_Yea = Survey_Yea, 
                         AIC = AIC(fit), Deviance = deviance(fit),NullDeviance = fit$null.deviance, null.minus.dev=NullDeviance-Deviance, df.residual= fit$df.residual ,   df.null=fit$df.null , df.null.minus.df.residual=df.null-df.residual,  pvalue= pvalue,  Years="1", stringsAsFactors = F)

        # bind rows of temporary data frame to the results data frame
        results <- rbind(results, df)
}


results

这段代码很棒，可以根据每年的数据创建 glms。

structure(list(Survey_Yea = c(2005L, 2006L, 2007L, 2009L, 2014L, 
2015L), AIC = c(731.84838805646, 480.699964265887, 113.681123536743, 
764.359566454308, 1482.05275641814, 1581.2853892652), Deviance = c(719.84838805646, 
468.699964265887, 101.681123536743, 752.359566454308, 1470.05275641814, 
1569.2853892652), NullDeviance = c(987.041585117362, 690.374591837705, 
174.673089501106, 1059.1288918956, 2412.15218834861, 2012.89941234608
), null.minus.dev = c(267.193197060902, 221.674627571818, 72.991965964363, 
306.769325441288, 942.099431930472, 443.614023080884), df.residual = c(706L, 
492L, 120L, 758L, 1734L, 1446L), df.null = c(711L, 497L, 125L, 
763L, 1739L, 1451L), df.null.minus.df.residual = c(5L, 5L, 5L, 
5L, 5L, 5L), pvalue = c(0, 0, 2.44249065417534e-14, 0, 0, 0), 
    Years = c("1", "1", "1", "1", "1", "1")), row.names = c(NA, 
-6L), class = "data.frame")

现在，我想查看两年的数据并创建 glms 并提取相关数据。因此，例如年迭代将是： 2005年和2006年 2005年和2007年 2005年和2009年 2005 年和 2014 年 2005 年和 2015 年 2006年和2007年 2006年和2009年 ETC.... 2014 年和 2015 年

在用两年的数据完成此操作后，我想用三年的数据进行所有组合，等等，直到我开始使用所有年份的数据。

我试过添加另一个 for 循环或将 combn() 添加到我的代码中，但没有成功。

如有任何帮助，我们将不胜感激！

此外，这是我第一次发帖，如果您需要更多数据，请告诉我。谢谢！

Answer 1

考虑将所有处理封装在定义的方法中，在该方法中您接收年和年数的组合向量作为参数。然后，迭代 lapply + combn.

函数

run_model <- function(vec, yr) {
  # subset data by years
  sub <- Blue_allyears[Blue_allyears$Survey_Yea %in% vec,]
  
  # dynamically generate formula
  fmla <- Pres_Abs ~ ca_10mbath+ca_10m_cur+ca_10m_eas+ca10_bpi30+ca10_bpi24
  
  # fit glm model
  fit <- glm(fmla, data=sub, family=binomial(link=logit))
  
  ## capture summary stats
  AIC <- AIC(fit)
  Deviance <- deviance(fit)
  NullDeviance <- fit$null.deviance
  null_minus_dev <- NullDeviance - Deviance
  df.residual <- fit$df.residual      
  df.null <- fit$df.null 
  df.null.minus.df.residual <- df.null - df.residual
  pvalue <- 1 - pchisq(null_minus_dev,  df.null.minus.df.residual)
  
  # get coefficents of fit
  cfit <- coef(summary(fit))
  
  # create temporary data frame
  df <- data.frame(
    Survey_Yea = paste(vec, collapse=", "), 
    AIC = AIC,
    Deviance = Deviance,
    NullDeviance = NullDeviance, 
    null.minus.dev = null_minus_dev,
    df.residual = df.residual,   
    df.null = df.null, 
    df.null.minus.df.residual = df.null.minus.df.residual,
    pvalue = pvalue, 
    Years = yr, 
    stringsAsFactors = FALSE   # DEFAULT IN R 1.4.0+
  )
  
  return(df)
}

通话

years <- sort(unique(Blue_allyears$Survey_Yea))

# RETURN NESTED LIST OF MANY DATA FRAMES
results_df_list <- lapply(1:3, function(i) combn(
  years, i, run_model, simplify=FALSE, num_yr=i)
)

# RETURN FLATTENED LIST OF THREE DATA FRAMES AND
# RENAME ELEMENTS
results_df_list <- setNames(
  lapply(results_df_list, function(dfs) do.call(rbind, dfs)),
  c("years_1", "years_2", "years_3")
)

# REVIEW EMBEDDED DATA FRAMES
View(results_df_list$years_1)
View(results_df_list$years_2)
View(results_df_list$years_3)

演示

用OP截图图片的随机数据匹配结构来演示：

set.seed(52222)
Blue_allyears <- data.frame(
  Survey_Yea = sample(2005:2014, 500, replace=TRUE),
  Pres_Abs = sample(0:1, 500, replace=TRUE),
  ca_10mbath = runif(500),
  ca_10m_cur = runif(500),
  ca_10m_eas = runif(500),
  ca10_bpi30 = runif(500),
  ca10_bpi24 = runif(500)
)

#...run above blocks...

head(results_df_list$years_1)
#   Survey_Yea      AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual     pvalue Years
# 1       2005 83.68461 71.68461     81.77442      10.089809          53      58                         5 0.07273019     1
# 2       2006 68.09388 56.09388     60.28383       4.189951          41      46                         5 0.52240456     1
# 3       2007 69.25363 57.25363     62.18310       4.929472          39      44                         5 0.42454811     1
# 4       2008 79.01764 67.01764     70.52444       3.506803          45      50                         5 0.62235846     1
# 5       2009 81.57290 69.57290     74.19185       4.618955          48      53                         5 0.46412711     1
# 6       2010 85.46602 73.46602     76.88259       3.416573          51      56                         5 0.63604708     1

head(results_df_list$years_2)
#   Survey_Yea      AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual    pvalue Years
# 1 2005, 2006 152.5382 140.5382     145.0927       4.554509         100     105                         5 0.4726236     2
# 2 2005, 2007 153.2814 141.2814     144.0207       2.739315          98     103                         5 0.7400991     2
# 3 2005, 2008 159.2930 147.2930     152.3469       5.053860         104     109                         5 0.4093425     2
# 4 2005, 2009 160.5739 148.5739     156.2174       7.643473         107     112                         5 0.1770101     2
# 5 2005, 2010 167.3905 155.3905     159.5665       4.176056         110     115                         5 0.5243568     2
# 6 2005, 2011 153.0582 141.0582     145.5514       4.493158          99     104                         5 0.4807993     2

head(results_df_list$years_3)
#         Survey_Yea      AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual    pvalue Years
# 1 2005, 2006, 2007 219.1731 207.1731     208.5284       1.355302         145     150                         5 0.9291396     3
# 2 2005, 2006, 2008 225.7515 213.7515     216.8769       3.125365         151     156                         5 0.6806653     3
# 3 2005, 2006, 2009 228.9630 216.9630     221.4069       4.443965         154     159                         5 0.4874155     3
# 4 2005, 2006, 2010 235.7721 223.7721     225.9108       2.138620         157     162                         5 0.8296509     3
# 5 2005, 2006, 2011 218.5088 206.5088     209.4254       2.916605         146     151                         5 0.7128412     3
# 6 2005, 2006, 2012 213.4275 201.4275     210.2102       8.782750         147     152                         5 0.1180497     3

如何创建一个 for 循环来为 R 中的 glm 进行多年组合？

How to create a for loop to go through multiple year combinations for a glm in R?

combinations

for-loop

r

glm