How to use nls with caret to do cross-validation

I have fit several models to the same data using nls, and I am trying to figure out how to do K-fold cross-validation with caret (e.g., here). Someone asked a general question about using nls in caret for a single model, but the answer referred them to this resource, which is beyond my understanding (i.e., how to adapt it to nls), especially for fitting several different nls functions.

Here are example data and examples of model objects fit with nls:

library("caret")

df <-
  structure(list(c = c(123.86, 208.75, 141.5, 230.73, 143.4, 209.31, 
  161.15, 130.87, 232.05, 121.61, 176.31, 139.01, 131.92, 156.61, 
  150.05, 121, 134.12, 146.83, 181.39, 115, 147.87, 161.49, 107.65, 
  115.51, 144.11), q = c(0.028, 0.004, 0.049, 0.001, 0.049, 0.004, 
  0.016, 0.015, 0.003, 0.026, 0.002, 0.009, 0.148, 0.012, 0.017, 
  0.086, 0.02, 0.038, 0.003, 0.031, 0.011, 0.032, 0.132, 0.093, 
  0.026)), row.names = c(NA, -25L), class = c("tbl_df", "tbl", 
  "data.frame"))

# Model 1
eq1 <- function(q, a, n) a * q^(-1/n)
eq1_fit <- nls(c ~ eq1(q, a, n), data = df, start = list(a = 380, n = 5))

# Model 2
eq2 <- function(q, h, g, n, c0) h * exp(-g * q^(1/n)) + c0
eq2_fit <- nls(c ~ eq2(q, h, g, n = 5, c0 = 6), data = df, start = list(h = 100, g = 1))

There are several additional models, all fit to the same data with nls, and I would like to do 5-fold or 10-fold CV following the example shown here or something similar. A solution using tidymodels workflowsets would also work. Thanks for your help!

You can do this with the vfold_cv() function from the rsample package, which is part of the tidymodels ecosystem.

df <-
  structure(list(c = c(123.86, 208.75, 141.5, 230.73, 143.4, 209.31, 
  161.15, 130.87, 232.05, 121.61, 176.31, 139.01, 131.92, 156.61, 
  150.05, 121, 134.12, 146.83, 181.39, 115, 147.87, 161.49, 107.65, 
  115.51, 144.11), q = c(0.028, 0.004, 0.049, 0.001, 0.049, 0.004, 
  0.016, 0.015, 0.003, 0.026, 0.002, 0.009, 0.148, 0.012, 0.017, 
  0.086, 0.02, 0.038, 0.003, 0.031, 0.011, 0.032, 0.132, 0.093, 
  0.026)), row.names = c(NA, -25L), class = c("tbl_df", "tbl", 
  "data.frame"))


library(rsample)
library(tidyverse)

## define the equations
eq1 <- function(q, a, n) a * q^(-1/n)
eq2 <- function(q, h, g, n, c0) h * exp(-g * q^(1/n)) + c0


## fit each model on the full data first (a quick check that the starting values work)
eq1_fit <- nls(c ~ eq1(q, a, n), data = df, start = list(a = 380, n = 5))
eq2_fit <- nls(c ~ eq2(q, h, g, n = 5, c0 = 6), data = df, start = list(h = 100, g = 1))

## define a function that does the model fitting and assessment for one split
cv_mods <- function(split, ...){
  ## fit both models with the analysis partition
  eq1_fit <- nls(c ~ eq1(q, a, n), data = analysis(split), start = list(a = 380, n = 5))
  eq2_fit <- nls(c ~ eq2(q, h, g, n = 5, c0 = 6), data = analysis(split), start = list(h = 100, g = 1))
  ## take the dependent variable from the assessment partition
  y <- assessment(split)$c
  ## calculate the residuals using model predictions
  ## on the assessment partition
  e1 <- (y - predict(eq1_fit, newdata = assessment(split)))
  e2 <- (y - predict(eq2_fit, newdata = assessment(split)))
  ## return the cross-validated residuals from both models as
  ## a data frame
  data.frame(e1 = e1, e2 = e2)
}
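
If you want to see what cv_mods() returns before running the full cross-validation, you can try it on a single split first (this check is just illustrative; the seed is arbitrary):

## illustrative check on one fold; set.seed() is only for reproducibility
set.seed(1)
one_split <- vfold_cv(df, v = 10)$splits[[1]]
cv_mods(one_split)
## returns a small data frame with one row per assessment observation,
## holding each model's out-of-sample residual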

## estimate the cross-validation 
## the vfold_cv function sets up the cross-validation partitions
## I used 10 repeats here for speed, but in the "real world" you 
## would probably want lots more 
out <- vfold_cv(df,
                v=10,
                repeats = 10) %>%
  ## estimate the cv on all of the partitions
  mutate(err = map(splits,
                   cv_mods)) %>% 
  ## unnest the error column to turn it into two 
  ## columns in your data frame
  unnest(err)  
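
Optionally, you can peek at out before summarising; after the unnest it has one row per assessment observation:

## columns: splits, id (the repeat), id2 (the fold), and the
## cross-validated residuals e1 and e2
head(out)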

## First analysis: Pr(mod1 better than mod2)
out1 <- out %>% 
  ## group by repeat 
  group_by(id) %>% 
  ## calculate the standard deviation of the errors for each level of id
  summarise(across(e1:e2, ~sd(.x))) %>% 
  ## calculate the probability across repeats that model 1 is better
  ## than model 2
  summarise(p_eq1_better = mean(e1<e2))

out1
#> # A tibble: 1 × 1
#>   p_eq1_better
#>          <dbl>
#> 1            1
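
If you want to see the per-repeat summaries before they are collapsed into that single probability, just drop the final summarise():

## SD of the cross-validated residuals for each model, one row per repeat
out %>%
  group_by(id) %>%
  summarise(across(e1:e2, ~sd(.x)))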


## alternatively, follow similar steps to above, 
## but get the average cross-validation error 
## for each model: 
out2 <- out %>% group_by(id) %>% 
  ## calculate the SD of the cross-validated residuals for each repeat
  summarise(across(e1:e2, ~sd(.x))) %>% 
  ## calculate the average CV error across repeats:
  summarise(across(e1:e2, ~mean(.x)))

out2
#> # A tibble: 1 × 2
#>      e1    e2
#>   <dbl> <dbl>
#> 1  19.7  21.2
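
The summaries above use the standard deviation of the cross-validated residuals as the error measure; if you prefer root mean squared error instead (just a variant of the same pipeline, not what the output above shows), swap in an RMSE calculation:

## average RMSE per model, computed within each repeat and then averaged
out_rmse <- out %>%
  group_by(id) %>%
  summarise(across(e1:e2, ~sqrt(mean(.x^2)))) %>%
  summarise(across(e1:e2, ~mean(.x)))
out_rmse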

Created on 2022-04-12 by the reprex package (v2.0.1)