在 tidymodels recipes::recipe() 中创建一个多元矩阵

Create a multivariate matrix in tidymodels recipes::recipe()

我正在尝试对一个模型进行 k 折交叉验证,该模型根据卫星图像预测树种断面积比例的联合分布。这需要使用 DiricihletReg::DirichReg() 函数,而这又需要使用 DirichletReg::DR_data() 函数将响应变量准备为矩阵。我最初尝试在 caret:: 包中完成此操作,但我发现 caret:: 不支持多变量响应。从那以后,我尝试在 tidymodels:: 软件包套件中实现它。根据 parsnip::(我很欣赏 Max Kuhn 的蔬菜幽默)包中有关如何注册新模型的文档,我创建了一个“DREG”模型和一个“DR”引擎。当我简单地在单个训练数据集上调用它时,我的注册模型就可以工作,但我的目标是进行 kfolds 交叉验证,实现 vfolds_cv()workflow() 和 'fit_resample()' 函数。使用我目前拥有的代码,我收到警告消息:

Warning message:
All models failed. See the `.notes` column. 

这些注释指出 Error in get(resp_char, environment(oformula)): object 'cbind(PSME, TSHE, ALRU2)' not found 我认为这是由于使用 DR_data() 将响应变量预处理为 Dirichlet::DirichReg() 到 运行 所需的格式适当地。我认为我需要实施的解决方案涉及在 recipe() 调用或 set_fit() 调用中进行预处理,当我向 parsnip:: 注册此模型时。我在指定配方时尝试使用 step_mutate() 函数,但它在每一列上执行一个函数,而不是将函数与列一起作为输入应用。这导致 fit_resample() 输出的“注释”中出现以下错误:

Must subset columns with a valid subscript vector.
Subscript has the wrong type `quosures`.
It must be numeric or character.

有没有一种方法可以使用带有 step_*() 函数的 DR_data() 函数或使用 pre= set_fit()set_pred()?

中的参数

下面是我的可重现示例:

##Loading Necessary Packages##
library(tidymodels)
library(DirichletReg)

##Creating Fake Data##
set.seed(88)#For reproducibility

#Response variables#
PSME_BA<-rnorm(100,50, 15)
TSHE_BA<-rnorm(100,40,12)
ALRU2_BA<-rnorm(100,20,0.5)
Total_BA<-PSME_BA+TSHE_BA+ALRU2_BA

#Predictor variables#
B1<-runif(100, 0, 2000)
B2<-runif(100, 0, 1800)
B3<-runif(100, 0, 3000)

#Dataset for modeling#
DF<-data.frame(PSME=PSME_BA/Total_BA, TSHE=TSHE_BA/Total_BA, ALRU2=ALRU2_BA/Total_BA,
               B1=B1, B2=B2, B3=B3)

##Modeling the data using Dirichlet regression with repeated k-folds cross validation##
#Registering the model to parsnip::#
set_new_model("DREG")
set_model_mode(model="DREG", mode="regression")
set_model_engine("DREG", mode="regression", eng="DR")
set_dependency("DREG", eng="DR", pkg="DirichletReg")

set_model_arg(
  model = "DREG",
  eng = "DR",
  parsnip = "param",
  original = "model",
  func = list(pkg = "DirichletReg", fun = "DirichReg"),
  has_submodel = FALSE
)

DREG <-
  function(mode = "regression",  param = NULL) {
    # Check for correct mode
    if (mode  != "regression") {
      rlang::abort("`mode` should be 'regression'")
    }
    
    # Capture the arguments in quosures
    args <- list(sub_classes = rlang::enquo(param))
    
    # Save some empty slots for future parts of the specification
    new_model_spec(
      "DREG",
      args=args,
      eng_args = NULL,
      mode = mode,
      method = NULL,
      engine = NULL
    )
  }

set_fit(
  model = "DREG",
  eng = "DR",
  mode = "regression",
  value = list(
    interface = "formula",
    protect = NULL,
    func = c(pkg = "DirichletReg", fun = "DirichReg"),
    defaults = list()
  )
)

set_encoding(
  model = "DREG",
  eng = "DR",
  mode = "regression",
  options = list(
    predictor_indicators = "none",
    compute_intercept = TRUE,
    remove_intercept = TRUE,
    allow_sparse_x = FALSE
  )
)

set_pred(
  model = "DREG",
  eng = "DR",
  mode = "regression",
  type = "numeric",
  value = list(
    pre = NULL,
    post = NULL,
    func = c(fun = "predict.DirichletRegModel"),
    args =
      list(
        object = expr(object$fit),
        newdata = expr(new_data),
        type = "response"
      )
  )
)

##Running the Model##
DF$Y<-DR_data(DF[,c(1:3)]) #Preparing the response variables 

dreg_spec<-DREG(param="alternative") %>% 
  set_engine("DR")

dreg_mod<-dreg_spec %>% 
  fit(Y~B1+B2+B3, data = DF)#Model works when simply run on single dataset

##Attempting Crossvalidation##
#First attempt - simply call Y as the response variable in the recipe#
kfolds<-vfold_cv(DF, v=10, repeats = 2)
rcp<-recipe(Y~B1+B2+B3, data=DF)

dreg_fit<- workflow() %>% 
  add_model(dreg_spec) %>% 
  add_recipe(rcp)

dreg_rsmpl<-dreg_fit %>% 
  fit_resamples(kfolds)#Throws warning about all models failing

#second attempt - use step_mutate_at()#
rcp<-recipe(~B1+B2+B3, data=DF) %>% 
  step_mutate_at(fn=DR_data, var=vars(PSME, TSHE, ALRU2))

dreg_fit<- workflow() %>% 
  add_model(dreg_spec) %>% 
  add_recipe(rcp)

dreg_rsmpl<-dreg_fit %>% 
  fit_resamples(kfolds)#Throws warning about all models failing

这行得通,但我不确定这是否符合您的预期。

首先——获取 CV 和 DR_data()

的数据设置

我不知道有哪个软件包构建了本质上是 CV 和 DirichletReg 的翻译。因此,该部分是手动完成的。您可能会惊讶地发现它并没有那么复杂。

使用您创建的数据和为 tidymodels 创建的建模对象(前缀为 set_ 的对象),我创建了您尝试使用的 CV 结构。

df1 <- data.frame(PSME = PSME_BA/Total_BA, TSHE = TSHE_BA/Total_BA, 
                  ALRU2=ALRU2_BA/Total_BA, B1, B2, B3)

set.seed(88)
kDf2 <- kDf1 <- vfold_cv(df1, v=10, repeats = 2)

对于 kDf2 中标识的 20 个子集数据帧中的每一个,我使用 DR_data 为模型设置数据。

# convert to DR_data (each folds and repeats)
df2 <- map(1:20,
           .f = function(x){
             in_ids = kDf1$splits[[x]]$in_id
             dd <- kDf1$splits[[x]]$data[in_ids, ] # filter rows BEFORE DR_data
             dd$Y <- DR_data(dd[, 1:3]) 
             kDf1$splits[[x]]$data <<- dd
           })

因为我对tidymodels不是很熟悉,接下来使用DirichReg进行建模。然后我又用 tidymodels 做了一次并比较了它们。 (输出是相同的。)

DirichReg 拟合模型和总结

set.seed(88)
# perform crossfold validation on Dirichlet Model
df2.fit <- map(1:20,
               .f = function(x){
                 Rpt = kDf1$splits[[x]]$id$id
                 Fld = kDf1$splits[[x]]$id$id2
                 daf = kDf1$splits[[x]]$data
                 fit = DirichReg(Y ~ B1 + B2, daf)
                 list(Rept = Rpt, Fold = Fld, fit = fit)
               })
# summary of each fitted model
fit.a <- map(1:20,
             .f = function(x){
             summary(df2.fit[[x]]$fit)
             })

tidymodels 和拟合摘要(代码看起来一样,但有一些不同——虽然输出是一样的)

# I'm not sure what 'alternative' is supposed to do here?
dreg_spec <- DREG(param="alternative") %>%  # this is not model = alternative
  set_engine("DR")

set.seed(88)
dfa.fit <- map(1:20,
               .f = function(x){
                 Rpt = kDf1$splits[[x]]$id$id
                 Fld = kDf1$splits[[x]]$id$id2
                 daf = kDf1$splits[[x]]$data
                 fit = dreg_spec %>% 
                   fit(Y ~ B1 + B2, data = daf)
                 list(Rept = Rpt, Fold = Fld, fit = fit)
               })

afit.a <- map(1:20,
              .f = function(x){
                summary(dfa.fit[[x]]$fit$fit) # extra nest for parsnip
              })

如果你想看第一个模型?

fit.a[[1]]
afit.a[[1]]

如果您想要 AIC 最低的模型?

# comare AIC, BIC, and liklihood?
# what do you percieve best fit with?
fmin = min(unlist(map(1:20, ~fit.a[[.x]]$aic)))  # dir

# find min AIC model number
paste0((map(1:20, ~ifelse(fit.a[[.x]]$aic == fmin, .x, ""))), collapse = "")

fit.a[[19]]
afit.a[[19]]