Stan 模型后验预测超出可能的数据范围

Stan Model Posterior Predictions Outside Possible Range of Data

我现在正在学习 Stan 中的建模技巧,从中获得了很多乐趣。现在我正在努力研究我的受试者间和受试者内混合析因实验设计模型。有不同的受试者组,每个受试者都表示他们对三种不同饮料(水、无咖啡因和咖啡)中的每一种饮料减少咖啡因戒断的期望程度。结果变量——减少戒断的预期——通过视觉模拟量表从 0 到 10 进行测量,其中 0 表示没有减少戒断的预期,10 表示非常高的戒断减少预期。我想测试三种不同饮料的预期戒断减少潜力是否存在组间差异。

这是数据

df <- data.frame(id = rep(1:46, each = 3),
                 group = c(3,3,3,1,1,1,3,3,3,1,1,1,3,3,3,1,1,1,3,3,3,2,2,2,1,1,1,3,3,3,3,3,3,2,2,2,3,3,3,1,1,1,2,2,2,3,3,3,2,2,2,2,2,2,3,3,3,1,1,1,2,2,2,3,3,3,2,2,2,3,3,3,3,3,3,2,2,2,3,3,3,3,3,3,1,1,1,3,3,3,3,3,3,1,1,1,2,2,2,2,2,2,1,1,1,2,2,2,2,2,2,1,1,1,1,1,1,2,2,2,2,2,2,1,1,1,1,1,1,3,3,3,1,1,1,3,3,3),
                 bevType = rep(c(3,2,1), times = 46),
                 score = c(2.9,1.0,0.0,9.5,5.0,4.5,9.0,3.0,5.0,5.0,0.0,3.0,9.5,2.0,3.0,8.5,0.0,6.0,5.2,3.0,4.0,8.4,7.0,2.0,10.0,0.0,3.0,7.3,1.0,1.8,8.5,2.0,9.0,10.0,5.0,10.0,8.3,2.0,5.0,6.0,0.0,5.0,6.0,0.0,5.0,10.0,0.0,5.0,6.8,1.0,4.8,8.0,1.0,4.0,7.0,4.0,6.0,6.5,1.0,3.1,9.0,1.0,0.0,6.0,0.0,2.0,9.5,4.0,6.0,8.0,1.0,3.8,0.4,0.0,7.0,7.0,0.0,3.0,9.0,2.0,5.0,9.5,2.0,7.0,7.9,5.0,4.9,8.0,1.0,1.0,9.3,5.0,7.9,6.5,2.0,3.0,8.0,2.0,6.0,10.0,0.0,5.0,6.0,0.0,5.0,6.8,0.1,7.0,8.0,3.0,9.1,8.2,0.0,7.9,8.2,5.0,0.0,9.2,1.0,3.1,9.1,3.0,0.6,5.7,2.0,5.1,7.0,0.0,7.4,8.0,1.0,1.5,9.1,4.0,4.3,8.5,8.0,5.0))

现在是模型。该模型有一个总均值参数 a,一个分类预测变量,表示组与总均值的偏差 bGroup,一个表示不同饮料类型与总均值 bBev 的偏差的术语,一个术语每个受试者的截距 bSubj,以及饮料互动组的术语 bGxB。我还估计了每种饮料类型的单独噪声参数。

为了允许后验预测检查,我使用 generated quantities 块和 normal_rng 函数从联合后验中提取。

### Step 1: Put data into list
dList <- list(N = 138,
              nSubj = 46,
              nGroup = 3,
              nBev = 3,
              sIndex = df$id,
              gIndex = df$group,
              bIndex = df$bevType,
              score = df$score,
              gMean = 4.718841,
              gSD = 3.17)


#### Step 1 model
write("
      data{
      int<lower=1> N;
      int<lower=1> nSubj;
      int<lower=1> nGroup;
      int<lower=1> nBev;
      int<lower=1,upper=nSubj> sIndex[N];
      int<lower=1,upper=nGroup> gIndex[N];
      int<lower=1,upper=nBev> bIndex[N];
      real score[N];
      real gMean;
      real gSD;
      }

      parameters{
      real a;
      vector[nSubj] bSubj;
      vector[nGroup] bGroup;
      vector[nBev] bBev;
      vector[nBev] bGxB[nGroup];      // vector of vectors, stan no good with matrix
      vector[nBev] sigma;  
      real<lower=0> sigma_a;
      real<lower=0> sigma_s;
      real<lower=0> sigma_g;
      real<lower=0> sigma_b;
      real<lower=0> sigma_gb;
      }

      model{
      vector[N] mu;

      //hyper-priors
      sigma_s ~ normal(0,10);     
      sigma_g ~ normal(0,10);
      sigma_b ~ normal(0,10);
      sigma_gb ~ normal(0,10);


      //priors
      sigma ~ cauchy(0,1);
      a ~ normal(gMean, gSD);
      bSubj ~ normal(0, sigma_s);
      bGroup ~ normal(0,sigma_g);
      bBev ~ normal(0,sigma_b);
      for (i in 1:nGroup) {               //hierarchical prior on interaction
      bGxB[i] ~ normal(0, sigma_gb);
      }

      // likelihood
      for (i in 1:N){
      score[i] ~ normal(a + bGroup[gIndex[i]] + bBev[bIndex[i]] + bSubj[sIndex[i]] + bGxB[gIndex[i]][bIndex[i]], sigma[bIndex[i]]);
      }
      }

      generated quantities{
      real y_draw[N];
      for (i in 1:N) {
      y_draw[i] = normal_rng(a + bGroup[gIndex[i]] + bBev[bIndex[i]] + bSubj[sIndex[i]] + bGxB[gIndex[i]][bIndex[i]], sigma[bIndex[i]]);
      }
      }
      ", file = "temp.stan")

##### Step 3: generate the chains
mod <-  stan(file = "temp.stan",
             data = dList,
             iter = 5000,  
             warmup = 3000, 
             cores = 1,
             chains = 1)

接下来我们从联合后验中提取绘图,并生成组均值、上限和下限 95% HPDI 的估计值。首先我们需要一个函数来计算 HPDI

HPDIFunct <- function (vector) { 
  sortVec <- sort(vector)
  ninetyFiveVec <- ceiling(.95*length(sortVec))
  fiveVec <- length(sortVec) - length(ninetyFiveVec)
  diffVec <- sapply(1:fiveVec, function (i) sortVec[i + ninetyFiveVec] - sortVec[i])
  minVal <- sortVec[which.min(diffVec)]
  maxVal <- sortVec[which.min(diffVec) + ninetyFiveVec]
  return(list(sortVec, minVal, maxVal))
}

现在从后验中提取平局

#### Step 5: Posterior predictive checks
y_draw <- data.frame(extract(mod, pars = "y_draw"))


And plot the mean, lower HPDI and upper HPDI draws of these draws against the actual data. 

df$drawMean <- apply(y_draw, 2, mean)
df$HPDI_Low <- apply(y_draw, 2, function(i) HPDIFunct(i)[[2]][1])
df$HPDI_Hi <- apply(y_draw, 2, function(i) HPDIFunct(i)[[3]][1])

### Step 6: plot posterior draws against actual data
ggplot(df, aes(x = factor(bevType), colour = factor(group))) +
       geom_jitter(aes(y = score), shape = 1, position = position_dodge(width=0.9)) +
       geom_point(aes(y = drawMean), position = position_dodge(width=0.9), stat = "summary", fun.y = "mean", shape = 3, size = 3, stroke = 2) +
       geom_point(aes(y = HPDI_Low), position = position_dodge(width=0.9), stat = "summary", fun.y = "mean", shape = 1, size = 3, stroke = 1) +
       geom_point(aes(y = HPDI_Hi), position = position_dodge(width=0.9), stat = "summary", fun.y = "mean", shape = 1, size = 3, stroke = 1) +
       scale_colour_manual(name = "Experimental Group", labels = c("Group 1", "Group 2", "Group 3"), values = c("#616a6b", "#00AFBB", "#E7B800")) +  
       scale_x_discrete(labels = c("Water", "Decaf", "Coffee")) + 
       labs(x = "Beverage Type", y = "Expectancy of Withdrawal Alleviation") +
       scale_y_continuous(breaks = seq(0,10,2)) +
       theme(axis.text.x = element_text(size = 12),
             axis.title.x = element_text(face = "bold"),
             axis.title.y = element_text(face = "bold"),
             axis.text.y = element_text(size = 12),
             legend.title = element_text(size = 13, face = "bold"))

查看图表,对于水资源预期,该模型似乎很好地代表了数据的中心(十字)和分布(空心圆)。但这打破了 Decaf 和 Coffee 的预期。对于 Decaf 预期,较低的 HPDI 低于可能值的范围(下限 = 0),并且从后部绘制的图的分布(在每组中由空心圆圈表示)太大。 Coffee 组的 HPDI 上限也 高于 数据范围(上限 = 10),对于实际数据来说差距过大。

所以我的问题是:

如何将绘制从关节后方限制到数据的实际范围?

在 Stan 中是否有某种蛮力方法来限制从后方抽签?或者对三种饮料条件下的方差差异进行更具适应性的估计是否会更有效(在这种情况下,这更像是一个 CV 问题而不是 SO 问题)?

约束后验变量的标准方法是使用 link 函数对其进行变换。这就是逻辑回归和泊松回归等广义线性模型 (GLM) 的工作方式。例如,要从正数到无约束,我们使用对数变换。要从 (0, 1) 中的概率变为不受约束,我们使用对数几率变换。

如果您的结果是 1-10 范围内的序数值,则尊重该数据范围的常见方法是 ordinal logistic regression

为了扩展@Bob Carpenter 的回答,这里有两种方法可以解决这个问题。 (我最近有理由使用这两个,并且努力让它们起来 运行。这可能对像我这样的其他初学者有用。)

方法 1:有序 Logistic 回归

我们将假设每个用户对每个响应都有一个 "true" 期望,这是一个任意连续的尺度,并将其建模为一个潜在变量。如果用户的实际响应属于 K 类别,我们还会对这些类别之间的 K - 1 分割点进行建模。用户选择给定响应类别的概率等于相关切点之间逻辑 pdf 下的面积。

Stan 模型看起来像这样。主要区别在于该模型拟合了一个额外的 cutpoints 有序向量,并使用 ordered_logistic 分布。 (我还将 sigmas 上的先验更改为 Cauchy,以保持它们为正,然后切换到 non-centered parameterization。但这些更改与手头的问题无关。)编辑: 还为我们要对其进行预测的新(假设)观察添加了输入,并为这些预测添加了新的生成量。

data {
  // the real data
  int<lower=1> N;
  int<lower=1> nSubj;
  int<lower=1> nGroup;
  int<lower=1> nBev;
  int minResponse;
  int maxResponse;
  int<lower=1,upper=nSubj> sIndex[N];
  int<lower=1,upper=nGroup> gIndex[N];
  int<lower=1,upper=nBev> bIndex[N];
  int<lower=minResponse,upper=maxResponse> score[N];
  // hypothetical observations for new predictions
  int<lower=1> nNewPred;
  int<lower=0> nNewSubj;
  int<lower=0> nNewGroup;
  int<lower=0> nNewBev;
  int<lower=1,upper=nSubj+nNewSubj> sNewIndex[nNewPred];
  int<lower=1,upper=nGroup+nNewGroup> gNewIndex[nNewPred];
  int<lower=1,upper=nBev+nNewBev> bNewIndex[nNewPred];
}

parameters {
  real a;
  vector[nSubj] bSubj;
  vector[nGroup] bGroup;
  vector[nBev] bBev;
  vector[nBev] bGxB[nGroup];
  real<lower=0> sigma_s;
  real<lower=0> sigma_g;
  real<lower=0> sigma_b;
  real<lower=0> sigma_gb;
  ordered[maxResponse - minResponse] cutpoints;
}

model {
  // hyper-priors
  sigma_s ~ cauchy(0, 1);
  sigma_g ~ cauchy(0, 1);
  sigma_b ~ cauchy(0, 1);
  sigma_gb ~ cauchy(0, 1);
  // priors
  a ~ std_normal();
  bSubj ~ std_normal();
  bGroup ~ std_normal();
  bBev ~ std_normal();
  for (i in 1:nGroup) {
    bGxB[i] ~ std_normal();
  }
  // likelihood
  for(i in 1:N) {
    score[i] ~ ordered_logistic(a +
                                  (bGroup[gIndex[i]] * sigma_g) +
                                  (bBev[bIndex[i]] * sigma_b) +
                                  (bSubj[sIndex[i]] * sigma_s) +
                                  (bGxB[gIndex[i]][bIndex[i]] * sigma_gb),
                                cutpoints);
  }
}

generated quantities {
  real y_draw[N];
  real y_new_pred[nNewPred];
  vector[nGroup+nNewGroup] bNewGroup;
  vector[nBev+nNewBev] bNewBev;
  vector[nSubj+nNewSubj] bNewSubj;
  vector[nBev+nNewBev] bNewGxB[nGroup+nNewGroup];
  // generate posterior predictions for the real data
  for (i in 1:N) {
    y_draw[i] = ordered_logistic_rng(a +
                                       (bGroup[gIndex[i]] * sigma_g) +
                                       (bBev[bIndex[i]] * sigma_b) +
                                       (bSubj[sIndex[i]] * sigma_s) +
                                       (bGxB[gIndex[i]][bIndex[i]] * sigma_gb),
                                     cutpoints);
  }
  // generate predictions for the new observations
  for (i in 1:(nGroup+nNewGroup)) {
    if (i <= nGroup) { bNewGroup[i] = bGroup[i]; }
    else { bNewGroup[i] = normal_rng(0, 1); }
  }
  for (i in 1:(nBev+nNewBev)) {
    if (i <= nBev) { bNewBev[i] = bBev[i]; }
    else { bNewBev[i] = normal_rng(0, 1); }
  }
  for (i in 1:(nSubj+nNewSubj)) {
    if (i <= nSubj) { bNewSubj[i] = bSubj[i]; }
    else { bNewSubj[i] = normal_rng(0, 1); }
  }
  for (i in 1:(nBev+nNewBev)) {
    for(j in 1:(nGroup+nNewGroup)) {
      if (i <= nBev && j <= nGroup) { bNewGxB[i][j] = bGxB[i][j]; }
      else { bNewGxB[i][j] = normal_rng(0, 1); }
    }
  }
  for (i in 1:nNewPred) {
    y_new_pred[i] = ordered_logistic_rng(a +
                                           (bNewGroup[gNewIndex[i]] * sigma_g) +
                                           (bNewBev[bNewIndex[i]] * sigma_b) +
                                           (bNewSubj[sNewIndex[i]] * sigma_s) +
                                           (bNewGxB[gNewIndex[i]][bNewIndex[i]] * sigma_gb),
                                         cutpoints);
  }
}

您的数据集中的回复似乎被记录到最接近的十分之一,因此给我们提供了 0 到 10 之间的 101 个可能类别。为了将所有内容都保持为 Stan-friendly 整数,我们可以将所有回复乘以 10 .(我还为每个响应添加了一个,因为当其中一个可能的类别为零时我无法拟合模型。)编辑: 为假设的 "subject 47" 添加了新的测试数据, 每个 group/beverage.

一个观察值
new.pred.obs = expand.grid(group = 1:3, bevType = 2:3) %>%
  mutate(id = max(df$id) + 1)
dList <- list(N = 138,
              nSubj = 46,
              nGroup = 3,
              nBev = 3,
              minResponse = 1,
              maxResponse = 101,
              sIndex = df$id,
              gIndex = df$group,
              bIndex = df$bevType,
              score = (df$score * 10) + 1,
              nNewPred = nrow(new.pred.obs),
              nNewSubj = 1,
              nNewGroup = 0,
              nNewBev = 0,
              sNewIndex = new.pred.obs$id,
              gNewIndex = new.pred.obs$group,
              bNewIndex = new.pred.obs$bevType)

我们提取y_draw后,我们可以将其转换回原来的比例:

y_draw <- (data.frame(extract(mod, pars = "y_draw")) - 1) / 10

其他都和以前一样。现在后验预测正确地限制在 [0, 10].

为了在原始量表上得出关于饮料之间差异的推论,我们可以使用我们的假设数据的预测。对于每个样本,我们在每个 group/beverage 组合中都有一个新主题的预测输出。我们可以比较每个样本和组中 "coffee" 与 "decaf" 的响应:

# Get predictions for hypothetical observations
new.preds.df = data.frame(rstan::extract(mod, pars = "y_new_pred")) %>%
  rownames_to_column("sample") %>%
  gather(obs, pred, -sample) %>%
  mutate(obs = gsub("y_new_pred\.", "", obs),
         pred = (pred - 1) / 10) %>%
  inner_join(new.pred.obs %>%
               rownames_to_column("obs") %>%
               mutate(bevType = paste("bev", bevType, sep = ""),
                      group = paste("Group", group)),
             by = c("obs")) %>%
  select(-obs) %>%
  spread(bevType, pred) %>%
  mutate(bevTypeDiff = bev3 - bev2)

(或者,我们可以在 R 中或在单独的 Stan 模型中对新观察结果进行此预测;有关如何完成此操作的示例,请参阅 here。)

方法 2:Beta 回归

一旦我们得到多达 101 个响应类别,将这些可能性称为离散类别似乎有点奇怪。感觉更自然地说,就像您的原始模型试图做的那样,我们正在捕获恰好介于 0 和 10 之间的连续结果。此外,在有序逻辑回归中,响应类别不必定期相对于潜在变量间隔。 (这是一个特性,而不是错误;例如,对于 Likert 响应,不能保证 "Strongly agree" 和 "Agree" 之间的差异与 "Agree" 和 "Neither agree not disagree".) 因此,很难说 "distance" 某个特定因素会导致响应在原始尺度上移动(与潜在变量的尺度相反)。但是上面模型推断的切点间隔非常规律,这再次表明数据集中的结果已经合理 scale-like:

# Get the sampled parameters
sampled.params.df = data.frame(as.array(mod)[,1,]) %>%
  select(-matches("y_draw")) %>%
  rownames_to_column("iteration")

# Plot selected cutpoints
sampled.params.df %>%
  select(matches("cutpoints")) %>%
  gather(cutpoint, value) %>%
  mutate(cutpoint.num = as.numeric(gsub("^cutpoints\.([0-9]+)\.$", "\1", cutpoint))) %>%
  group_by(cutpoint.num) %>%
  summarize(mean.value = mean(value),
            lower.95 = quantile(value, 0.025),
            lower.50 = quantile(value, 0.25),
            upper.50 = quantile(value, 0.75),
            upper.95 = quantile(value, .975)) %>%
  ggplot(aes(x = cutpoint.num, y = mean.value)) +
  geom_point(size = 3) +
  geom_linerange(aes(ymin = lower.95, ymax = upper.95)) +
  geom_linerange(aes(ymin = lower.50, ymax = upper.50), size = 2) +
  scale_x_continuous("cutpoint", breaks = seq(0, 100, 10)) +
  scale_y_continuous("") +
  theme_bw()

(粗线和细线分别代表 50% 和 95% 的区间。我很享受每 10 个分界点的小 "jump",这表明受试者被视为 5.9 与 6.0 的差异更大比5.8 vs. 5.9。但效果似乎很温和。规模似乎也向高端延伸了一点,但同样,它并不太剧烈。)

对于有界区间内的连续结果,我们可以使用beta distribution; see here and here进一步讨论。

对于 beta 分布,我们需要两个参数,muphi,这两个参数都必须是正数。在此示例中,我允许 mu 不受限制并在将其送入 beta 分布之前应用 inv_logit;我将 phi 约束为正数,并先验给它一个柯西。但是您可以通过多种方式做到这一点。我还编写了一整套 mu 参数,但只有一个 phi;同样,您可以尝试其他选项。

data {
  int<lower=1> N;
  int<lower=1> nSubj;
  int<lower=1> nGroup;
  int<lower=1> nBev;
  int<lower=1,upper=nSubj> sIndex[N];
  int<lower=1,upper=nGroup> gIndex[N];
  int<lower=1,upper=nBev> bIndex[N];
  vector<lower=0,upper=1>[N] score;
}

parameters {
  real a;
  real a_phi;
  vector[nSubj] bSubj;
  vector[nGroup] bGroup;
  vector[nBev] bBev;
  vector[nBev] bGxB[nGroup];
  real<lower=0> sigma_s;
  real<lower=0> sigma_g;
  real<lower=0> sigma_b;
  real<lower=0> sigma_gb;
}

model {
  vector[N] mu;
  //hyper-priors
  sigma_s ~ cauchy(0, 1);
  sigma_g ~ cauchy(0, 1);
  sigma_b ~ cauchy(0, 1);
  sigma_gb ~ cauchy(0, 1);
  //priors
  a ~ std_normal();
  a_phi ~ cauchy(0, 1);
  bSubj ~ std_normal();
  bGroup ~ std_normal();
  bBev ~ std_normal();
  for (i in 1:nGroup) {
    bGxB[i] ~ std_normal();
  }
  // likelihood
  for(i in 1:N) {
    mu[i] = a +
             (bGroup[gIndex[i]] * sigma_g) +
             (bBev[bIndex[i]] * sigma_b) +
             (bSubj[sIndex[i]] * sigma_s) +
             (bGxB[gIndex[i]][bIndex[i]] * sigma_gb);
    score[i] ~ beta(inv_logit(mu[i]) .* a_phi,
                    (1 - inv_logit(mu[i])) .* a_phi);
  }
}

generated quantities {
  real y_draw[N];
  real temp_mu;
  for (i in 1:N) {
    temp_mu = a +
               (bGroup[gIndex[i]] * sigma_g) +
               (bBev[bIndex[i]] * sigma_b) +
               (bSubj[sIndex[i]] * sigma_s) +
               (bGxB[gIndex[i]][bIndex[i]] * sigma_gb);
    y_draw[i] = beta_rng(inv_logit(temp_mu) .* a_phi,
                         (1 - inv_logit(temp_mu)) .* a_phi);
  }
}

(0, 1) 支持 beta 分布,所以我们将观察到的分数除以 10。(如果我们给它的分数正好是 0 或 1,模型也会失败,所以我将所有这些分数转换为 0.01和 0.99,分别。)

dList.beta <- list(N = 138,
                   nSubj = 46,
                   nGroup = 3,
                   nBev = 3,
                   sIndex = df$id,
                   gIndex = df$group,
                   bIndex = df$bevType,
                   score = ifelse(df$score == 0, 0.01,
                                  ifelse(df$score == 10, 0.99,
                                         df$score / 10)))

提取y_draw时取消转换,然后步骤同上。

y_draw.beta <- data.frame(extract(mod.beta, pars = "y_draw")) * 10

再一次,后部绘图的边界是正确的。