将函数应用于 R 中的嵌套 tibble 数据结构

Applying function to a nested tibble data structure in R

我是 R 和 tidyverse 的新手,需要计算嵌套数据的分位数。例如,考虑以下 table:

> tbl=  
  subgroup        boot              
  <chr>           <list>            
1 aaa           <tibble [30 × 23]>
2 bbb           <tibble [30 × 23]>
3 ccc           <tibble [30 × 23]>

其中 boot 列的每个元素是另一个 tibble,包含 30 次 bootstrap 重复(行)和 23 列(各种变量)。例如:

> tbl$boot
[[1]]
# A tibble: 30 x 23
   optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
              <dbl> <dbl>   <dbl>                <dbl>               <dbl> <dbl>   <dbl>         <dbl>           <dbl>         <dbl>           <dbl>   <dbl>
 1              187 0.967   0.903                    3                   4 0.923   0.765             1           1             0.870           0.556   0.845
 2              270 0.946   0.729                    5                   5 0.872   0.643             1           0.625         0.783           0.667   0.747
 3              195 0.926   0.886                   11                   2 0.718   0.833             1           1             0.56            0.6     0.477
 4              187 0.881   0.893                    9                   3 0.769   0.8               1           1             0.625           0.625   0.562
 5              195 0.963   0.933                    7                   2 0.821   0.875             1           1             0.682           0.667   0.651
 6              203 0.926   0.944                    7                   2 0.821   0.882             1           1             0.65            0.778   0.644
 7              195 0.944   0.931                    7                   2 0.821   0.882             1           1             0.611           0.778   0.629
 8              153 0.908   1                        4                   4 0.897   0.667             1           1             0.789           0.5     0.794
 9              203 0.962   0.922                    8                   2 0.795   0.875             1           1             0.652           0.75    0.606
10              195 0.883   0.94                    11                   2 0.718   0.9               1           1             0.542           0.8     0.476
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
#   FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>

[[2]]
# A tibble: 30 x 23
   optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
              <dbl> <dbl>   <dbl>                <dbl>               <dbl> <dbl>   <dbl>         <dbl>           <dbl>         <dbl>           <dbl>   <dbl>
 1               72 0.842   0.81                    11                   6 0.788   0.7               1           1             0.577           0.4     0.577
 2               72 0.735   0.95                    10                   5 0.808   0.75              1           1             0.545           0.5     0.581
 3               80 0.787   0.907                   11                   5 0.788   0.667             1           0.833         0.522           0.556   0.549
 4               72 0.856   0.833                    9                   6 0.827   0.647             1           1             0.64            0.333   0.649
 5               72 0.88    0.778                   11                   5 0.788   0.706             1           1             0.593           0.375   0.583
 6               72 0.666   0.959                   16                   4 0.692   0.818             1           1             0.304           0.636   0.328
 7               43 0.708   0.941                   19                   7 0.635   0.731             1           1             0.24            0.462   0.247
 8               68 0.866   0.85                    12                   6 0.769   0.7               1           1             0.5             0.4     0.519
 9               80 0.801   0.872                   16                   5 0.692   0.773             1           0.923         0.407           0.556   0.398
10               80 0.877   0.809                    8                   8 0.846   0.619             1           0.909         0.652           0.3     0.677
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
#   FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>

[[3]]
# A tibble: 30 x 23
   optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
              <dbl> <dbl>   <dbl>                <dbl>               <dbl> <dbl>   <dbl>         <dbl>           <dbl>         <dbl>           <dbl>   <dbl>
 1             187  0.892   0.95                     8                   2 0.778   0.846             1               1         0.529           0.75    0.543
 2             144. 0.928   0.929                    5                   2 0.861   0.818             1               1         0.688           0.5     0.710
 3             142. 0.926   0.889                    6                   3 0.833   0.75              1               1         0.667           0.5     0.667
 4             187  0.931   0.929                    5                   1 0.861   0.889             1               1         0.688           0.857   0.710
 5             187  0.916   0.852                    3                   4 0.917   0.733             1               1         0.812           0.333   0.828
 6             142. 0.937   0.875                    5                   3 0.861   0.786             1               1         0.667           0.625   0.7  
 7             187  0.963   0.857                    6                   3 0.833   0.75              1               1         0.667           0.571   0.667
 8             142. 0.950   0.917                    6                   2 0.833   0.8               1               1         0.647           0.667   0.659
 9             187  0.950   0.971                    3                   3 0.917   0.75              1               1         0.842           0.4     0.834
10             150. 0.938   0.952                    5                   3 0.861   0.769             1               1         0.688           0.5     0.710
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
#   FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>

因此,我只需要从 boot tibble 中提取 optimal_cutpoint 列,并分别为 'aaa'、'bbb'、'ccc' 计算分位数(2.5% 和 97.5%):

> qnt.aaa <- quantile(tbl$boot[[1]]$optimal_cutpoint, c(0.025, 0.975))
> qnt.bbb <- quantile(tbl$boot[[2]]$optimal_cutpoint, c(0.025, 0.975))
> qnt.ccc <- quantile(tbl$boot[[3]]$optimal_cutpoint, c(0.025, 0.975))

所以理想情况下我想要以下table:

> tbl.new=  
      subgroup        ci.low         ci.upp         
      <chr>           <dbl>          <dbl>
    1 aaa           qnt.aaa[1]    qnt.aaa[2]
    2 bbb           qnt.bbb[1]    qnt.bbb[2]
    3 ccc           qnt.ccc[1]    qnt.ccc[2]

(当然,表中应该是实际的数值,而不是 qnt.aaa[1] 这样的表达式。)

我想我可以用一种非常笨拙的方式实现它,但我想学习如何使用 tidyverse 方法并使其整洁。

您可以使用 rowwise 单独处理每一行,mutate 添加新列,最后 ungroup 再次组合行以进行进一步计算:

library(tidyverse)
# For each subgroup, compute the 2.5% / 97.5% percentile bounds of the
# bootstrapped `optimal_cutpoint` values stored in the nested `boot` tibble.
# The result has exactly the columns subgroup, ci.low and ci.upp.
tbl.new <-
  tbl %>%
  # rowwise() makes `boot` refer to the single nested tibble of the current row
  rowwise() %>%
  mutate(
    # names = FALSE drops the "2.5%" / "97.5%" labels quantile() attaches
    ci.low = quantile(boot$optimal_cutpoint, 0.025, names = FALSE),
    ci.upp = quantile(boot$optimal_cutpoint, 0.975, names = FALSE)
  ) %>%
  # leave row-wise mode so later verbs operate on whole columns again
  ungroup() %>%
  # keep only the columns of the requested result table (drops `boot`)
  select(subgroup, ci.low, ci.upp)

请注意,代码未经测试,因为您的示例数据不能直接在 R 中使用(也许下次尝试 dput 一小部分 :))

我们可以先按 'subgroup' 分组并 nest,然后使用 map 对每个嵌套的数据计算分位数:

library(tidyverse)
# Nest the data by subgroup, then map over each nested piece to compute the
# 2.5% / 97.5% quantiles of `optimal_cutpoint`, returned as a one-row tibble
# with columns ci.low and ci.upp, and finally unnest those into plain columns.
tbl %>%
  group_by(subgroup) %>%
  nest() %>%
  mutate(cls = map(data, ~
    # each nested `data` holds one row, so boot[[1]] is that row's tibble
    quantile(.x$boot[[1]]$optimal_cutpoint, c(0.025, 0.975)) %>%
      # replace the "2.5%" / "97.5%" labels with the desired column names
      setNames(c("ci.low", "ci.upp")) %>%
      as.list() %>%
      as_tibble())) %>%
  select(-data) %>%
  # name the column explicitly: a bare unnest() warns in tidyr >= 1.0
  unnest(cls) %>%
  # drop the grouping left over from group_by()
  ungroup()

数据

# Reproducible example data: three subgroups ("aaa", "bbb", "ccc"), each with
# a nested `boot` tibble of 10 rows and 12 columns. The three nested tibbles
# hold the same values, so one is built and repeated.
tbl <- local({
  tbl_class <- c("tbl_df", "tbl", "data.frame")

  # A single nested bootstrap table.
  boot_one <- structure(
    list(
      optimal_cutpoint = c(
        187L, 270L, 195L, 187L, 195L, 203L, 195L, 153L, 203L, 195L
      ),
      AUC_b = c(
        0.967, 0.946, 0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883
      ),
      AUC_oob = c(
        0.903, 0.729, 0.886, 0.893, 0.933, 0.944, 0.931, 1, 0.922, 0.94
      ),
      misclassification_c. = c(3L, 5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L),
      misclassification_. = c(4L, 5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L),
      acc_b = c(
        0.923, 0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795, 0.718
      ),
      acc_oob = c(
        0.765, 0.643, 0.833, 0.8, 0.875, 0.882, 0.882, 0.667, 0.875, 0.9
      ),
      sensitivity_b = rep(1L, 10L),
      sensitivity_oob = c(1, 0.625, 1, 1, 1, 1, 1, 1, 1, 1),
      specificity_b = c(
        0.87, 0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652, 0.542
      ),
      specificity_oob = c(
        0.556, 0.667, 0.6, 0.625, 0.667, 0.778, 0.778, 0.5, 0.75, 0.8
      ),
      kappa_b = c(
        0.845, 0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606, 0.476
      )
    ),
    row.names = as.character(1:10),
    class = tbl_class
  )

  # Outer table: one row per subgroup, with `boot` as a list column.
  structure(
    list(
      subgroup = c("aaa", "bbb", "ccc"),
      boot = rep(list(boot_one), 3L)
    ),
    row.names = c(NA, -3L),
    class = tbl_class
  )
})