Применение функции к вложенной структуре данных Tibble в R

Я новичок в R и Tidyverse и мне нужно вычислить квантиль данных, которые вложены. Например, рассмотрим следующую таблицу:

> tbl=  
  subgroup        boot              
  <chr>           <list>            
1 aaa           <tibble [30 × 23]>
2 bbb           <tibble [30 × 23]>
3 ccc           <tibble [30 × 23]>

где boot содержит еще один тиббл с 30 загрузочными репликами и 23 столбцами (различные переменные). Например:

> tbl$boot
[[1]]
# A tibble: 30 x 23
   optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
              <dbl> <dbl>   <dbl>                <dbl>               <dbl> <dbl>   <dbl>         <dbl>           <dbl>         <dbl>           <dbl>   <dbl>
 1              187 0.967   0.903                    3                   4 0.923   0.765             1           1             0.870           0.556   0.845
 2              270 0.946   0.729                    5                   5 0.872   0.643             1           0.625         0.783           0.667   0.747
 3              195 0.926   0.886                   11                   2 0.718   0.833             1           1             0.56            0.6     0.477
 4              187 0.881   0.893                    9                   3 0.769   0.8               1           1             0.625           0.625   0.562
 5              195 0.963   0.933                    7                   2 0.821   0.875             1           1             0.682           0.667   0.651
 6              203 0.926   0.944                    7                   2 0.821   0.882             1           1             0.65            0.778   0.644
 7              195 0.944   0.931                    7                   2 0.821   0.882             1           1             0.611           0.778   0.629
 8              153 0.908   1                        4                   4 0.897   0.667             1           1             0.789           0.5     0.794
 9              203 0.962   0.922                    8                   2 0.795   0.875             1           1             0.652           0.75    0.606
10              195 0.883   0.94                    11                   2 0.718   0.9               1           1             0.542           0.8     0.476
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
#   FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>

[[2]]
# A tibble: 30 x 23
   optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
              <dbl> <dbl>   <dbl>                <dbl>               <dbl> <dbl>   <dbl>         <dbl>           <dbl>         <dbl>           <dbl>   <dbl>
 1               72 0.842   0.81                    11                   6 0.788   0.7               1           1             0.577           0.4     0.577
 2               72 0.735   0.95                    10                   5 0.808   0.75              1           1             0.545           0.5     0.581
 3               80 0.787   0.907                   11                   5 0.788   0.667             1           0.833         0.522           0.556   0.549
 4               72 0.856   0.833                    9                   6 0.827   0.647             1           1             0.64            0.333   0.649
 5               72 0.88    0.778                   11                   5 0.788   0.706             1           1             0.593           0.375   0.583
 6               72 0.666   0.959                   16                   4 0.692   0.818             1           1             0.304           0.636   0.328
 7               43 0.708   0.941                   19                   7 0.635   0.731             1           1             0.24            0.462   0.247
 8               68 0.866   0.85                    12                   6 0.769   0.7               1           1             0.5             0.4     0.519
 9               80 0.801   0.872                   16                   5 0.692   0.773             1           0.923         0.407           0.556   0.398
10               80 0.877   0.809                    8                   8 0.846   0.619             1           0.909         0.652           0.3     0.677
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
#   FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>

[[3]]
# A tibble: 30 x 23
   optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
              <dbl> <dbl>   <dbl>                <dbl>               <dbl> <dbl>   <dbl>         <dbl>           <dbl>         <dbl>           <dbl>   <dbl>
 1             187  0.892   0.95                     8                   2 0.778   0.846             1               1         0.529           0.75    0.543
 2             144. 0.928   0.929                    5                   2 0.861   0.818             1               1         0.688           0.5     0.710
 3             142. 0.926   0.889                    6                   3 0.833   0.75              1               1         0.667           0.5     0.667
 4             187  0.931   0.929                    5                   1 0.861   0.889             1               1         0.688           0.857   0.710
 5             187  0.916   0.852                    3                   4 0.917   0.733             1               1         0.812           0.333   0.828
 6             142. 0.937   0.875                    5                   3 0.861   0.786             1               1         0.667           0.625   0.7  
 7             187  0.963   0.857                    6                   3 0.833   0.75              1               1         0.667           0.571   0.667
 8             142. 0.950   0.917                    6                   2 0.833   0.8               1               1         0.647           0.667   0.659
 9             187  0.950   0.971                    3                   3 0.917   0.75              1               1         0.842           0.4     0.834
10             150. 0.938   0.952                    5                   3 0.861   0.769             1               1         0.688           0.5     0.710
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
#   FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>

Итак, из boot Тибле мне нужно только извлечь optimal_cutpoint столбец и вычислить квантили (2,5% и 97,5%) для каждого из 'aaa', 'bbb', 'ccc':

> qnt.aaa <- quantile(tbl$boot[[1]]$optimal_cutpoint, c(0.025, 0.975))
> qnt.bbb <- quantile(tbl$boot[[2]]$optimal_cutpoint, c(0.025, 0.975))
> qnt.ccc <- quantile(tbl$boot[[3]]$optimal_cutpoint, c(0.025, 0.975))

Так что в идеале я хотел бы иметь следующую таблицу:

> tbl.new=  
      subgroup        ci.low         ci.upp         
      <chr>           <dbl>          <dbl>
    1 aaa           qnt.aaa[1]    qnt.aaa[2]
    2 bbb           qnt.bbb[1]    qnt.bbb[2]
    3 ccc           qnt.ccc[1]    qnt.ccc[2]

(конечно, числовые значения вместо qnt.)

Я думаю, что я могу реализовать это очень неловко, но я хотел бы узнать, как использовать подход Tidyverse и сделать его аккуратным.

2 ответа

Решение

Ты можешь использовать rowwise обрабатывать каждый ряд индивидуально, mutate добавить новые столбцы и, наконец, ungroup снова объединить строки для дальнейших расчетов:

library(tidyverse)
tbl.new <- 
  tbl %>%
  rowwise() %>%
  mutate(ci.low = quantile(boot$optimal_cutpoint, 0.025),
         ci.up = quantile(boot$optimal_cutpoint, 0.975)) %>%
  ungroup()

Обратите внимание, что код не проверен, поскольку данные вашего примера не могут напрямую использоваться в R (возможно, попробуйте dput небольшая порция в следующий раз:))

Мы могли бы использовать map после nestв "подгруппе"

library(tidyverse)
tbl %>% 
  group_by(subgroup) %>%
  nest %>% 
  mutate(cls = map(data, ~ 
                quantile(.x$boot[[1]]$optimal_cutpoint, c(0.025, 0.975)) %>% 
                as.list %>% 
                as_tibble %>% 
                rename_all(~ c("ci.low", "ci.upp")))) %>% 
  select(-data) %>%
  unnest

данные

tbl <- structure(list(subgroup = c("aaa", "bbb", "ccc"), boot = list(
    structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L, 
    195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946, 
    0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883), 
        AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944, 
        0.931, 1, 0.922, 0.94), misclassification_c. = c(3L, 
        5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L, 
        5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923, 
        0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795, 
        0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875, 
        0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1, 
        0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87, 
        0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652, 
        0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625, 
        0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845, 
        0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606, 
        0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob", 
    "misclassification_c.", "misclassification_.", "acc_b", "acc_oob", 
    "sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob", 
    "kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7", 
    "8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")), 
    structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L, 
    195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946, 
    0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883), 
        AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944, 
        0.931, 1, 0.922, 0.94), misclassification_c. = c(3L, 
        5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L, 
        5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923, 
        0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795, 
        0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875, 
        0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1, 
        0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87, 
        0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652, 
        0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625, 
        0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845, 
        0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606, 
        0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob", 
    "misclassification_c.", "misclassification_.", "acc_b", "acc_oob", 
    "sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob", 
    "kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7", 
    "8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")), 
    structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L, 
    195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946, 
    0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883), 
        AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944, 
        0.931, 1, 0.922, 0.94), misclassification_c. = c(3L, 
        5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L, 
        5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923, 
        0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795, 
        0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875, 
        0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1, 
        0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87, 
        0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652, 
        0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625, 
        0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845, 
        0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606, 
        0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob", 
    "misclassification_c.", "misclassification_.", "acc_b", "acc_oob", 
    "sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob", 
    "kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7", 
    "8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")))), .Names = c("subgroup", 
"boot"), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
Другие вопросы по тегам