Применение функции к вложенной структуре данных Tibble в R
Я новичок в R и Tidyverse и мне нужно вычислить квантиль данных, которые вложены. Например, рассмотрим следующую таблицу:
> tbl=
subgroup boot
<chr> <list>
1 aaa <tibble [30 × 23]>
2 bbb <tibble [30 × 23]>
3 ccc <tibble [30 × 23]>
где boot
содержит еще один тиббл с 30 загрузочными репликами и 23 столбцами (различные переменные). Например:
> tbl$boot
[[1]]
# A tibble: 30 x 23
optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 187 0.967 0.903 3 4 0.923 0.765 1 1 0.870 0.556 0.845
2 270 0.946 0.729 5 5 0.872 0.643 1 0.625 0.783 0.667 0.747
3 195 0.926 0.886 11 2 0.718 0.833 1 1 0.56 0.6 0.477
4 187 0.881 0.893 9 3 0.769 0.8 1 1 0.625 0.625 0.562
5 195 0.963 0.933 7 2 0.821 0.875 1 1 0.682 0.667 0.651
6 203 0.926 0.944 7 2 0.821 0.882 1 1 0.65 0.778 0.644
7 195 0.944 0.931 7 2 0.821 0.882 1 1 0.611 0.778 0.629
8 153 0.908 1 4 4 0.897 0.667 1 1 0.789 0.5 0.794
9 203 0.962 0.922 8 2 0.795 0.875 1 1 0.652 0.75 0.606
10 195 0.883 0.94 11 2 0.718 0.9 1 1 0.542 0.8 0.476
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
# FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>
[[2]]
# A tibble: 30 x 23
optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 72 0.842 0.81 11 6 0.788 0.7 1 1 0.577 0.4 0.577
2 72 0.735 0.95 10 5 0.808 0.75 1 1 0.545 0.5 0.581
3 80 0.787 0.907 11 5 0.788 0.667 1 0.833 0.522 0.556 0.549
4 72 0.856 0.833 9 6 0.827 0.647 1 1 0.64 0.333 0.649
5 72 0.88 0.778 11 5 0.788 0.706 1 1 0.593 0.375 0.583
6 72 0.666 0.959 16 4 0.692 0.818 1 1 0.304 0.636 0.328
7 43 0.708 0.941 19 7 0.635 0.731 1 1 0.24 0.462 0.247
8 68 0.866 0.85 12 6 0.769 0.7 1 1 0.5 0.4 0.519
9 80 0.801 0.872 16 5 0.692 0.773 1 0.923 0.407 0.556 0.398
10 80 0.877 0.809 8 8 0.846 0.619 1 0.909 0.652 0.3 0.677
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
# FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>
[[3]]
# A tibble: 30 x 23
optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 187 0.892 0.95 8 2 0.778 0.846 1 1 0.529 0.75 0.543
2 144. 0.928 0.929 5 2 0.861 0.818 1 1 0.688 0.5 0.710
3 142. 0.926 0.889 6 3 0.833 0.75 1 1 0.667 0.5 0.667
4 187 0.931 0.929 5 1 0.861 0.889 1 1 0.688 0.857 0.710
5 187 0.916 0.852 3 4 0.917 0.733 1 1 0.812 0.333 0.828
6 142. 0.937 0.875 5 3 0.861 0.786 1 1 0.667 0.625 0.7
7 187 0.963 0.857 6 3 0.833 0.75 1 1 0.667 0.571 0.667
8 142. 0.950 0.917 6 2 0.833 0.8 1 1 0.647 0.667 0.659
9 187 0.950 0.971 3 3 0.917 0.75 1 1 0.842 0.4 0.834
10 150. 0.938 0.952 5 3 0.861 0.769 1 1 0.688 0.5 0.710
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
# FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>
Итак, из boot
Тибле мне нужно только извлечь optimal_cutpoint
столбец и вычислить квантили (2,5% и 97,5%) для каждого из 'aaa', 'bbb', 'ccc':
> qnt.aaa <- quantile(tbl$boot[[1]]$optimal_cutpoint, c(0.025, 0.975))
> qnt.bbb <- quantile(tbl$boot[[2]]$optimal_cutpoint, c(0.025, 0.975))
> qnt.ccc <- quantile(tbl$boot[[3]]$optimal_cutpoint, c(0.025, 0.975))
Так что в идеале я хотел бы иметь следующую таблицу:
> tbl.new=
subgroup ci.low ci.upp
<chr> <dbl> <dbl>
1 aaa qnt.aaa[1] qnt.aaa[2]
2 bbb qnt.bbb[1] qnt.bbb[2]
3 ccc qnt.ccc[1] qnt.ccc[2]
(конечно, числовые значения вместо qnt.
)
Я думаю, что я могу реализовать это очень неловко, но я хотел бы узнать, как использовать подход Tidyverse и сделать его аккуратным.
2 ответа
Ты можешь использовать rowwise
обрабатывать каждый ряд индивидуально, mutate
добавить новые столбцы и, наконец, ungroup
снова объединить строки для дальнейших расчетов:
library(tidyverse)
tbl.new <-
tbl %>%
rowwise() %>%
mutate(ci.low = quantile(boot$optimal_cutpoint, 0.025),
ci.up = quantile(boot$optimal_cutpoint, 0.975)) %>%
ungroup()
Обратите внимание, что код не проверен, поскольку данные вашего примера не могут напрямую использоваться в R (возможно, попробуйте dput
небольшая порция в следующий раз:))
Мы могли бы использовать map
после nest
в "подгруппе"
library(tidyverse)
tbl %>%
group_by(subgroup) %>%
nest %>%
mutate(cls = map(data, ~
quantile(.x$boot[[1]]$optimal_cutpoint, c(0.025, 0.975)) %>%
as.list %>%
as_tibble %>%
rename_all(~ c("ci.low", "ci.upp")))) %>%
select(-data) %>%
unnest
данные
tbl <- structure(list(subgroup = c("aaa", "bbb", "ccc"), boot = list(
structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L,
195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946,
0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883),
AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944,
0.931, 1, 0.922, 0.94), misclassification_c. = c(3L,
5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L,
5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923,
0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795,
0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875,
0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1,
0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87,
0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652,
0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625,
0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845,
0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606,
0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob",
"misclassification_c.", "misclassification_.", "acc_b", "acc_oob",
"sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob",
"kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")),
structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L,
195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946,
0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883),
AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944,
0.931, 1, 0.922, 0.94), misclassification_c. = c(3L,
5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L,
5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923,
0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795,
0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875,
0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1,
0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87,
0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652,
0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625,
0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845,
0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606,
0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob",
"misclassification_c.", "misclassification_.", "acc_b", "acc_oob",
"sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob",
"kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")),
structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L,
195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946,
0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883),
AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944,
0.931, 1, 0.922, 0.94), misclassification_c. = c(3L,
5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L,
5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923,
0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795,
0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875,
0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1,
0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87,
0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652,
0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625,
0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845,
0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606,
0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob",
"misclassification_c.", "misclassification_.", "acc_b", "acc_oob",
"sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob",
"kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")))), .Names = c("subgroup",
"boot"), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))