即使缺少数据点,如何以特定顺序(月年)显示宽 table?
How to display wide table with specific order (month year) even when data points are missing?
> df_1
# A tibble: 47 x 3
# Groups: therapy_class [9]
therapy_class Year_month count
<ord> <yearmon> <int>
1 ALK Inhibitors Dec 2019 16
2 ALK Inhibitors Jan 2020 14
3 ALK Inhibitors Feb 2020 14
4 ALK Inhibitors Mar 2020 22
5 ALK Inhibitors Apr 2020 13
6 ALK Inhibitors May 2020 17
7 Anti-VEGF-based therapies Dec 2019 33
8 Anti-VEGF-based therapies Jan 2020 35
9 Anti-VEGF-based therapies Feb 2020 36
10 Anti-VEGF-based therapies Mar 2020 20
# … with 37 more rows
A tibble: 10 x 7
therapy_class `Dec 2019` `Jan 2020` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020`
<ord> <chr> <chr> <chr> <chr> <chr> <chr>
1 ALK Inhibitors "16 <br>[2.7%]" "14 <br>[2.0%]" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%]" "17 <br>[3.4%]"
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "35 <br>[4.9%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%]" "20 <br>[4.0%]"
3 EGFR TKIs "52 <br>[8.8%]" "57 <br>[8.0%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%]" "49 <br>[9.8%]"
4 EGFR-antibody based therapies "" "" "" "" "" ""
5 Non-platinum-based chemotherapy… "1 <br>[0.2%]" "4 <br>[0.6%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" ""
6 IO-based therapies "308 <br>[52.0%]" "385 <br>[54.0%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]"
7 Platinum-based chemotherapy com… "123 <br>[20.8%]" "147 <br>[20.6%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]"
8 Single agent chemotherapies "29 <br>[4.9%]" "33 <br>[4.6%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%]" "22 <br>[4.4%]"
9 Other "30 <br>[5.1%]" "38 <br>[5.3%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%]" "21 <br>[4.2%]"
10 <strong>Total</strong> "<strong>592</str… "<strong>713</str… "<strong>631</st… "<strong>668</st… "<strong>612</st… "<strong>501</st…
> df_2
# A tibble: 46 x 3
# Groups: therapy_class [9]
therapy_class Year_month count
<ord> <yearmon> <int>
1 ALK Inhibitors Dec 2019 16
2 ALK Inhibitors Feb 2020 14
3 ALK Inhibitors Mar 2020 22
4 ALK Inhibitors Apr 2020 13
5 ALK Inhibitors May 2020 17
6 Anti-VEGF-based therapies Dec 2019 33
7 Anti-VEGF-based therapies Jan 2020 35
8 Anti-VEGF-based therapies Feb 2020 36
9 Anti-VEGF-based therapies Mar 2020 20
10 Anti-VEGF-based therapies Apr 2020 21
# … with 36 more rows
> t2
# A tibble: 10 x 7
therapy_class `Dec 2019` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020` `Jan 2020`
<ord> <chr> <chr> <chr> <chr> <chr> <chr>
1 ALK Inhibitors "16 <br>[2.7%]" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%]" "17 <br>[3.4%]" ""
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%]" "20 <br>[4.0%]" "35 <br>[5.0%]"
3 EGFR TKIs "52 <br>[8.8%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%]" "49 <br>[9.8%]" "57 <br>[8.2%]"
4 EGFR-antibody based therapies "" "" "" "" "" ""
5 Non-platinum-based chemotherapy… "1 <br>[0.2%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" "" "4 <br>[0.6%]"
6 IO-based therapies "308 <br>[52.0%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]" "385 <br>[55.1%]"
7 Platinum-based chemotherapy com… "123 <br>[20.8%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]" "147 <br>[21.0%]"
8 Single agent chemotherapies "29 <br>[4.9%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%]" "22 <br>[4.4%]" "33 <br>[4.7%]"
9 Other "30 <br>[5.1%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%]" "21 <br>[4.2%]" "38 <br>[5.4%]"
10 <strong>Total</strong> "<strong>592</str… "<strong>631</str… "<strong>668</st… "<strong>612</st… "<strong>501</st… "<strong>699</st…
>
我正在尝试创建一个宽 table,其中包含长 table 的计数和百分比。列是 'Month Year',需要按顺序排列。我的问题是,当第一组(ALK 抑制剂)的某些 'Month Year' 缺少行时,列的顺序就会被打乱。缺少的 'Month Year' 放在最后。另外,长 table 不是固定的 table。它是从用户可以选择月份年份范围的函数生成的。所以 Year_month 列可以有任何范围。
在这个例子中,我使用了 2019 年 12 月到 2020 年 5 月这 6 个月的范围。 “df_1”有 6 个月,所以结果宽 table 符合预期。 “df_2”缺少 ALK 抑制剂的 2020 年 1 月。所以结果 table 最后有 'Jan 2020'。
这是我正在使用的生成宽 table:
的代码
df_2 %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")), list(
~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]", 100 * (. / sum(., na.rm = TRUE)))))
))
这里是示例数据df_2
structure(list(therapy_class = structure(c(1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), Year_month = structure(c(2019.91666666667,
2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333,
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25,
2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667,
2020.25, 2020.33333333333, NA, 2019.91666666667, 2020, 2020.08333333333,
2020.25, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667,
2020.25, 2020.33333333333, 2019.91666666667, 2020, 2020.08333333333,
2020.16666666667, 2020.25, 2020.33333333333, 2019.91666666667,
2020, 2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333,
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25,
2020.33333333333), class = "yearmon"), count = c(16L, 14L, 22L,
13L, 17L, 33L, 35L, 36L, 20L, 21L, 20L, 52L, 57L, 60L, 52L, 56L,
49L, NA, 1L, 4L, 4L, 1L, 308L, 385L, 330L, 379L, 345L, 265L,
123L, 147L, 128L, 134L, 120L, 107L, 29L, 33L, 17L, 28L, 25L,
22L, 30L, 38L, 42L, 33L, 31L, 21L)), row.names = c(NA, -46L), groups = structure(list(
therapy_class = structure(1:9, .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), .rows = structure(list(
1:5, 6:11, 12:17, 18L, 19:22, 23:28, 29:34, 35:40, 41:46), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
这是包含月-季度数据的新数据集
df <- structure(list(therapy_class = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 4L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L,
7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), quarter = structure(c(2020.75,
2021, 2021.25, 2020.5, 2020.75, 2021.25, 2020.5, 2020.75, 2021,
2021.25, NA, NA, 2020.5, 2020.75, 2021, 2021.25, 2021.5, 2020.5,
2020.75, 2021, 2021.25, 2021.5, 2020.5, 2020.75, 2021, 2021.25,
2021.5, 2020.5, 2020.75, 2021, 2021.25, 2021.5), class = "yearqtr"),
count = c(4L, 2L, 2L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, NA, NA,
252L, 184L, 190L, 180L, 120L, 59L, 46L, 63L, 47L, 25L, 4L,
4L, 1L, 2L, 1L, 16L, 12L, 21L, 21L, 11L)), row.names = c(NA,
-32L), groups = structure(list(therapy_class = structure(1:9, .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), .rows = structure(list(
1:3, 4:6, 7:10, 11L, 12L, 13:17, 18:22, 23:27, 28:32), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
--
由于缺少数据,季度日期乱序
therapy_class `2020 Q4` `2021 Q1` `2021 Q2` `2020 Q3` `NA` `2021 Q3`
<ord> <chr> <chr> <chr> <chr> <int> <chr>
1 ALK Inhibitors "4 [1.58%]" "2 [0.72%]" "2 [0.78%]" "" NA ""
2 Anti-VEGF-based therapies "2 [0.79%]" "" "3 [1.17%]" "3 [0.90%]" NA ""
3 EGFR TKIs "1 [0.40%]" "1 [0.36%]" "1 [0.39%]" "1 [0.30%]" NA ""
4 EGFR-antibody based therapies "" "" "" "" NA ""
5 Non-platinum-based chemotherapy combinations "" "" "" "" NA ""
6 IO-based therapies "184 [72.73%]" "190 [68.35%]" "180 [70.31%]" "252 [75.22%]" NA "120 [76.43%]"
7 Platinum-based chemotherapy combinations "46 [18.18%]" "63 [22.66%]" "47 [18.36%]" "59 [17.61%]" NA "25 [15.92%]"
8 Single agent chemotherapies "4 [1.58%]" "1 [0.36%]" "2 [0.78%]" "4 [1.19%]" NA "1 [0.64%]"
9 Other "12 [4.74%]" "21 [7.55%]" "21 [8.20%]" "16 [4.78%]" NA "11 [7.01%]"
10 Total "253" "278" "256" "335" 0 "157"
一个选项是在执行 pivot_wider
之前用 complete
创建缺失的年月。使用pivot_wider
,默认顺序是基于顺序
中出现的唯一值
library(dplyr)
library(tidyr)
library(zoo)
df_2 %>%
ungroup %>%
mutate(Year_month = as.Date(Year_month)) %>%
complete(therapy_class, Year_month = seq(from = min(Year_month,
na.rm = TRUE), to = max(Year_month, na.rm = TRUE),
by = '1 month')) %>%
mutate(Year_month = as.yearmon(Year_month)) %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")),
list(
~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]",
100 * (. / sum(., na.rm = TRUE)))))
))
-输出
# A tibble: 9 × 8
therapy_class `Dec 2019` `Jan 2020` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020` `NA`
<ord> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 ALK Inhibitors "16 <br>[2.7%]" "" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%… "17 <br>[3.4… NA
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "35 <br>[5.0%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%… "20 <br>[4.0… NA
3 EGFR TKIs "52 <br>[8.8%]" "57 <br>[8.2%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%… "49 <br>[9.8… NA
4 EGFR-antibody based therapies "" "" "" "" "" "" NA
5 Non-platinum-based chemotherapy combinations "1 <br>[0.2%]" "4 <br>[0.6%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" "" NA
6 IO-based therapies "308 <br>[52.0%]" "385 <br>[55.1%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.… "265 <br>[52… NA
7 Platinum-based chemotherapy combinations "123 <br>[20.8%]" "147 <br>[21.0%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.… "107 <br>[21… NA
8 Single agent chemotherapies "29 <br>[4.9%]" "33 <br>[4.7%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%… "22 <br>[4.4… NA
9 Other "30 <br>[5.1%]" "38 <br>[5.4%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%… "21 <br>[4.2… NA
有了新数据
df %>%
ungroup %>%
mutate(Year_month = as.Date(quarter)) %>%
complete(therapy_class,
Year_month = seq(from = min(Year_month,na.rm = TRUE ),
to = max(Year_month, na.rm = TRUE), by = '1 quarter')) %>%
mutate(Year_month = as.yearqtr(Year_month)) %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")),
list( ~ ifelse(is.na(.), "",
paste(., sprintf("<br>[%1.1f%%]",
100 * (. / sum(., na.rm = TRUE))))) ))
-输出
# A tibble: 35 × 8
therapy_class quarter `2020 Q3` `2020 Q4` `2021 Q1` `2021 Q2` `2021 Q3` `NA`
<ord> <yearqtr> <chr> <chr> <chr> <chr> <chr> <int>
1 ALK Inhibitors <NA> "" "" "" "" "" NA
2 ALK Inhibitors 2020 Q4 "" "4 <br>[1.6%]" "" "" "" NA
3 ALK Inhibitors 2021 Q1 "" "" "2 <br>[0.7%]" "" "" NA
4 ALK Inhibitors 2021 Q2 "" "" "" "2 <br>[0.8%]" "" NA
5 Anti-VEGF-based therapies 2020 Q3 "3 <br>[0.9%]" "" "" "" "" NA
6 Anti-VEGF-based therapies 2020 Q4 "" "2 <br>[0.8%]" "" "" "" NA
7 Anti-VEGF-based therapies <NA> "" "" "" "" "" NA
8 Anti-VEGF-based therapies 2021 Q2 "" "" "" "3 <br>[1.2%]" "" NA
9 EGFR TKIs 2020 Q3 "1 <br>[0.3%]" "" "" "" "" NA
10 EGFR TKIs 2020 Q4 "" "1 <br>[0.4%]" "" "" "" NA
# … with 25 more rows
> df_1
# A tibble: 47 x 3
# Groups: therapy_class [9]
therapy_class Year_month count
<ord> <yearmon> <int>
1 ALK Inhibitors Dec 2019 16
2 ALK Inhibitors Jan 2020 14
3 ALK Inhibitors Feb 2020 14
4 ALK Inhibitors Mar 2020 22
5 ALK Inhibitors Apr 2020 13
6 ALK Inhibitors May 2020 17
7 Anti-VEGF-based therapies Dec 2019 33
8 Anti-VEGF-based therapies Jan 2020 35
9 Anti-VEGF-based therapies Feb 2020 36
10 Anti-VEGF-based therapies Mar 2020 20
# … with 37 more rows
A tibble: 10 x 7
therapy_class `Dec 2019` `Jan 2020` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020`
<ord> <chr> <chr> <chr> <chr> <chr> <chr>
1 ALK Inhibitors "16 <br>[2.7%]" "14 <br>[2.0%]" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%]" "17 <br>[3.4%]"
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "35 <br>[4.9%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%]" "20 <br>[4.0%]"
3 EGFR TKIs "52 <br>[8.8%]" "57 <br>[8.0%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%]" "49 <br>[9.8%]"
4 EGFR-antibody based therapies "" "" "" "" "" ""
5 Non-platinum-based chemotherapy… "1 <br>[0.2%]" "4 <br>[0.6%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" ""
6 IO-based therapies "308 <br>[52.0%]" "385 <br>[54.0%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]"
7 Platinum-based chemotherapy com… "123 <br>[20.8%]" "147 <br>[20.6%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]"
8 Single agent chemotherapies "29 <br>[4.9%]" "33 <br>[4.6%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%]" "22 <br>[4.4%]"
9 Other "30 <br>[5.1%]" "38 <br>[5.3%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%]" "21 <br>[4.2%]"
10 <strong>Total</strong> "<strong>592</str… "<strong>713</str… "<strong>631</st… "<strong>668</st… "<strong>612</st… "<strong>501</st…
> df_2
# A tibble: 46 x 3
# Groups: therapy_class [9]
therapy_class Year_month count
<ord> <yearmon> <int>
1 ALK Inhibitors Dec 2019 16
2 ALK Inhibitors Feb 2020 14
3 ALK Inhibitors Mar 2020 22
4 ALK Inhibitors Apr 2020 13
5 ALK Inhibitors May 2020 17
6 Anti-VEGF-based therapies Dec 2019 33
7 Anti-VEGF-based therapies Jan 2020 35
8 Anti-VEGF-based therapies Feb 2020 36
9 Anti-VEGF-based therapies Mar 2020 20
10 Anti-VEGF-based therapies Apr 2020 21
# … with 36 more rows
> t2
# A tibble: 10 x 7
therapy_class `Dec 2019` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020` `Jan 2020`
<ord> <chr> <chr> <chr> <chr> <chr> <chr>
1 ALK Inhibitors "16 <br>[2.7%]" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%]" "17 <br>[3.4%]" ""
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%]" "20 <br>[4.0%]" "35 <br>[5.0%]"
3 EGFR TKIs "52 <br>[8.8%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%]" "49 <br>[9.8%]" "57 <br>[8.2%]"
4 EGFR-antibody based therapies "" "" "" "" "" ""
5 Non-platinum-based chemotherapy… "1 <br>[0.2%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" "" "4 <br>[0.6%]"
6 IO-based therapies "308 <br>[52.0%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]" "385 <br>[55.1%]"
7 Platinum-based chemotherapy com… "123 <br>[20.8%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]" "147 <br>[21.0%]"
8 Single agent chemotherapies "29 <br>[4.9%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%]" "22 <br>[4.4%]" "33 <br>[4.7%]"
9 Other "30 <br>[5.1%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%]" "21 <br>[4.2%]" "38 <br>[5.4%]"
10 <strong>Total</strong> "<strong>592</str… "<strong>631</str… "<strong>668</st… "<strong>612</st… "<strong>501</st… "<strong>699</st…
>
我正在尝试创建一个宽 table,其中包含长 table 的计数和百分比。列是 'Month Year',需要按顺序排列。我的问题是,当第一组(ALK 抑制剂)的某些 'Month Year' 缺少行时,列的顺序就会被打乱。缺少的 'Month Year' 放在最后。另外,长 table 不是固定的 table。它是从用户可以选择月份年份范围的函数生成的。所以 Year_month 列可以有任何范围。
在这个例子中,我使用了 2019 年 12 月到 2020 年 5 月这 6 个月的范围。 “df_1”有 6 个月,所以结果宽 table 符合预期。 “df_2”缺少 ALK 抑制剂的 2020 年 1 月。所以结果 table 最后有 'Jan 2020'。
这是我正在使用的生成宽 table:
的代码df_2 %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")), list(
~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]", 100 * (. / sum(., na.rm = TRUE)))))
))
这里是示例数据df_2
structure(list(therapy_class = structure(c(1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), Year_month = structure(c(2019.91666666667,
2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333,
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25,
2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667,
2020.25, 2020.33333333333, NA, 2019.91666666667, 2020, 2020.08333333333,
2020.25, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667,
2020.25, 2020.33333333333, 2019.91666666667, 2020, 2020.08333333333,
2020.16666666667, 2020.25, 2020.33333333333, 2019.91666666667,
2020, 2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333,
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25,
2020.33333333333), class = "yearmon"), count = c(16L, 14L, 22L,
13L, 17L, 33L, 35L, 36L, 20L, 21L, 20L, 52L, 57L, 60L, 52L, 56L,
49L, NA, 1L, 4L, 4L, 1L, 308L, 385L, 330L, 379L, 345L, 265L,
123L, 147L, 128L, 134L, 120L, 107L, 29L, 33L, 17L, 28L, 25L,
22L, 30L, 38L, 42L, 33L, 31L, 21L)), row.names = c(NA, -46L), groups = structure(list(
therapy_class = structure(1:9, .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), .rows = structure(list(
1:5, 6:11, 12:17, 18L, 19:22, 23:28, 29:34, 35:40, 41:46), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
这是包含月-季度数据的新数据集
df <- structure(list(therapy_class = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 4L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L,
7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), quarter = structure(c(2020.75,
2021, 2021.25, 2020.5, 2020.75, 2021.25, 2020.5, 2020.75, 2021,
2021.25, NA, NA, 2020.5, 2020.75, 2021, 2021.25, 2021.5, 2020.5,
2020.75, 2021, 2021.25, 2021.5, 2020.5, 2020.75, 2021, 2021.25,
2021.5, 2020.5, 2020.75, 2021, 2021.25, 2021.5), class = "yearqtr"),
count = c(4L, 2L, 2L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, NA, NA,
252L, 184L, 190L, 180L, 120L, 59L, 46L, 63L, 47L, 25L, 4L,
4L, 1L, 2L, 1L, 16L, 12L, 21L, 21L, 11L)), row.names = c(NA,
-32L), groups = structure(list(therapy_class = structure(1:9, .Label = c("ALK Inhibitors",
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies",
"Non-platinum-based chemotherapy combinations", "IO-based therapies",
"Platinum-based chemotherapy combinations", "Single agent chemotherapies",
"Other"), class = c("ordered", "factor")), .rows = structure(list(
1:3, 4:6, 7:10, 11L, 12L, 13:17, 18:22, 23:27, 28:32), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
-- 由于缺少数据,季度日期乱序
therapy_class `2020 Q4` `2021 Q1` `2021 Q2` `2020 Q3` `NA` `2021 Q3`
<ord> <chr> <chr> <chr> <chr> <int> <chr>
1 ALK Inhibitors "4 [1.58%]" "2 [0.72%]" "2 [0.78%]" "" NA ""
2 Anti-VEGF-based therapies "2 [0.79%]" "" "3 [1.17%]" "3 [0.90%]" NA ""
3 EGFR TKIs "1 [0.40%]" "1 [0.36%]" "1 [0.39%]" "1 [0.30%]" NA ""
4 EGFR-antibody based therapies "" "" "" "" NA ""
5 Non-platinum-based chemotherapy combinations "" "" "" "" NA ""
6 IO-based therapies "184 [72.73%]" "190 [68.35%]" "180 [70.31%]" "252 [75.22%]" NA "120 [76.43%]"
7 Platinum-based chemotherapy combinations "46 [18.18%]" "63 [22.66%]" "47 [18.36%]" "59 [17.61%]" NA "25 [15.92%]"
8 Single agent chemotherapies "4 [1.58%]" "1 [0.36%]" "2 [0.78%]" "4 [1.19%]" NA "1 [0.64%]"
9 Other "12 [4.74%]" "21 [7.55%]" "21 [8.20%]" "16 [4.78%]" NA "11 [7.01%]"
10 Total "253" "278" "256" "335" 0 "157"
一个选项是在执行 pivot_wider
之前用 complete
创建缺失的年月。使用pivot_wider
,默认顺序是基于顺序
library(dplyr)
library(tidyr)
library(zoo)
df_2 %>%
ungroup %>%
mutate(Year_month = as.Date(Year_month)) %>%
complete(therapy_class, Year_month = seq(from = min(Year_month,
na.rm = TRUE), to = max(Year_month, na.rm = TRUE),
by = '1 month')) %>%
mutate(Year_month = as.yearmon(Year_month)) %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")),
list(
~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]",
100 * (. / sum(., na.rm = TRUE)))))
))
-输出
# A tibble: 9 × 8
therapy_class `Dec 2019` `Jan 2020` `Feb 2020` `Mar 2020` `Apr 2020` `May 2020` `NA`
<ord> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 ALK Inhibitors "16 <br>[2.7%]" "" "14 <br>[2.2%]" "22 <br>[3.3%]" "13 <br>[2.1%… "17 <br>[3.4… NA
2 Anti-VEGF-based therapies "33 <br>[5.6%]" "35 <br>[5.0%]" "36 <br>[5.7%]" "20 <br>[3.0%]" "21 <br>[3.4%… "20 <br>[4.0… NA
3 EGFR TKIs "52 <br>[8.8%]" "57 <br>[8.2%]" "60 <br>[9.5%]" "52 <br>[7.8%]" "56 <br>[9.2%… "49 <br>[9.8… NA
4 EGFR-antibody based therapies "" "" "" "" "" "" NA
5 Non-platinum-based chemotherapy combinations "1 <br>[0.2%]" "4 <br>[0.6%]" "4 <br>[0.6%]" "" "1 <br>[0.2%]" "" NA
6 IO-based therapies "308 <br>[52.0%]" "385 <br>[55.1%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.… "265 <br>[52… NA
7 Platinum-based chemotherapy combinations "123 <br>[20.8%]" "147 <br>[21.0%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.… "107 <br>[21… NA
8 Single agent chemotherapies "29 <br>[4.9%]" "33 <br>[4.7%]" "17 <br>[2.7%]" "28 <br>[4.2%]" "25 <br>[4.1%… "22 <br>[4.4… NA
9 Other "30 <br>[5.1%]" "38 <br>[5.4%]" "42 <br>[6.7%]" "33 <br>[4.9%]" "31 <br>[5.1%… "21 <br>[4.2… NA
有了新数据
df %>%
ungroup %>%
mutate(Year_month = as.Date(quarter)) %>%
complete(therapy_class,
Year_month = seq(from = min(Year_month,na.rm = TRUE ),
to = max(Year_month, na.rm = TRUE), by = '1 quarter')) %>%
mutate(Year_month = as.yearqtr(Year_month)) %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")),
list( ~ ifelse(is.na(.), "",
paste(., sprintf("<br>[%1.1f%%]",
100 * (. / sum(., na.rm = TRUE))))) ))
-输出
# A tibble: 35 × 8
therapy_class quarter `2020 Q3` `2020 Q4` `2021 Q1` `2021 Q2` `2021 Q3` `NA`
<ord> <yearqtr> <chr> <chr> <chr> <chr> <chr> <int>
1 ALK Inhibitors <NA> "" "" "" "" "" NA
2 ALK Inhibitors 2020 Q4 "" "4 <br>[1.6%]" "" "" "" NA
3 ALK Inhibitors 2021 Q1 "" "" "2 <br>[0.7%]" "" "" NA
4 ALK Inhibitors 2021 Q2 "" "" "" "2 <br>[0.8%]" "" NA
5 Anti-VEGF-based therapies 2020 Q3 "3 <br>[0.9%]" "" "" "" "" NA
6 Anti-VEGF-based therapies 2020 Q4 "" "2 <br>[0.8%]" "" "" "" NA
7 Anti-VEGF-based therapies <NA> "" "" "" "" "" NA
8 Anti-VEGF-based therapies 2021 Q2 "" "" "" "3 <br>[1.2%]" "" NA
9 EGFR TKIs 2020 Q3 "1 <br>[0.3%]" "" "" "" "" NA
10 EGFR TKIs 2020 Q4 "" "1 <br>[0.4%]" "" "" "" NA
# … with 25 more rows