即使缺少数据点，如何以特定顺序（月年）显示宽 table？

Question

    > df_1
    # A tibble: 47 x 3
    # Groups:   therapy_class [9]
       therapy_class             Year_month count
       <ord>                     <yearmon>  <int>
     1 ALK Inhibitors            Dec 2019      16
     2 ALK Inhibitors            Jan 2020      14
     3 ALK Inhibitors            Feb 2020      14
     4 ALK Inhibitors            Mar 2020      22
     5 ALK Inhibitors            Apr 2020      13
     6 ALK Inhibitors            May 2020      17
     7 Anti-VEGF-based therapies Dec 2019      33
     8 Anti-VEGF-based therapies Jan 2020      35
     9 Anti-VEGF-based therapies Feb 2020      36
    10 Anti-VEGF-based therapies Mar 2020      20
    # … with 37 more rows



    A tibble: 10 x 7
       therapy_class                    `Dec 2019`         `Jan 2020`         `Feb 2020`        `Mar 2020`        `Apr 2020`        `May 2020`       
       <ord>                            <chr>              <chr>              <chr>             <chr>             <chr>             <chr>            
     1 ALK Inhibitors                   "16 <br>[2.7%]"    "14 <br>[2.0%]"    "14 <br>[2.2%]"   "22 <br>[3.3%]"   "13 <br>[2.1%]"   "17 <br>[3.4%]"  
     2 Anti-VEGF-based therapies        "33 <br>[5.6%]"    "35 <br>[4.9%]"    "36 <br>[5.7%]"   "20 <br>[3.0%]"   "21 <br>[3.4%]"   "20 <br>[4.0%]"  
     3 EGFR TKIs                        "52 <br>[8.8%]"    "57 <br>[8.0%]"    "60 <br>[9.5%]"   "52 <br>[7.8%]"   "56 <br>[9.2%]"   "49 <br>[9.8%]"  
     4 EGFR-antibody based therapies    ""                 ""                 ""                ""                ""                ""               
     5 Non-platinum-based chemotherapy… "1 <br>[0.2%]"     "4 <br>[0.6%]"     "4 <br>[0.6%]"    ""                "1 <br>[0.2%]"    ""               
     6 IO-based therapies               "308 <br>[52.0%]"  "385 <br>[54.0%]"  "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]"
     7 Platinum-based chemotherapy com… "123 <br>[20.8%]"  "147 <br>[20.6%]"  "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]"
     8 Single agent chemotherapies      "29 <br>[4.9%]"    "33 <br>[4.6%]"    "17 <br>[2.7%]"   "28 <br>[4.2%]"   "25 <br>[4.1%]"   "22 <br>[4.4%]"  
     9 Other                            "30 <br>[5.1%]"    "38 <br>[5.3%]"    "42 <br>[6.7%]"   "33 <br>[4.9%]"   "31 <br>[5.1%]"   "21 <br>[4.2%]"  
    10 <strong>Total</strong>           "<strong>592</str… "<strong>713</str… "<strong>631</st… "<strong>668</st… "<strong>612</st… "<strong>501</st…


    > df_2
    # A tibble: 46 x 3
    # Groups:   therapy_class [9]
       therapy_class             Year_month count
       <ord>                     <yearmon>  <int>
     1 ALK Inhibitors            Dec 2019      16
     2 ALK Inhibitors            Feb 2020      14
     3 ALK Inhibitors            Mar 2020      22
     4 ALK Inhibitors            Apr 2020      13
     5 ALK Inhibitors            May 2020      17
     6 Anti-VEGF-based therapies Dec 2019      33
     7 Anti-VEGF-based therapies Jan 2020      35
     8 Anti-VEGF-based therapies Feb 2020      36
     9 Anti-VEGF-based therapies Mar 2020      20
    10 Anti-VEGF-based therapies Apr 2020      21
    # … with 36 more rows

> t2
# A tibble: 10 x 7
   therapy_class                    `Dec 2019`         `Feb 2020`         `Mar 2020`        `Apr 2020`        `May 2020`        `Jan 2020`       
   <ord>                            <chr>              <chr>              <chr>             <chr>             <chr>             <chr>            
 1 ALK Inhibitors                   "16 <br>[2.7%]"    "14 <br>[2.2%]"    "22 <br>[3.3%]"   "13 <br>[2.1%]"   "17 <br>[3.4%]"   ""               
 2 Anti-VEGF-based therapies        "33 <br>[5.6%]"    "36 <br>[5.7%]"    "20 <br>[3.0%]"   "21 <br>[3.4%]"   "20 <br>[4.0%]"   "35 <br>[5.0%]"  
 3 EGFR TKIs                        "52 <br>[8.8%]"    "60 <br>[9.5%]"    "52 <br>[7.8%]"   "56 <br>[9.2%]"   "49 <br>[9.8%]"   "57 <br>[8.2%]"  
 4 EGFR-antibody based therapies    ""                 ""                 ""                ""                ""                ""               
 5 Non-platinum-based chemotherapy… "1 <br>[0.2%]"     "4 <br>[0.6%]"     ""                "1 <br>[0.2%]"    ""                "4 <br>[0.6%]"   
 6 IO-based therapies               "308 <br>[52.0%]"  "330 <br>[52.3%]"  "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]" "385 <br>[55.1%]"
 7 Platinum-based chemotherapy com… "123 <br>[20.8%]"  "128 <br>[20.3%]"  "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]" "147 <br>[21.0%]"
 8 Single agent chemotherapies      "29 <br>[4.9%]"    "17 <br>[2.7%]"    "28 <br>[4.2%]"   "25 <br>[4.1%]"   "22 <br>[4.4%]"   "33 <br>[4.7%]"  
 9 Other                            "30 <br>[5.1%]"    "42 <br>[6.7%]"    "33 <br>[4.9%]"   "31 <br>[5.1%]"   "21 <br>[4.2%]"   "38 <br>[5.4%]"  
10 <strong>Total</strong>           "<strong>592</str… "<strong>631</str… "<strong>668</st… "<strong>612</st… "<strong>501</st… "<strong>699</st…
>

我正在尝试创建一个宽 table，其中包含长 table 的计数和百分比。列是 'Month Year'，需要按顺序排列。我的问题是，当第一组（ALK 抑制剂）的某些 'Month Year' 缺少行时，列的顺序就会被打乱。缺少的 'Month Year' 放在最后。另外，长 table 不是固定的 table。它是从用户可以选择月份年份范围的函数生成的。所以 Year_month 列可以有任何范围。

在这个例子中，我使用了 2019 年 12 月到 2020 年 5 月这 6 个月的范围。 “df_1”有 6 个月，所以结果宽 table 符合预期。 “df_2”缺少 ALK 抑制剂的 2020 年 1 月。所以结果 table 最后有 'Jan 2020'。

这是我用来生成宽 table 的代码：

# Widen the long counts to one column per Year_month, then rewrite every
# month column as "count <br>[pct%]" text, with "" where a count is missing.
# New columns are created in the order the Year_month values first appear
# in df_2, which is why a month missing from the first group ends up last.
df_2 %>%
  pivot_wider(names_from = Year_month, values_from = count) %>%
  ungroup() %>%
  mutate_at(
    vars(contains("20")),
    function(n) {
      ifelse(
        is.na(n),
        "",
        paste(n, sprintf("<br>[%1.1f%%]", 100 * (n / sum(n, na.rm = TRUE))))
      )
    }
  )

这里是示例数据df_2

# dput() of the sample data df_2: a 46-row tibble grouped by therapy_class
# (ordered factor, 9 levels) with a "yearmon" Year_month column and an
# integer count. ALK Inhibitors (level 1) has no entry for 2020 (Jan 2020),
# and the single EGFR-antibody row (level 4) is NA in Year_month and count.
structure(list(therapy_class = structure(c(1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L, 
5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors", 
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
"Non-platinum-based chemotherapy combinations", "IO-based therapies", 
"Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
"Other"), class = c("ordered", "factor")), Year_month = structure(c(2019.91666666667, 
2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333, 
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25, 
2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 
2020.25, 2020.33333333333, NA, 2019.91666666667, 2020, 2020.08333333333, 
2020.25, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 
2020.25, 2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 
2020.16666666667, 2020.25, 2020.33333333333, 2019.91666666667, 
2020, 2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333, 
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25, 
2020.33333333333), class = "yearmon"), count = c(16L, 14L, 22L, 
13L, 17L, 33L, 35L, 36L, 20L, 21L, 20L, 52L, 57L, 60L, 52L, 56L, 
49L, NA, 1L, 4L, 4L, 1L, 308L, 385L, 330L, 379L, 345L, 265L, 
123L, 147L, 128L, 134L, 120L, 107L, 29L, 33L, 17L, 28L, 25L, 
22L, 30L, 38L, 42L, 33L, 31L, 21L)), row.names = c(NA, -46L), groups = structure(list(
    therapy_class = structure(1:9, .Label = c("ALK Inhibitors", 
    "Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
    "Non-platinum-based chemotherapy combinations", "IO-based therapies", 
    "Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
    "Other"), class = c("ordered", "factor")), .rows = structure(list(
        1:5, 6:11, 12:17, 18L, 19:22, 23:28, 29:34, 35:40, 41:46), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

这是包含月-季度数据的新数据集

# Quarterly sample data (dput() output): a 32-row tibble grouped by
# therapy_class (ordered factor, 9 levels) with a "yearqtr" quarter column
# and an integer count. Levels 4 and 5 each have a single row whose quarter
# and count are both NA, and levels 1-3 lack one or more quarters.
df <- structure(list(therapy_class = structure(c(1L, 1L, 1L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 4L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 
7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors", 
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
"Non-platinum-based chemotherapy combinations", "IO-based therapies", 
"Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
"Other"), class = c("ordered", "factor")), quarter = structure(c(2020.75, 
2021, 2021.25, 2020.5, 2020.75, 2021.25, 2020.5, 2020.75, 2021, 
2021.25, NA, NA, 2020.5, 2020.75, 2021, 2021.25, 2021.5, 2020.5, 
2020.75, 2021, 2021.25, 2021.5, 2020.5, 2020.75, 2021, 2021.25, 
2021.5, 2020.5, 2020.75, 2021, 2021.25, 2021.5), class = "yearqtr"), 
    count = c(4L, 2L, 2L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, NA, NA, 
    252L, 184L, 190L, 180L, 120L, 59L, 46L, 63L, 47L, 25L, 4L, 
    4L, 1L, 2L, 1L, 16L, 12L, 21L, 21L, 11L)), row.names = c(NA, 
-32L), groups = structure(list(therapy_class = structure(1:9, .Label = c("ALK Inhibitors", 
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
"Non-platinum-based chemotherapy combinations", "IO-based therapies", 
"Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
"Other"), class = c("ordered", "factor")), .rows = structure(list(
    1:3, 4:6, 7:10, 11L, 12L, 13:17, 18:22, 23:27, 28:32), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

-- 由于缺少数据，季度日期乱序

therapy_class                                `2020 Q4`      `2021 Q1`      `2021 Q2`      `2020 Q3`       `NA` `2021 Q3`     
   <ord>                                        <chr>          <chr>          <chr>          <chr>          <int> <chr>         
 1 ALK Inhibitors                               "4 [1.58%]"    "2 [0.72%]"    "2 [0.78%]"    ""                NA ""            
 2 Anti-VEGF-based therapies                    "2 [0.79%]"    ""             "3 [1.17%]"    "3 [0.90%]"       NA ""            
 3 EGFR TKIs                                    "1 [0.40%]"    "1 [0.36%]"    "1 [0.39%]"    "1 [0.30%]"       NA ""            
 4 EGFR-antibody based therapies                ""             ""             ""             ""                NA ""            
 5 Non-platinum-based chemotherapy combinations ""             ""             ""             ""                NA ""            
 6 IO-based therapies                           "184 [72.73%]" "190 [68.35%]" "180 [70.31%]" "252 [75.22%]"    NA "120 [76.43%]"
 7 Platinum-based chemotherapy combinations     "46 [18.18%]"  "63 [22.66%]"  "47 [18.36%]"  "59 [17.61%]"     NA "25 [15.92%]" 
 8 Single agent chemotherapies                  "4 [1.58%]"    "1 [0.36%]"    "2 [0.78%]"    "4 [1.19%]"       NA "1 [0.64%]"   
 9 Other                                        "12 [4.74%]"   "21 [7.55%]"   "21 [8.20%]"   "16 [4.78%]"      NA "11 [7.01%]"  
10 Total                                        "253"          "278"          "256"          "335"              0 "157"

Answer 1

一种做法是在执行 pivot_wider 之前，先用 complete 补全缺失的年月。pivot_wider 默认按 names_from 列中各唯一值首次出现的顺序来排列新列。

library(dplyr)
library(tidyr)
library(zoo)
# Fill in every therapy_class x month combination before widening, so the
# month columns come out in calendar order even when a class lacks a month.
# Rows whose Year_month is NA are kept and surface as a trailing `NA` column.
df_2 %>%
  ungroup() %>%
  # yearmon -> Date so that seq() can step through the range month by month
  mutate(Year_month = as.Date(Year_month)) %>%
  complete(
    therapy_class,
    Year_month = seq(
      from = min(Year_month, na.rm = TRUE),
      to = max(Year_month, na.rm = TRUE),
      by = "1 month"
    )
  ) %>%
  # back to yearmon so the new column names read "Dec 2019", "Jan 2020", ...
  mutate(Year_month = as.yearmon(Year_month)) %>%
  pivot_wider(names_from = Year_month, values_from = count) %>%
  ungroup() %>%
  # "count <br>[pct%]" per month column; "" where there is no count
  mutate_at(
    vars(contains("20")),
    function(n) {
      ifelse(
        is.na(n),
        "",
        paste(n, sprintf("<br>[%1.1f%%]", 100 * (n / sum(n, na.rm = TRUE))))
      )
    }
  )

-输出

# A tibble: 9 × 8
  therapy_class                                `Dec 2019`        `Jan 2020`        `Feb 2020`        `Mar 2020`        `Apr 2020`     `May 2020`     `NA`
  <ord>                                        <chr>             <chr>             <chr>             <chr>             <chr>          <chr>         <int>
1 ALK Inhibitors                               "16 <br>[2.7%]"   ""                "14 <br>[2.2%]"   "22 <br>[3.3%]"   "13 <br>[2.1%… "17 <br>[3.4…    NA
2 Anti-VEGF-based therapies                    "33 <br>[5.6%]"   "35 <br>[5.0%]"   "36 <br>[5.7%]"   "20 <br>[3.0%]"   "21 <br>[3.4%… "20 <br>[4.0…    NA
3 EGFR TKIs                                    "52 <br>[8.8%]"   "57 <br>[8.2%]"   "60 <br>[9.5%]"   "52 <br>[7.8%]"   "56 <br>[9.2%… "49 <br>[9.8…    NA
4 EGFR-antibody based therapies                ""                ""                ""                ""                ""             ""               NA
5 Non-platinum-based chemotherapy combinations "1 <br>[0.2%]"    "4 <br>[0.6%]"    "4 <br>[0.6%]"    ""                "1 <br>[0.2%]" ""               NA
6 IO-based therapies                           "308 <br>[52.0%]" "385 <br>[55.1%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.… "265 <br>[52…    NA
7 Platinum-based chemotherapy combinations     "123 <br>[20.8%]" "147 <br>[21.0%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.… "107 <br>[21…    NA
8 Single agent chemotherapies                  "29 <br>[4.9%]"   "33 <br>[4.7%]"   "17 <br>[2.7%]"   "28 <br>[4.2%]"   "25 <br>[4.1%… "22 <br>[4.4…    NA
9 Other                                        "30 <br>[5.1%]"   "38 <br>[5.4%]"   "42 <br>[6.7%]"   "33 <br>[4.9%]"   "31 <br>[5.1%… "21 <br>[4.2…    NA

对新的（季度）数据做同样的处理：

# Build the wide quarterly table with its columns in chronological order.
# `quarter` is converted in place (yearqtr -> Date -> yearqtr) instead of
# being copied into a second column: a leftover `quarter` column would be
# treated by pivot_wider() as an id column and give one row per
# therapy_class/quarter pair rather than one row per therapy_class.
# Rows whose quarter is NA are kept and surface as a trailing `NA` column.
df %>%
  ungroup() %>%
  mutate(quarter = as.Date(quarter)) %>%
  # Add a row (count = NA) for every therapy_class x quarter combination in
  # the observed range, so each quarter first appears in calendar order.
  complete(
    therapy_class,
    quarter = seq(
      from = min(quarter, na.rm = TRUE),
      to = max(quarter, na.rm = TRUE),
      by = "1 quarter"
    )
  ) %>%
  mutate(quarter = as.yearqtr(quarter)) %>%
  pivot_wider(names_from = quarter, values_from = count) %>%
  # "count <br>[pct%]" per quarter column; "" where there is no count
  mutate_at(
    .vars = vars(contains("20")),
    list(
      ~ ifelse(
        is.na(.),
        "",
        paste(., sprintf("<br>[%1.1f%%]", 100 * (. / sum(., na.rm = TRUE))))
      )
    )
  )

-输出（这是未移除原始 `quarter` 列时得到的结果：`quarter` 被 pivot_wider 当作标识列保留，因此结果有 35 行，而不是每个 therapy_class 一行）

# A tibble: 35 × 8
   therapy_class             quarter   `2020 Q3`      `2020 Q4`      `2021 Q1`      `2021 Q2`      `2021 Q3`  `NA`
   <ord>                     <yearqtr> <chr>          <chr>          <chr>          <chr>          <chr>     <int>
 1 ALK Inhibitors            <NA>      ""             ""             ""             ""             ""           NA
 2 ALK Inhibitors            2020 Q4   ""             "4 <br>[1.6%]" ""             ""             ""           NA
 3 ALK Inhibitors            2021 Q1   ""             ""             "2 <br>[0.7%]" ""             ""           NA
 4 ALK Inhibitors            2021 Q2   ""             ""             ""             "2 <br>[0.8%]" ""           NA
 5 Anti-VEGF-based therapies 2020 Q3   "3 <br>[0.9%]" ""             ""             ""             ""           NA
 6 Anti-VEGF-based therapies 2020 Q4   ""             "2 <br>[0.8%]" ""             ""             ""           NA
 7 Anti-VEGF-based therapies <NA>      ""             ""             ""             ""             ""           NA
 8 Anti-VEGF-based therapies 2021 Q2   ""             ""             ""             "3 <br>[1.2%]" ""           NA
 9 EGFR TKIs                 2020 Q3   "1 <br>[0.3%]" ""             ""             ""             ""           NA
10 EGFR TKIs                 2020 Q4   ""             "1 <br>[0.4%]" ""             ""             ""           NA
# … with 25 more rows

即使缺少数据点，如何以特定顺序（月年）显示宽 table？

How to display wide table with specific order (month year) even when data points are missing?

r

longtable

missing-data

dplyr

yearmonth