为什么我在 R 中使用 case_when 的新列中没有得到我想要的类别?

Why do I not get my desired category in the new columns with case_when, in R?

我正在尝试创建一个新列 first_attack,其中包含基于 3 列的多个条件。我正在尝试仅使用 tidyverse 来实现这一目标。这是我的条件:

这是我拥有的数据集类型:

structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L, 
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L, 
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L, 
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L, 
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L, 
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L, 
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L, 
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L, 
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L, 
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L, 
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L, 
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L, 
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes", 
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
    row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier", 
    "1.01 Hospital identifier", "1.01 Hospital identifier"), 
    expected = c("value in level set", "value in level set", 
    "value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'", 
    "'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
    )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))

我认为最好的办法是使用 case_when,于是我用比较笨拙的方式写了下面的代码。

  # Asker's original attempt, kept verbatim: it does NOT produce the intended result.
  # case_when() gives each row the value of the FIRST formula whose condition is TRUE;
  # rows for which no condition is TRUE get NA.
  # Every condition below joins its three comparisons with `|` (OR), so one matching
  # comparison is enough. For example, any row with previous_cabg == 'No' already
  # satisfies the first formula and becomes 'Yes', whatever the other two columns hold.
  # Many formulas are exact duplicates of earlier ones and can never be selected.
  # NOTE(review): `sample_n` is presumably the asker's data frame; the name also matches
  # the dplyr::sample_n() function - confirm.
  test_first_attack <- sample_n %>%
      dplyr::mutate(first_attack = 
                      dplyr::case_when(  
                                        previous_cabg == 'No'  | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'No'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Yes' | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes', 
                                        previous_cabg == 'No'  | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'No'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Yes' | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes',
                                        
                                        # deal with the unknown category
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown'  | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~  'Yes', 
                                        previous_cabg == 'Unknown'  | previous_pci == 'Unknown'  | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~  'Yes', 
                                        
                                        
                                        previous_cabg == 'Yes' |  previous_pci == 'No'  | previous_ami == 'Yes' ~  'Yes', 
                                        previous_cabg == 'Yes' |  previous_pci == 'No'  | previous_ami == 'No'  ~  'Yes',
                                        previous_cabg == 'No'  |  previous_pci == 'No'  | previous_ami == 'Yes' ~  'Yes',
                                        previous_cabg == 'No'  | previous_pci == 'Yes'  | previous_ami == 'No'  ~ 'Yes', 
                                        
                                        
                                        previous_cabg == 'Yes' |  previous_pci == 'Unknown'   | previous_ami == 'Yes' ~  'Yes', 
                                        previous_cabg == 'Yes' |  previous_pci == 'Unknown'   | previous_ami == 'Unknown'   ~  'Yes',
                                        previous_cabg == 'Unknown'   |  previous_pci == 'Unknown'   | previous_ami == 'Yes' ~  'Yes',
                                        previous_cabg == 'Unknown'   | previous_pci == 'Yes'  | previous_ami == 'Unknown'   ~ 'Yes', 
                                        
                                        
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes', 
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                        
                                        previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes', 
                                        previous_cabg == 'No'  | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'No'  | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                        
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes', 
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                        # The only formula that yields 'No'. It can never be selected: each of its
                                        # three comparisons already appears in an earlier formula that yields 'Yes'.
                                        previous_cabg == 'No'  | previous_pci == 'No'  |  previous_ami == 'No' ~ 'No',
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' |  previous_ami == 'Yes' ~'Yes'
                                        
                                        
                                        
                                        
                      ))

这是输出。

 test_first_attack <-  structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L, 
    66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L, 
    17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L, 
    59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L, 
    2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L, 
    46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L, 
    32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L, 
    23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L, 
    1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L, 
    1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L, 
    1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
    2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 
    2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
    "Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L, 
    2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 
    1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L, 
    2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes", 
    "No", "Unknown"), class = "factor"), first_attack = c("Yes", 
    "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes", 
    "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", 
    NA, "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes", NA, "Yes", 
    "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes", 
    "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", 
    "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes"
    )), row.names = c(NA, -60L), problems = structure(list(row = c(34136L, 
    121773L, 121779L), col = c("1.01 Hospital identifier", "1.01 Hospital identifier", 
    "1.01 Hospital identifier"), expected = c("value in level set", 
    "value in level set", "value in level set"), actual = c("CMH", 
    "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'", 
    "'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
    )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
    )), class = c("tbl_df", "tbl", "data.frame"))

如您所见,所有 id 的行都得到了 'Yes',就连本应为 'No' 的行也是如此。更不用说处理更复杂的条件了,例如 'Unknown\NA\Yes' 等其他组合。

为什么我得不到我想要的输出?是否有另一种方法可以仅使用 tidyverse 来实现它?

如果你尝试“ifelse”会怎么样?

# Label a row "Yes" when any of the three history columns equals "Yes",
# otherwise "No".
# NOTE: if no column is "Yes" but at least one is NA, the condition evaluates
# to NA and ifelse() returns NA for that row.
test <- test %>%
  mutate(
    first_attack = ifelse(
      previous_cabg == "Yes" | previous_pci == "Yes" | previous_ami == "Yes",
      "Yes",
      "No"
    )
  )

我想你只需要:

# Formulas are tried top to bottom; each row takes the value of the first
# formula whose condition is TRUE. Use this inside mutate().
case_when(
  previous_cabg == "Yes" | previous_pci == "Yes" | previous_ami == "Yes" ~ "Yes",
  previous_cabg == "No" | previous_pci == "No" | previous_ami == "No" ~ "No",
  TRUE ~ "Unknown/NA"
)

对于三个测试列中至少有一个为 'Yes' 的每一行,第一行条件会返回 'Yes'。满足 case_when 第一行的行将不再测试任何后续条件。

在剩余的行中,只要三个测试列中任意一个为 'No',第二行条件就会返回 'No'。满足 case_when 第二行的行将不再测试任何后续条件。

最后一行将 "Unknown/NA" 分配给所有剩余的行,这应该只是那些在三个测试列中既没有 'Yes' 也没有 'No' 值的行。

为了完整起见,这里有一个使用您的示例数据的完整可重现示例:

library(tidyverse)

# Example data from the question (dput() output), re-indented.
# 60 rows with an integer `id` and three factor columns. Note that the factor
# level order differs per column:
#   previous_cabg: "No", "Unknown", "Yes"
#   previous_pci : "No", "Yes", "Unknown"
#   previous_ami : "Yes", "No", "Unknown"
# The `problems` attribute is a 3-row table carried along with the data
# (presumably left over from reading the CSV - confirm); it is not used below.
# Fix: the second `file` string had lost its opening `"'../../`, which left an
# unbalanced quote and made the whole example unparseable.
test_first_attack <- structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
    row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier",
    "1.01 Hospital identifier", "1.01 Hospital identifier"),
    expected = c("value in level set", "value in level set",
    "value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
    "'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
    )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))



# Add `first_attack`, derived from the three history columns. case_when()
# evaluates the formulas in order and each row takes the first one that is TRUE:
#   1. any column is "Yes"            -> "Yes"
#   2. otherwise, any column is "No"  -> "No"
#   3. everything else                -> "Unknown/NA"
test_first_attack <- test_first_attack %>%
  mutate(
    first_attack = case_when(
      previous_cabg == "Yes" | previous_pci == "Yes" |
        previous_ami == "Yes" ~ "Yes",
      previous_cabg == "No" | previous_pci == "No" |
        previous_ami == "No" ~ "No",
      TRUE ~ "Unknown/NA"
    )
  )

test_first_attack
#> # A tibble: 60 x 5
#>        id previous_cabg previous_pci previous_ami first_attack
#>     <int> <fct>         <fct>        <fct>        <chr>       
#>  1 112139 No            No           No           No          
#>  2  43919 No            No           No           No          
#>  3  92430 No            Yes          Yes          Yes         
#>  4  87137 <NA>          <NA>         No           No          
#>  5  95417 No            No           No           No          
#>  6  66955 <NA>          <NA>         <NA>         Unknown/NA  
#>  7  16293 <NA>          <NA>         No           No          
#>  8  61396 No            Yes          Yes          Yes         
#>  9  25379 No            Yes          No           Yes         
#> 10  79229 No            No           No           No          
#> # … with 50 more rows