仅使用 dplyr 跨 3 列应用条件时如何填充是和否?

How to populate Yes and No when applying conditions across 3 columns using only dplyr?

我正在尝试创建一个新列(比如叫 test),其取值由基于 3 列的多个条件决定。我希望仅使用 tidyverse 来实现这一目标。这是我的条件:

这是我拥有的数据集类型:

structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L, 
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L, 
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L, 
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L, 
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L, 
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L, 
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L, 
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L, 
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L, 
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L, 
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L, 
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L, 
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes", 
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
    row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier", 
    "1.01 Hospital identifier", "1.01 Hospital identifier"), 
    expected = c("value in level set", "value in level set", 
    "value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'", 
    "'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
    )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))

这是它的样子,但只有前 10 行,如果你仔细看,我在 3 列中有不同的匹配组

# A tibble: 60 x 4
       id previous_cabg previous_pci previous_ami
    <int> <fct>         <fct>        <fct>       
 1 112139 No            No           No          
 2  43919 No            No           No          
 3  92430 No            Yes          Yes         
 4  87137 NA            NA           No          
 5  95417 No            No           No          
 6  66955 NA            NA           NA          
 7  16293 NA            NA           No          
 8  61396 No            Yes          Yes         
 9  25379 No            Yes          No          
10  79229 No            No           No        

我希望仅使用 tidyverse 或混合使用 tidyverse 和 r base 来解决这个问题。

这是我试过的方法,但感觉不太明智。之所以觉得不明智,是因为这段代码将成为自动化流程的一部分:如果下一次提取的数据集中出现了 Yes 和 No 之外的其他类别(比如 Unknown),我希望代码能把我上面给出的条件之外的所有其他情况排除在外。

# Asker's attempt: derive `first_attack` from previous_cabg, previous_pci and
# previous_ami with one case_when() clause per combination of values.
# NOTE(review): every clause joins its three comparisons with `|` (elementwise
# OR), so a clause matches as soon as any single comparison is TRUE. The very
# first clause therefore already matches every row where previous_cabg is 'No',
# previous_pci is 'No' or previous_ami is 'Yes'.
dplyr::mutate(first_attack = 
                  dplyr::case_when(previous_cabg == 'No'  | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'No'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Yes' | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes', 
                                   previous_cabg == 'No'  | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'No'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Yes' | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes'
                                   
                                   # NOTE(review): the clause directly above has no trailing comma, so the
                                   # call does not parse as written.
                                   # deal with the unknown category
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown'  | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes', 
                                   previous_cabg == 'Unknown'  | previous_pci == 'Unknown'  | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~  'Yes', 
                                   
                                   
                                   previous_cabg == 'Yes' |  previous_pci == 'No'  | previous_ami == 'Yes' ~  'Yes', 
                                   previous_cabg == 'Yes' |  previous_pci == 'No'  | previous_ami == 'No'  ~  'Yes',
                                   previous_cabg == 'No'  |  previous_pci == 'No'  | previous_ami == 'Yes' ~  'Yes',
                                   previous_cabg == 'No'  | previous_pci == 'Yes'  | previous_ami == 'No'  ~ 'Yes', 
                                   
                                   
                                   previous_cabg == 'Yes' |  previous_pci == 'Unknown'   | previous_ami == 'Yes' ~  'Yes', 
                                   previous_cabg == 'Yes' |  previous_pci == 'Unknown'   | previous_ami == 'Unknown'   ~  'Yes',
                                   previous_cabg == 'Unknown'   |  previous_pci == 'Unknown'   | previous_ami == 'Yes' ~  'Yes',
                                   previous_cabg == 'Unknown'   | previous_pci == 'Yes'  | previous_ami == 'Unknown'   ~ 'Yes', 
                                   
                                   
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes', 
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                   
                                   previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes', 
                                   previous_cabg == 'No'  | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'No'  | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                   
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes', 
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                   
                                   
                                   previous_cabg == 'No'  | previous_pci == 'No'  |  previous_ami == 'No' ~ 'No', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' |  previous_ami == 'Yes' ~'Yes'
                                   
                  ))

所以综上所述,您的条件是:

  • 对于每一行,如果任何一列是'Yes',输出'Yes'
  • 对于每一行,如果所有列都是 NA,则输出 NA
  • 对于每一行,如果所有列都是'Unknown',输出'Unknown'
  • 否则输出'No'

如果是这种情况,你可以这样做:

# Classify every row of `dat` from its Yes/No/Unknown history columns:
#   * 'Yes'     if any column is 'Yes'
#   * NA        if every column is NA
#   * 'Unknown' if every column is 'Unknown'
#   * 'No'      otherwise
# Assumes `dat` already holds the sample data (an `id` column plus the history
# columns) -- it is not created in this snippet.

# Convert your data structure into a plain data.frame
dat <- as.data.frame(dat)

# Set the id column aside so that only the history columns remain
id <- dat$id
dat <- subset(dat, select = -c(id))

# Work on a single character matrix so that all cells are compared at once,
# instead of looping over the rows with apply() and a scalar ifelse().
values <- as.matrix(dat)

# 'Yes' when at least one column of the row is 'Yes', otherwise 'No'.
# na.rm = TRUE makes NA cells count as "not a match".
output <- ifelse(rowSums(values == 'Yes', na.rm = TRUE) > 0, 'Yes', 'No')

# TRUE for rows in which every column is NA; those rows become NA
isNA <- rowSums(!is.na(values)) == 0
output[isNA] <- NA

# TRUE for rows in which every column is 'Unknown'. Counting the matches (with
# NA cells ignored) keeps this index strictly TRUE/FALSE, so no NA is ever used
# as a subscript in the assignment below.
isUK <- rowSums(values == 'Unknown', na.rm = TRUE) == ncol(values)
output[isUK] <- 'Unknown'

# Append the id and the output character vector as columns of the data frame
dat$id <- id
dat$test <- output

这些操作是按行(rowwise())执行的,所以效率不是很高,但这个 tidyverse 解决方案应该能干净地实现你想要的。

我们把您的示例数据集命名为 dataset,那么可以使用下面的工作流程

library(tidyverse)


# ...
# Code to generate your 'dataset'.
# ...


# Define custom logic across a single row.
# Collapse the values found across a single row into one answer.
#
# Precedence, highest first: "Yes", then "No", then "Unknown". The first of
# these labels that occurs anywhere in `values_across_row` is returned; when
# none of them occurs (e.g. every value is NA) the result is a character NA.
#
# values_across_row: the values of one row, compared element-wise with `==`.
# Returns: a length-one character vector.
get_first_attack <- function(values_across_row) {
  for (label in c("Yes", "No", "Unknown")) {
    # any() yields NA when there are only FALSE and NA comparisons; isTRUE()
    # turns that into "no match" so the next label is tried.
    if (isTRUE(any(values_across_row == label))) {
      return(label)
    }
  }
  # Nothing matched: all values are missing.
  NA_character_
}


# Add a `first_attack` column to `dataset`, computed row by row from the
# columns previous_cabg through previous_ami with get_first_attack().
dataset %>%
  # Examine row by row.
  dplyr::rowwise() %>%
  # Gather the current row's history values into one vector with c_across()
  # (the row-wise helper; calling across() without a function is deprecated)
  # and reduce them according to the logic in 'get_first_attack()'.
  dplyr::mutate(
    first_attack = get_first_attack(
      dplyr::c_across(previous_cabg:previous_ami)
    )
  ) %>%
  # Exit row-wise approach, to restore efficiency.
  dplyr::ungroup() %>%
  # Factor 'first_attack' exactly like its neighboring column.
  dplyr::mutate(first_attack = factor(first_attack, levels = levels(previous_ami)))

应该给你这些结果

# A tibble: 60 x 5
       id previous_cabg previous_pci previous_ami first_attack
    <int> <fct>         <fct>        <fct>        <fct>       
 1 112139 No            No           No           No          
 2  43919 No            No           No           No          
 3  92430 No            Yes          Yes          Yes         
 4  87137 NA            NA           No           No          
 5  95417 No            No           No           No          
 6  66955 NA            NA           NA           NA          
 7  16293 NA            NA           No           No          
 8  61396 No            Yes          Yes          Yes         
 9  25379 No            Yes          No           Yes         
10  79229 No            No           No           No          
# ... with 50 more rows

其中 first_attack 列被恰当地定义为具有三个水平的因子(factor):"Yes"、"No" 和 "Unknown"。