使用数值条件对一系列列进行编码

Question

我有一系列数值列，取值范围为 0 到 8。我想生成一个二分类变量：只要某一行中至少有一次报告的值为 3 或更大，就编码为 "high"，否则编码为 "low"。

structure(list(AE_1 = c(0L, 1L, 0L, 0L, 0L, 2L, 0L), AE_2 = c(0L, 
1L, 2L, 1L, 0L, 0L, 0L), AE_3 = c(1L, 4L, 1L, 8L, 0L, 8L, 1L), 
    AE_4 = c(0L, 1L, 1L, 0L, 0L, 0L, 0L), AE_5 = c(0L, 0L, 1L, 
    1L, 0L, 0L, 1L), AE_6 = c(0L, 5L, 1L, 3L, 0L, 4L, 1L), AE_7 = c(0L, 
    1L, 1L, 1L, 0L, 2L, 0L), AE_8 = c(0L, 2L, 1L, 2L, 0L, 0L, 
    0L), new_AE = c("low", "low", "low", "low", "low", "low", 
    "low")), class = "data.frame", row.names = c(NA, -7L))

我有下面这段代码，但所有行的结果都是 "low"。


# Row-wise labelling attempt: pmap_chr() passes each row's AE values as `...`.
# case_when() returns the label of the first condition that is TRUE, and
# `any(c(...) <= 2)` is tested before `any(c(...) >= 3)`.
df<-df%>%
     mutate(new_AE=  pmap_chr(select(., starts_with('AE')), ~ 
       case_when(any(c(...) <= 2) ~ "low" , any(c(...) >=3) ~ "high")))

虽然我想要这样的东西：

Answer 1

问题在于 case_when 的第一个条件对所有行都为真（每一行都至少有一个值 ≤ 2），因此我们只得到 'low' 值。在这里我们甚至不需要 case_when，因为只有两个类别：可以把逻辑值加 1 转换为数字索引，再用它从标签向量中取出对应的标签

library(dplyr)

# Evaluate each row on its own: any() is TRUE when at least one numeric column
# is 3 or more, and adding 1 turns FALSE/TRUE into positions 1/2 of the labels.
df %>%
  rowwise() %>%
  mutate(
    new_AE = c("low", "high")[any(c_across(where(is.numeric)) >= 3) + 1]
  ) %>%
  ungroup()

-输出

# A tibble: 7 x 9
#   AE_1  AE_2  AE_3  AE_4  AE_5  AE_6  AE_7  AE_8 new_AE
#  <int> <int> <int> <int> <int> <int> <int> <int> <chr> 
#1     0     0     1     0     0     0     0     0 low   
#2     1     1     4     1     0     5     1     2 high  
#3     0     2     1     1     1     1     1     1 low   
#4     0     1     8     0     1     3     1     2 high  
#5     0     0     0     0     0     0     0     0 low   
#6     2     0     8     0     0     4     2     0 high  
#7     0     0     1     0     1     1     0     0 low

或者，使用 base R 中的 rowSums 可以更轻松地完成此操作

# Restrict the comparison to the numeric AE columns. Comparing the whole data
# frame would also compare the character column new_AE with 3 as strings
# ("low" >= "3" is TRUE), which would label every row "high".
is_num <- vapply(df, is.numeric, logical(1))
# rowSums() counts the values >= 3 in each row; a count above zero means
# "high". Adding 1 turns FALSE/TRUE into positions 1/2 of the label vector.
df$new_AE <- c("low", "high")[(rowSums(df[is_num] >= 3) > 0) + 1]
df$new_AE
#[1] "low"  "high" "low"  "high" "low"  "high" "low"

使用 case_when 时必须考虑逻辑条件的先后顺序，或者确保在后面的表达式中加以修正。以 OP 数据的第二行为例进行测试：

# Values of AE_1 to AE_7 from the second row of the example data
v1 <- c(1, 1, 4, 1, 0, 5, 1)
# At least one element is <= 2, so the "low" condition holds for this row
any(v1 <= 2)
#[1] TRUE

这是case_when中的第一个表达式。由于第一个已经执行并找到匹配项，因此不执行后续表达式

# The "low" condition is listed first and is TRUE, so its label is returned
case_when(any(v1 <=2) ~ 'low', any(v1 >=3) ~ 'high')
#[1] "low"

通过颠倒顺序，我们得到 "high"

# With the ">= 3" test listed first, its label is the one returned
case_when( any(v1 >=3) ~ 'high', any(v1 <=2) ~ 'low')
#[1] "high"

所以，确定哪个更优先，并根据那个设置这些表达式的顺序

Answer 2

更新：我对我的解决方案稍作了修改，因为 new_AE 列从一开始就存在，只是值不正确。这里还有另一种解决方案，以防您想用 pmap 一次性完成。不过，您已经收到了一些很棒的解决方案。

library(dplyr)
library(purrr)

# pmap_chr() returns a character vector, so new_AE is an ordinary character
# column rather than the list-column that pmap() would create. The AE_ columns
# are selected by name instead of dropping column 9 by position.
df %>%
  mutate(new_AE = pmap_chr(
    select(., starts_with("AE_")),
    # c(...) gathers one row's values; adding 1 to the logical picks the label
    ~ c("low", "high")[1L + any(c(...) >= 3)]
  ))


  AE_1 AE_2 AE_3 AE_4 AE_5 AE_6 AE_7 AE_8 new_AE
1    0    0    1    0    0    0    0    0    low
2    1    1    4    1    0    5    1    2   high
3    0    2    1    1    1    1    1    1    low
4    0    1    8    0    1    3    1    2   high
5    0    0    0    0    0    0    0    0    low
6    2    0    8    0    0    4    2    0   high
7    0    0    1    0    1    1    0    0    low

Answer 3

在 base R 中，可以用 pmax 求出每一行的最大值，从而轻松完成此操作。当然，您不会想把 8 个列名逐一写进 pmax，因此可以借助 do.call 把所有列一次传入：

# Select the score columns by name rather than by position, so the result does
# not depend on new_AE being the 9th column.
ae_cols <- grep("^AE_", names(df), value = TRUE)
# pmax() gives the row-wise maximum across the columns; the list is unnamed so
# that no column name can be matched to one of pmax()'s own arguments.
row_max <- do.call(pmax, unname(as.list(df[ae_cols])))
# Adding 1 turns FALSE/TRUE into positions 1/2 of the label vector.
df$new_AE <- c("low", "high")[1 + (row_max >= 3)]

> df
  AE_1 AE_2 AE_3 AE_4 AE_5 AE_6 AE_7 AE_8 new_AE
1    0    0    1    0    0    0    0    0    low
2    1    1    4    1    0    5    1    2   high
3    0    2    1    1    1    1    1    1    low
4    0    1    8    0    1    3    1    2   high
5    0    0    0    0    0    0    0    0    low
6    2    0    8    0    0    4    2    0   high
7    0    0    1    0    1    1    0    0    low

看看 [] 中的表达式如何根据您想要的条件返回 TRUE/FALSE：

# this returns max of each row
do.call(pmax, df[,-9])
[1] 1 5 2 8 0 8 1

# this checks whether max of each row is 3 or more
do.call(pmax, df[,-9]) >= 3
[1] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE

因此，如果您不习惯使用此策略，可以改用 replace

# replace() overwrites only the rows whose row-wise maximum is 3 or more with
# "high"; all other rows keep whatever value new_AE already holds.
df$new_AE <- replace(df$new_AE, do.call(pmax, df[,-9]) >= 3, "high")

使用数值条件对一系列列进行编码

Coding a series of columns with a numerical condition

r

dataframe

dplyr

case-when