告诉 ifelse 忽略 NA 的直接方法
Direct way of telling ifelse to ignore NA
如前所述,当 ifelse(test, yes, no) 中的测试条件 test 为 NA 时,求值结果也是 NA。因此,下面的代码返回……
# Example data: every row holds a single 1 (in a, b or c) and NA elsewhere.
df <- data.frame(a = c(1, 1, NA, NA, NA ,NA),
b = c(NA, NA, 1, 1, NA, NA),
c = c(rep(NA, 4), 1, 1))
# Naive cascade: `NA == 1` evaluates to NA, and ifelse() returns NA wherever
# its test is NA, so rows with NA in `a` never fall through to the nested calls.
ifelse(df$a==1, "a==1",
ifelse(df$b==1, "b==1",
ifelse(df$c==1, "c==1", NA)))
#[1] "a==1" "a==1" NA NA NA NA
... 而不是所需的
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
按照Cath的建议,我可以通过正式指定测试条件不应该包括NA来规避这个问题:
ifelse(df$a==1 & !is.na(df$a), "a==1",
ifelse(df$b==1 & !is.na(df$b), "b==1",
ifelse(df$c==1 & !is.na(df$c), "c==1", NA)))
但是,正如 akrun 还指出的那样,随着列数的增加,此解决方案变得相当冗长。
解决方法是首先将所有 NA
替换为 data.frame 中不存在的值(例如,在本例中为 2):
df_noNA <- data.frame(a = c(1, 1, 2, 2, 2 ,2),
b = c(2, 2, 1, 1, 2, 2),
c = c(rep(2, 4), 1, 1))
ifelse(df_noNA$a==1, "a==1",
ifelse(df_noNA$b==1, "b==1",
ifelse(df_noNA$c==1, "c==1", NA)))
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
但是,我想知道是否有 更直接的方法告诉 ifelse
忽略 NA?还是为 & !is.na
编写函数是最直接的方法?
# Test which elements of `column` equal `value`, treating NA as "no match".
#
# column: vector to test.
# value:  single non-NA value to compare against; defaults to 1, which
#         reproduces the original hard-coded behaviour.
# Returns a logical vector as long as `column`; for a non-NA `value` it
# contains only TRUE/FALSE, so it is safe as the `test` argument of ifelse().
ignorena <- function(column, value = 1) {
  # `column == value` is NA where `column` is NA; `FALSE & NA` is FALSE,
  # so the is.na() guard turns those NAs into FALSE.
  !is.na(column) & column == value
}
ifelse(ignorena(df$a), "a==1",
ifelse(ignorena(df$b), "b==1",
ifelse(ignorena(df$c), "c==1", NA)))
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
我们可以不使用嵌套的 ifelse 调用,更高效地完成这件事。对于第一个数据集,我们先为非 NA 元素创建一个逻辑矩阵(!is.na(df)),再用 max.col 取得每行最大值(即 TRUE)所在的列索引,然后用该索引取出列名,并用 paste0 在后面拼接 "==1":
paste0(names(df)[max.col(!is.na(df))], "==1")
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
如果有只有 NA 的行
# Label each row with its first non-NA column; rows that are entirely NA get NA.
# ties.method = "first": max.col() otherwise breaks ties at random, which makes
# rows with several non-NA columns nondeterministic; "first" matches the
# left-to-right priority of the nested ifelse().
# NA^!rowSums(...): the exponent is TRUE only for all-NA rows, and NA^TRUE is NA
# while NA^FALSE is 1, so the column index is blanked out exactly for those rows.
v1 <- names(df)[
  max.col(!is.na(df), ties.method = "first") * NA^!rowSums(!is.na(df))
]
i1 <- !is.na(v1)
# Append the "==1" suffix only where a column name was found.
v1[i1] <- paste0(v1[i1], "==1")
而对于第二个数据集,因为没有NA,我们可以直接与1比较得到一个逻辑矩阵,和之前的步骤一样
paste0(names(df_noNA)[max.col(df_noNA == 1)], "==1")
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
您可以使用 %in% 代替 ==,这样在某种程度上就能忽略 NA。
ifelse(df$a %in% 1, "a==1",
ifelse(df$b %in% 1, "b==1",
ifelse(df$c %in% 1, "c==1", NA)))
不幸的是,与原始解决方案相比,这并没有带来任何性能提升,而 @akrun 的解决方案大约快了 3 倍。
# Benchmark baseline: nested ifelse() with an explicit NA guard on every test.
# Reads the global data frame `df` (columns a, b, c) and returns, per row, the
# label of the first column equal to 1 ("a==1", "b==1" or "c==1"), or NA when
# no column is.
solution_original <- function() {
  # TRUE only where the value is present and equal to 1; NA becomes FALSE.
  is_one <- function(v) !is.na(v) & v == 1
  ifelse(
    is_one(df$a), "a==1",
    ifelse(
      is_one(df$b), "b==1",
      ifelse(is_one(df$c), "c==1", NA)
    )
  )
}
# Vectorised alternative to the nested ifelse(): label each row of the global
# data frame `df` with "<column>==1" for its first non-NA column, or NA when
# the whole row is NA.
#
# Note: this looks for non-NA entries, not for entries equal to 1, so it agrees
# with the ifelse() cascade only when every non-NA value in `df` is 1.
#
# Returns a character vector with one element per row of `df`.
solution_akrun <- function() {
  not_na <- !is.na(df)
  # max.col() breaks ties at random by default; "first" makes the result
  # deterministic and consistent with the left-to-right ifelse() priority.
  col_idx <- max.col(not_na, ties.method = "first")
  # An all-NA row has no TRUE entry, so its index is meaningless: blank it out.
  col_idx[rowSums(not_na) == 0] <- NA
  v1 <- names(df)[col_idx]
  i1 <- !is.na(v1)
  v1[i1] <- paste0(v1[i1], "==1")
  # Return the full vector; ending on the sub-assignment above would return
  # only the pasted labels of the non-NA rows.
  v1
}
# Benchmark candidate: the nested ifelse() cascade with `%in%`-style matching,
# which yields FALSE (never NA) for missing values.
# Reads the global data frame `df` (columns a, b, c); the parameter `x` is not
# used by the body. Returns "a==1", "b==1" or "c==1" per row for the first
# column equal to 1, or NA when none is.
solution_mine <- function(x) {
  # Spelled-out form of `v %in% 1`: a value with no match maps to 0, i.e. FALSE.
  matches_one <- function(v) match(v, 1, nomatch = 0L) > 0L
  ifelse(
    matches_one(df$a), "a==1",
    ifelse(
      matches_one(df$b), "b==1",
      ifelse(matches_one(df$c), "c==1", NA)
    )
  )
}
# Benchmark data: 1e6 rows; each column is drawn with replacement from a pool
# of one 1 and four NAs, so about one value in five is 1.
# `replace = TRUE` is spelled out instead of the positional `T`, which is an
# ordinary variable that can be reassigned.
set.seed(1)
df <- data.frame(a = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE),
                 b = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE),
                 c = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE))
microbenchmark::microbenchmark(
solution_original(),
solution_akrun(),
solution_mine()
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## solution_original() 701.9413 839.3715 845.0720 853.1960 875.6151 1051.6659 100
## solution_akrun() 217.4129 242.5113 293.2987 253.2144 387.1598 564.3981 100
## solution_mine() 698.7628 845.0822 848.6717 858.7892 877.9676 1006.2872 100
受此启发:R: Dealing with TRUE, FALSE, NA and NaN
编辑
根据 @akrun 的评论,我重新做了基准测试,并修改了相应的说法。
dplyr::case_when
是级联 ifelse
调用的便捷替代方法:
library(dplyr)
df <- data.frame(a = c(1, 1, NA, NA, NA ,NA),
b = c(NA, NA, 1, 1, NA, NA),
c = c(rep(NA, 4), 1, 1))
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
#> a b c equals
#> 1 1 NA NA a==1
#> 2 1 NA NA a==1
#> 3 NA 1 NA b==1
#> 4 NA 1 NA b==1
#> 5 NA NA 1 c==1
#> 6 NA NA 1 c==1
它像 ifelse 一样级联:只要第一个条件为真,即使第二个和第三个条件也为真,返回的也是第一个结果;如果没有任何条件为真,则返回 NA:
set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 30, replace = TRUE), 10)), letters[1:3])
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
#> a b c equals
#> 1 NA 1 1 b==1
#> 2 1 NA NA a==1
#> 3 NA 1 NA b==1
#> 4 NA NA 1 c==1
#> 5 NA NA NA <NA>
#> 6 NA NA 1 c==1
#> 7 1 1 1 a==1
#> 8 1 1 1 a==1
#> 9 NA 1 NA b==1
#> 10 NA 1 NA b==1
而且速度很快:
set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 3 * 1e5, replace = TRUE), ncol = 3)), letters[1:3])
microbenchmark::microbenchmark(
original = {
ifelse(df$a == 1 & !is.na(df$a), "a==1",
ifelse(df$b == 1 & !is.na(df$b), "b==1",
ifelse(df$c == 1 & !is.na(df$c), "c==1", NA)))},
akrun = {
v1 <- names(df)[max.col(!is.na(df)) * NA^!rowSums(!is.na(df))]
i1 <- !is.na(v1)
v1[i1] <- paste0(v1[i1], "==1")
},
amatsuo_net = {
ifelse(df$a %in% 1, "a==1",
ifelse(df$b %in% 1, "b==1",
ifelse(df$c %in% 1, "c==1", NA)))
},
alistaire = {
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
}
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> original 81.19896 86.11843 110.93882 123.92463 128.58037 171.11026 100
#> akrun 27.50351 30.99127 38.98353 32.67991 34.64947 77.98958 100
#> amatsuo_net 83.75744 88.54095 109.22226 110.40066 129.02168 170.92911 100
#> alistaire 16.57426 18.91951 21.73293 19.29925 24.30350 33.83180 100
如前所述,当 ifelse(test, yes, no) 中的测试条件 test 为 NA 时,求值结果也是 NA。因此,下面的代码返回……
# Example data: every row holds a single 1 (in a, b or c) and NA elsewhere.
df <- data.frame(a = c(1, 1, NA, NA, NA ,NA),
b = c(NA, NA, 1, 1, NA, NA),
c = c(rep(NA, 4), 1, 1))
# Naive cascade: `NA == 1` evaluates to NA, and ifelse() returns NA wherever
# its test is NA, so rows with NA in `a` never fall through to the nested calls.
ifelse(df$a==1, "a==1",
ifelse(df$b==1, "b==1",
ifelse(df$c==1, "c==1", NA)))
#[1] "a==1" "a==1" NA NA NA NA
... 而不是所需的
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
按照Cath的建议,我可以通过正式指定测试条件不应该包括NA来规避这个问题:
ifelse(df$a==1 & !is.na(df$a), "a==1",
ifelse(df$b==1 & !is.na(df$b), "b==1",
ifelse(df$c==1 & !is.na(df$c), "c==1", NA)))
但是,正如 akrun 还指出的那样,随着列数的增加,此解决方案变得相当冗长。
解决方法是首先将所有 NA
替换为 data.frame 中不存在的值(例如,在本例中为 2):
df_noNA <- data.frame(a = c(1, 1, 2, 2, 2 ,2),
b = c(2, 2, 1, 1, 2, 2),
c = c(rep(2, 4), 1, 1))
ifelse(df_noNA$a==1, "a==1",
ifelse(df_noNA$b==1, "b==1",
ifelse(df_noNA$c==1, "c==1", NA)))
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
但是,我想知道是否有 更直接的方法告诉 ifelse
忽略 NA?还是为 & !is.na
编写函数是最直接的方法?
# Test which elements of `column` equal `value`, treating NA as "no match".
#
# column: vector to test.
# value:  single non-NA value to compare against; defaults to 1, which
#         reproduces the original hard-coded behaviour.
# Returns a logical vector as long as `column`; for a non-NA `value` it
# contains only TRUE/FALSE, so it is safe as the `test` argument of ifelse().
ignorena <- function(column, value = 1) {
  # `column == value` is NA where `column` is NA; `FALSE & NA` is FALSE,
  # so the is.na() guard turns those NAs into FALSE.
  !is.na(column) & column == value
}
ifelse(ignorena(df$a), "a==1",
ifelse(ignorena(df$b), "b==1",
ifelse(ignorena(df$c), "c==1", NA)))
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
我们可以不使用嵌套的 ifelse 调用,更高效地完成这件事。对于第一个数据集,我们先为非 NA 元素创建一个逻辑矩阵(!is.na(df)),再用 max.col 取得每行最大值(即 TRUE)所在的列索引,然后用该索引取出列名,并用 paste0 在后面拼接 "==1":
paste0(names(df)[max.col(!is.na(df))], "==1")
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
如果有只有 NA 的行
# Label each row with its first non-NA column; rows that are entirely NA get NA.
# ties.method = "first": max.col() otherwise breaks ties at random, which makes
# rows with several non-NA columns nondeterministic; "first" matches the
# left-to-right priority of the nested ifelse().
# NA^!rowSums(...): the exponent is TRUE only for all-NA rows, and NA^TRUE is NA
# while NA^FALSE is 1, so the column index is blanked out exactly for those rows.
v1 <- names(df)[
  max.col(!is.na(df), ties.method = "first") * NA^!rowSums(!is.na(df))
]
i1 <- !is.na(v1)
# Append the "==1" suffix only where a column name was found.
v1[i1] <- paste0(v1[i1], "==1")
而对于第二个数据集,因为没有NA,我们可以直接与1比较得到一个逻辑矩阵,和之前的步骤一样
paste0(names(df_noNA)[max.col(df_noNA == 1)], "==1")
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
您可以使用 %in% 代替 ==,这样在某种程度上就能忽略 NA。
ifelse(df$a %in% 1, "a==1",
ifelse(df$b %in% 1, "b==1",
ifelse(df$c %in% 1, "c==1", NA)))
不幸的是,与原始解决方案相比,这并没有带来任何性能提升,而 @akrun 的解决方案大约快了 3 倍。
# Benchmark baseline: nested ifelse() with an explicit NA guard on every test.
# Reads the global data frame `df` (columns a, b, c) and returns, per row, the
# label of the first column equal to 1 ("a==1", "b==1" or "c==1"), or NA when
# no column is.
solution_original <- function() {
  # TRUE only where the value is present and equal to 1; NA becomes FALSE.
  is_one <- function(v) !is.na(v) & v == 1
  ifelse(
    is_one(df$a), "a==1",
    ifelse(
      is_one(df$b), "b==1",
      ifelse(is_one(df$c), "c==1", NA)
    )
  )
}
# Vectorised alternative to the nested ifelse(): label each row of the global
# data frame `df` with "<column>==1" for its first non-NA column, or NA when
# the whole row is NA.
#
# Note: this looks for non-NA entries, not for entries equal to 1, so it agrees
# with the ifelse() cascade only when every non-NA value in `df` is 1.
#
# Returns a character vector with one element per row of `df`.
solution_akrun <- function() {
  not_na <- !is.na(df)
  # max.col() breaks ties at random by default; "first" makes the result
  # deterministic and consistent with the left-to-right ifelse() priority.
  col_idx <- max.col(not_na, ties.method = "first")
  # An all-NA row has no TRUE entry, so its index is meaningless: blank it out.
  col_idx[rowSums(not_na) == 0] <- NA
  v1 <- names(df)[col_idx]
  i1 <- !is.na(v1)
  v1[i1] <- paste0(v1[i1], "==1")
  # Return the full vector; ending on the sub-assignment above would return
  # only the pasted labels of the non-NA rows.
  v1
}
# Benchmark candidate: the nested ifelse() cascade with `%in%`-style matching,
# which yields FALSE (never NA) for missing values.
# Reads the global data frame `df` (columns a, b, c); the parameter `x` is not
# used by the body. Returns "a==1", "b==1" or "c==1" per row for the first
# column equal to 1, or NA when none is.
solution_mine <- function(x) {
  # Spelled-out form of `v %in% 1`: a value with no match maps to 0, i.e. FALSE.
  matches_one <- function(v) match(v, 1, nomatch = 0L) > 0L
  ifelse(
    matches_one(df$a), "a==1",
    ifelse(
      matches_one(df$b), "b==1",
      ifelse(matches_one(df$c), "c==1", NA)
    )
  )
}
# Benchmark data: 1e6 rows; each column is drawn with replacement from a pool
# of one 1 and four NAs, so about one value in five is 1.
# `replace = TRUE` is spelled out instead of the positional `T`, which is an
# ordinary variable that can be reassigned.
set.seed(1)
df <- data.frame(a = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE),
                 b = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE),
                 c = sample(c(1, rep(NA, 4)), 1e6, replace = TRUE))
microbenchmark::microbenchmark(
solution_original(),
solution_akrun(),
solution_mine()
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## solution_original() 701.9413 839.3715 845.0720 853.1960 875.6151 1051.6659 100
## solution_akrun() 217.4129 242.5113 293.2987 253.2144 387.1598 564.3981 100
## solution_mine() 698.7628 845.0822 848.6717 858.7892 877.9676 1006.2872 100
受此启发:R: Dealing with TRUE, FALSE, NA and NaN
编辑
根据 @akrun 的评论,我重新做了基准测试,并修改了相应的说法。
dplyr::case_when
是级联 ifelse
调用的便捷替代方法:
library(dplyr)
df <- data.frame(a = c(1, 1, NA, NA, NA ,NA),
b = c(NA, NA, 1, 1, NA, NA),
c = c(rep(NA, 4), 1, 1))
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
#> a b c equals
#> 1 1 NA NA a==1
#> 2 1 NA NA a==1
#> 3 NA 1 NA b==1
#> 4 NA 1 NA b==1
#> 5 NA NA 1 c==1
#> 6 NA NA 1 c==1
它像 ifelse 一样级联:只要第一个条件为真,即使第二个和第三个条件也为真,返回的也是第一个结果;如果没有任何条件为真,则返回 NA:
set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 30, replace = TRUE), 10)), letters[1:3])
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
#> a b c equals
#> 1 NA 1 1 b==1
#> 2 1 NA NA a==1
#> 3 NA 1 NA b==1
#> 4 NA NA 1 c==1
#> 5 NA NA NA <NA>
#> 6 NA NA 1 c==1
#> 7 1 1 1 a==1
#> 8 1 1 1 a==1
#> 9 NA 1 NA b==1
#> 10 NA 1 NA b==1
而且速度很快:
set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 3 * 1e5, replace = TRUE), ncol = 3)), letters[1:3])
microbenchmark::microbenchmark(
original = {
ifelse(df$a == 1 & !is.na(df$a), "a==1",
ifelse(df$b == 1 & !is.na(df$b), "b==1",
ifelse(df$c == 1 & !is.na(df$c), "c==1", NA)))},
akrun = {
v1 <- names(df)[max.col(!is.na(df)) * NA^!rowSums(!is.na(df))]
i1 <- !is.na(v1)
v1[i1] <- paste0(v1[i1], "==1")
},
amatsuo_net = {
ifelse(df$a %in% 1, "a==1",
ifelse(df$b %in% 1, "b==1",
ifelse(df$c %in% 1, "c==1", NA)))
},
alistaire = {
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
}
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> original 81.19896 86.11843 110.93882 123.92463 128.58037 171.11026 100
#> akrun 27.50351 30.99127 38.98353 32.67991 34.64947 77.98958 100
#> amatsuo_net 83.75744 88.54095 109.22226 110.40066 129.02168 170.92911 100
#> alistaire 16.57426 18.91951 21.73293 19.29925 24.30350 33.83180 100