嵌套在 R 中的 ifelse 非常接近工作

Question

我正在处理以下四列原始体重测量数据和一个功能非常接近的嵌套 ifelse 语句，该语句生成 'kg' 向量。

     Id       G4_R_2_4         G4_R_2_5        G4_R_2_5_option2          kg
219 13237       16.0             NA                  NA                16.0
220 139129      8.50             55.70               47.20             8.50
221 139215      28.9             NA                  NA                28.9
222 139216       NA              46.70               8.50              46.70
223 139264      12.40            NA                  NA                12.40
224 139281      13.60            NA                  NA                13.60
225 139366      16.10            NA                  NA                16.10
226 139376      61.80            NA                  NA                61.80
227 140103      NA               48.60               9.10              48.60

目标是根据以下条件将三个 'G4' 列合并为 kg： 1) 如果 G4_R_2_4 不是 NA，打印它的值 2) 如果 G4_R_2_4 是 NA，打印出现在 G4_R_2_5 和 G4_R_2_5_option2 中的较小值（对不起变量名）

我一直在使用以下语句（称为 'child' 的大数据集）：

> child$kg <- ifelse(child$G4_R_2_4 == 'NA' & child$G4_R_2_5 < child$G4_R_2_5_option2,
   child$G4_R_2_5, ifelse(child$G4_R_2_4 == 'NA' & child$G4_R_2_5 > child$G4_R_2_5_option2,
                          child$G4_R_2_5_option2, child$G4_R_2_4))

这导致了我现在拥有的 'kg' 向量。它似乎满足 G4_R_2_4 条件（is/is 不是 NA），但对于 NA 情况总是打印 G4_R_2_5 的值。我如何让它包含大于 than/less 的条件？

Answer 1

从您的示例中看不出来，但我认为问题在于您对 NA 的处理不正确 and\or，对 data.frame 的列使用了错误的类型。尝试像这样重写您的代码：

#if your columns are of character type (warnings are ok)
child$G4_R_2_4<-as.numeric(child$G4_R_2_4)
child$G4_R_2_5<-as.numeric(child$G4_R_2_5)
child$G4_R_2_5_option2<-as.numeric(child$G4_R_2_5_option2)
#correct NA handling
child$kg<-ifelse(is.na(child$G4_R_2_4) & child$G4_R_2_5 <
   child$G4_R_2_5_option2, child$G4_R_2_5, ifelse(is.na(child$G4_R_2_4) &
     child$G4_R_2_5 > child$G4_R_2_5_option2, child$G4_R_2_5_option2, child$G4_R_2_4))

Answer 2

我很确定问题在于您不是在测试这些值是否为 NA，而是在测试它们是否等于字符串 "NA"，而它们从来都不是。这应该有效：

child$kg <- ifelse(is.na(child$G4_R_2_4) & 
                   child$G4_R_2_5 < child$G4_R_2_5_option2,
                   child$G4_R_2_5,
              ifelse(is.na(child$G4_R_2_4) &
                     child$G4_R_2_5 > child$G4_R_2_5_option2,
                     child$G4_R_2_5_option2,
                       child$G4_R_2_4))

Answer 3

我们可以使用 pmin 来做到这一点。假设您的 'G4' 列是 'character' class，我们将这些列转换为 'numeric' class 并在该列上使用 pmin。

 indx <- grep('^G4', names(child))
 child[indx] <- lapply(child[indx], as.numeric)
 d1 <- child[indx]
 child$kgN <- ifelse(is.na(d1[,1]), do.call(pmin, c(d1[-1], na.rm=TRUE)), d1[,1])
 child$kgN
 #[1] 16.0  8.5 28.9  8.5 12.4 13.6 16.1 61.8  9.1

或不使用 ifelse

 cbind(d1[,1], do.call(pmin, c(d1[-1], na.rm=TRUE)))[cbind(1:nrow(d1),
             (is.na(d1[,1]))+1L)]
 #[1] 16.0  8.5 28.9  8.5 12.4 13.6 16.1 61.8  9.1

基准

set.seed(24)
child1 <- as.data.frame(matrix(sample(c(NA,0:50), 1e6*3, replace=TRUE),
    ncol=3, dimnames=list(NULL, c('G4_R_2_4', 'G4_R_2_5', 
                'G4_R_2_5_option2'))) )
cyberj0g <- function(){
   with(child1, ifelse(is.na(G4_R_2_4) & G4_R_2_5 <
     G4_R_2_5_option2, G4_R_2_5, ifelse(is.na(G4_R_2_4) &
       G4_R_2_5 > G4_R_2_5_option2, G4_R_2_5_option2, G4_R_2_4)))
  }

 get_kg <- function(x){
      if(!is.na(x[2])) return (x[2])
      return (min(x[3], x[4], na.rm = T))}
RHertel <- function() apply(child1,1,get_kg) 

akrun <- function(){cbind(child1[,1], do.call(pmin, c(child1[-1],
    na.rm=TRUE)))[cbind(1:nrow(child1),  (is.na(child1[,1]))+1L)]} 

system.time(cyberj0g())
#  user  system elapsed 
# 0.451   0.000   0.388  

system.time(RHertel())
#   user  system elapsed 
# 11.808   0.000  10.928 

system.time(akrun())
#   user  system elapsed 
#  0.000   0.000   0.084 

library(microbenchmark) 
microbenchmark(cyberj0g(), akrun(), unit='relative', times=20L)
#Unit: relative
#       expr      min       lq     mean   median       uq      max neval cld
# cyberj0g() 3.750391 4.137777 3.538063 4.091793 2.895156 3.197511    20   b
#    akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20   a

Answer 4

这是一个可能有趣的替代版本，假设值以数字形式存储（否则列条目应转换为数字值，如其他答案中所建议的）：

get_kg <- function(x){
 if(!is.na(x[2])) return (x[2])
 return (min(x[3], x[4], na.rm = T))}

child$kg <- apply(child,1,get_kg)

#> child
#        Id G4_R_2_4 G4_R_2_5 G4_R_2_5_option2   kg
#219  13237     16.0       NA               NA 16.0
#220 139129      8.5     55.7             47.2  8.5
#221 139215     28.9       NA               NA 28.9
#222 139216       NA     46.7              8.5  8.5
#223 139264     12.4       NA               NA 12.4
#224 139281     13.6       NA               NA 13.6
#225 139366     16.1       NA               NA 16.1
#226 139376     61.8       NA               NA 61.8
#227 140103       NA     48.6              9.1  9.1

嵌套在 R 中的 ifelse 非常接近工作

nested ifelse in R so close to working

if-statement

r

dataframe

na

基准