r - 从其他变量复制缺失值

Question

简单的问题，但我不知道如何执行以下操作。这是我的数据：

ID    Time1    Time2    Time3    Time4
01    23       23       NA       NA  
02    21       21       21       NA
03    22       22       25       NA
04    29       29       20       NA
05    NA       NA       15       22
06    NA       NA       11       NA

现在，我想用其他变量中可用的数据替换缺失值 (NA)。重要的是，我需要 r 将 'closest' 的值取到缺失的数据点。例如，对于 ID 5，Time1 和 Time2 应该是“15”（而不是“22”）。

像这样：

ID    Time1    Time2    Time3    Time4
01    23       23       23       23  
02    21       21       21       21
03    22       22       25       25
04    29       29       20       20
05    15       15       15       22
06    11       11       11       11

我试过 ifelse 语句，但没有成功。

谢谢！

Answer 1

这比看起来要难得多。我构建了一个一次在一列上工作的解决方案，采用所有时间列索引和当前列索引之间的绝对距离的 pmin()，使用 na.rm=T 参数剥离 NA。然后可以使用结果使用索引矩阵对原始时间列进行索引，然后可以将其分配给目标 data.frame.

中的当前列索引

这种设计的一个优点是它对行进行了完全矢量化。换句话说，它不会一次迭代一行。这对于行数非常多的输入来说可能是一个优势。另一方面，该解决方案确实涉及构建并行所有时间列（timemat、nacols 和 off）的矩阵，这对于大输入来说可能代价高昂。它基本上是在牺牲内存来节省 CPU.

我添加了几行来测试 OP 样本未涵盖的其他案例 data.frame；特别是 (1) 一个全 NA 行，和 (2) 一个在 NA 值的两边有候选非 NA 值的行。

输入：

df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    NA    NA
## 2 02    21    21    21    NA
## 3 03    22    22    25    NA
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    NA
## 6 06    NA    NA    15    22
## 7 07    NA    NA    11    NA
## 8 08     1    NA    NA     2

解决方案：

ris <- seq_len(nrow(df));
cis <- grep('^Time',names(df));
timemat <- as.matrix(df[cis]);
nacols <- as.data.frame(ifelse(is.na(timemat),NA,col(timemat)));
nacols;
##   Time1 Time2 Time3 Time4
## 1     1     2    NA    NA
## 2     1     2     3    NA
## 3     1     2     3    NA
## 4    NA    NA    NA    NA
## 5     1     2     3    NA
## 6    NA    NA     3     4
## 7    NA    NA     3    NA
## 8     1    NA    NA     4
for (ci in seq_len(ncol(timemat))) {
    off <- abs(nacols-ci);
    best <- which(off==do.call(pmin,c(off,na.rm=T)),arr.ind=T);
    df[cis[ci]] <- timemat[matrix(c(ris,best[match(ris,best[,'row']),'col']),nrow(df))];
};
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    23    23
## 2 02    21    21    21    21
## 3 03    22    22    25    25
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    20
## 6 06    15    15    15    22
## 7 07    11    11    11    11
## 8 08     1     1     2     2

Rcpp解决方案：

library(Rcpp);
cppFunction('
    IntegerMatrix fillDFNAsWithNearestInRow(DataFrame df, IntegerVector cis ) {
        IntegerMatrix res(df.nrows(),cis.size());
        if (df.nrows()==0 || cis.size()==0) return res;
        IntegerVector cis0 = clone(cis); for (int cisi = 0; cisi < cis0.size(); ++cisi) --cis0[cisi]; // correct from R 1-based to Rcpp 0-based
        for (int cisi = 0; cisi < cis0.size(); ++cisi) {
            IntegerVector colCur = df[cis0[cisi]];
            for (int ri = 0; ri < colCur.size(); ++ri) {
                if (!IntegerVector::is_na(colCur[ri])) {
                    res(ri,cisi) = colCur[ri];
                    continue;
                }
                int leftOk;
                int rightOk;
                IntegerVector colLeft;
                IntegerVector colRight;
                bool set = false; // assumption
                for (int off = 1; (leftOk = cisi-off>=0, rightOk = cisi+off<cis0.size(), leftOk ) || rightOk; ++off) {
                    if (leftOk && (colLeft = df[cis0[cisi-off]], !IntegerVector::is_na(colLeft[ri]))) {
                        res(ri,cisi) = colLeft[ri];
                        set = true;
                        break;
                    } else if (rightOk && (colRight = df[cis0[cisi+off]], !IntegerVector::is_na(colRight[ri]))) {
                        res(ri,cisi) = colRight[ri];
                        set = true;
                        break;
                    }
                }
                if (!set) res(ri,cisi) = NA_INTEGER;
            }
        }
        return res;
    }
');

df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
cis <- grep('^Time',names(df));
df[cis] <- fillDFNAsWithNearestInRow(df,cis);
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    23    23
## 2 02    21    21    21    21
## 3 03    22    22    25    25
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    20
## 6 06    15    15    15    22
## 7 07    11    11    11    11
## 8 08     1     1     2     2

Answer 2

使用 data.table 的滚动连接和 set:

library(data.table)
good = as.data.table( which(!is.na(df[-1]), arr.ind = TRUE) )
all = CJ(row = seq(nrow(df)), col = seq(2L, ncol(df)))
good$col = good$col + 1L
good$col_src = good$col

changes = good[all, on = c("row", "col"), roll="nearest"][ col != col_src ]

changes[, {
  set(df, i = row, j = col, value = df[[ col_src ]][row])
  NULL
}, by=.(col,col_src)]

# based on input from bgoldst's answer
   ID  1  2  3  4
1: 01 23 23 23 23
2: 02 21 21 21 21
3: 03 22 22 25 25
4: 04 NA NA NA NA
5: 05 29 29 20 20
6: 06 15 15 15 22
7: 07 11 11 11 11
8: 08  1  1  2  2

我们找到所有条目进行切换，然后参照set进行修改。我不确定 roll="nearest" 如何处理关系，但我确定可以对其进行调整。

Answer 3

这是一个简单的解决方案：

x <-read.table(text="ID    Time1    Time2    Time3    Time4
01    23       23       NA       NA  
02    21       21       21       NA
03    22       22       25       NA
04    29       29       20       NA
05    NA       NA       15       22
06    NA       NA       11       NA", header=TRUE)

x <- as.matrix(x[,-1])

dofill <- function(r){
  PREV <- c(NA, suppressWarnings(head(r, -1)))
  NEXT <- c(tail(r,-1), NA)
  r[is.na(r)] <- PREV[is.na(r)]
  r[is.na(r)] <- NEXT[is.na(r)]
  r
}

rlefill <- function(r){
  r[is.na(r)] <- "NA"
  rle1 <- rle(r)
  rle1$values <- dofill(as.numeric(rle1$values))
  inverse.rle(rle1)
}

t(apply(x, 1, rlefill))

dofill 只是将所有 NA 替换为先前的值，并将所有剩余的 NA 替换为下一个值。

rlefill 需要将 NA 序列转换为 "one big NA"。

当然，如果你有更多的时间点，你可能需要类似...

cis <- grep('^Time',names(df))
timemat <- as.matrix(df[cis]);

...即从数据框中提取相关部分的更通用的解决方案。

（现在我意识到这不完全是你问的 - 我的解决方案总是使用前面的值，如果它可用，即使下面的值在时间上更接近。它对你的例子没有影响数据集，但它可能会对实际数据产生影响。）

Answer 4

又一次尝试。尽可能分解：（1）从左到右循环一次，向前携带最后一个非 NA 值，并且记录被替换的每个 NA 的非 NA 在哪里，（2）从右到左再次循环（ a) 向后替换携带非 NA 的 NA 和 (b) 比较替换每个 NA 的非 NA 与当前非 NA 的距离，然后保留或替换。尽管有两个显式循环，计算涉及 length == nrow(x).

的向量

ff = function(x)
{    
    taken_from = lapply(seq_along(x), rep_len, nrow(x))
    nas = lapply(x, is.na)

    #loop left -> right 
    # carry forward non-NAs and record which non-NA replaced NA
    last_nona = !nas[[1L]]
    for(j in 2:length(x)) {
        i = which(nas[[j]] & last_nona)
        x[[j]][i] = x[[j - 1L]][i]
        taken_from[[j]][i] = taken_from[[j - 1L]][i]
        last_nona = !is.na(x[[j]])
    }

    #loop right -> left
    #if NA and not replace carry the previous non-NA backward
    #else compare which non-NA is nearer and replace appropriately
    last_nona = !nas[[length(x)]]
    for(j in (length(x) - 1L):1L) {
        i1 = which(nas[[j]] & last_nona)
        i = i1[(j - taken_from[[j]][i1]) > (taken_from[[(j + 1L)]][i1] - j)]
        ii = i1[j == taken_from[[j]][i1]]
        taken_from[[j]][i] = taken_from[[j + 1L]][i]
        taken_from[[j]][ii] = taken_from[[j + 1L]][ii]
        x[[j]][i] = x[[j + 1L]][i]
        x[[j]][ii] = x[[j + 1L]][ii]
        last_nona = !is.na(x[[j]])
    }

    return(x)
}

使用 bgoldst 的数据：

ff(df[-1L])
#  Time1 Time2 Time3 Time4
#1    23    23    23    23
#2    21    21    21    21
#3    22    22    25    25
#4    NA    NA    NA    NA
#5    29    29    20    20
#6    15    15    15    22
#7    11    11    11    11
#8     1     1     2     2

以及必要的基准测试：

set.seed(911)            
DAT = as.data.frame(matrix(sample(c(NA, 0:10), 1e7, TRUE), 1e6, 10))
system.time({ ansff = ff(DAT) })
#   user  system elapsed 
#   0.82    0.38    1.75 
system.time({ ansbgoldst1 = bgoldst1(DAT) })
#   user  system elapsed 
#  20.96    7.53   42.04 
system.time({ ansbgoldst2 = bgoldst2(DAT) })
#   user  system elapsed 
#   0.97    0.25    1.64 
sf1 = system.time({ ansfrank = frank(DAT) }); sf2 = system.time( copy(DAT) )
sf1 - sf2
#   user  system elapsed 
#   5.84    1.46    8.59 
all.equal(ansff, ansbgoldst1)
#[1] TRUE
all.equal(ansbgoldst1, ansbgoldst2)
#[1] TRUE
all.equal(ansbgoldst2, ansfrank)
#[1] TRUE

函数：

bgoldst1 = function(x)
{
    ris = seq_len(nrow(x))
    xm = as.matrix(x)
    nacols = as.data.frame(lapply(seq_along(x), function(i) { x[[i]][!is.na(x[[i]])] = i; x[[i]] }))
    for(ci in seq_along(x)) {
        off = abs(nacols - ci)
        best = which(off == do.call(pmin, c(off, na.rm = TRUE)), arr.ind = TRUE)
        x[ci] = xm[matrix(c(ris, best[match(ris, best[, "row"]), "col"]), nrow(x))]
    }
    x
}

bgoldst2 = function(x) 
{
    ans = as.data.frame(fillDFNAsWithNearestInRow(x, seq_along(x)))
    names(ans) = names(x)
    ans
}

frank = function(x)
{
    x = copy(x)
    good = as.data.table(which(!is.na(x), arr.ind = TRUE))
    all = CJ(row = seq_len(nrow(x)), col = seq_len(ncol(x)))
    good$col = good$col
    good$col_src = good$col

    changes = good[all, on = c("row", "col"), roll = "nearest"][col != col_src]

    changes[, {
            set(x, i = row, j = col, value = x[[col_src]][row])
            NULL
            }, by = .(col, col_src)]
    x
}

r - 从其他变量复制缺失值

r - copy missing values from other variables

r

missing-data