r - 从其他变量复制缺失值
r - copy missing values from other variables
简单的问题,但我不知道如何执行以下操作。这是我的数据:
ID Time1 Time2 Time3 Time4
01 23 23 NA NA
02 21 21 21 NA
03 22 22 25 NA
04 29 29 20 NA
05 NA NA 15 22
06 NA NA 11 NA
现在,我想用其他变量中可用的数据替换缺失值 (NA)。重要的是,我需要 r 将 'closest' 的值取到缺失的数据点。例如,对于 ID 5,Time1 和 Time2 应该是“15”(而不是“22”)。
像这样:
ID Time1 Time2 Time3 Time4
01 23 23 23 23
02 21 21 21 21
03 22 22 25 25
04 29 29 20 20
05 15 15 15 22
06 11 11 11 11
我试过 ifelse 语句,但没有成功。
谢谢!
这比看起来要难得多。我构建了一个一次在一列上工作的解决方案,采用所有时间列索引和当前列索引之间的绝对距离的 pmin()
,使用 na.rm=T
参数剥离 NA。然后可以使用结果使用索引矩阵对原始时间列进行索引,然后可以将其分配给目标 data.frame.
中的当前列索引
这种设计的一个优点是它对行进行了完全矢量化。换句话说,它不会一次迭代一行。这对于行数非常多的输入来说可能是一个优势。另一方面,该解决方案确实涉及构建并行所有时间列(timemat
、nacols
和 off
)的矩阵,这对于大输入来说可能代价高昂。它基本上是在牺牲内存来节省 CPU.
我添加了几行来测试 OP 样本未涵盖的其他案例 data.frame;特别是 (1) 一个全 NA 行,和 (2) 一个在 NA 值的两边有候选非 NA 值的行。
输入:
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 NA NA
## 2 02 21 21 21 NA
## 3 03 22 22 25 NA
## 4 04 NA NA NA NA
## 5 05 29 29 20 NA
## 6 06 NA NA 15 22
## 7 07 NA NA 11 NA
## 8 08 1 NA NA 2
解决方案:
ris <- seq_len(nrow(df));
cis <- grep('^Time',names(df));
timemat <- as.matrix(df[cis]);
nacols <- as.data.frame(ifelse(is.na(timemat),NA,col(timemat)));
nacols;
## Time1 Time2 Time3 Time4
## 1 1 2 NA NA
## 2 1 2 3 NA
## 3 1 2 3 NA
## 4 NA NA NA NA
## 5 1 2 3 NA
## 6 NA NA 3 4
## 7 NA NA 3 NA
## 8 1 NA NA 4
for (ci in seq_len(ncol(timemat))) {
off <- abs(nacols-ci);
best <- which(off==do.call(pmin,c(off,na.rm=T)),arr.ind=T);
df[cis[ci]] <- timemat[matrix(c(ris,best[match(ris,best[,'row']),'col']),nrow(df))];
};
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 23 23
## 2 02 21 21 21 21
## 3 03 22 22 25 25
## 4 04 NA NA NA NA
## 5 05 29 29 20 20
## 6 06 15 15 15 22
## 7 07 11 11 11 11
## 8 08 1 1 2 2
Rcpp解决方案:
library(Rcpp);
cppFunction('
IntegerMatrix fillDFNAsWithNearestInRow(DataFrame df, IntegerVector cis ) {
IntegerMatrix res(df.nrows(),cis.size());
if (df.nrows()==0 || cis.size()==0) return res;
IntegerVector cis0 = clone(cis); for (int cisi = 0; cisi < cis0.size(); ++cisi) --cis0[cisi]; // correct from R 1-based to Rcpp 0-based
for (int cisi = 0; cisi < cis0.size(); ++cisi) {
IntegerVector colCur = df[cis0[cisi]];
for (int ri = 0; ri < colCur.size(); ++ri) {
if (!IntegerVector::is_na(colCur[ri])) {
res(ri,cisi) = colCur[ri];
continue;
}
int leftOk;
int rightOk;
IntegerVector colLeft;
IntegerVector colRight;
bool set = false; // assumption
for (int off = 1; (leftOk = cisi-off>=0, rightOk = cisi+off<cis0.size(), leftOk ) || rightOk; ++off) {
if (leftOk && (colLeft = df[cis0[cisi-off]], !IntegerVector::is_na(colLeft[ri]))) {
res(ri,cisi) = colLeft[ri];
set = true;
break;
} else if (rightOk && (colRight = df[cis0[cisi+off]], !IntegerVector::is_na(colRight[ri]))) {
res(ri,cisi) = colRight[ri];
set = true;
break;
}
}
if (!set) res(ri,cisi) = NA_INTEGER;
}
}
return res;
}
');
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
cis <- grep('^Time',names(df));
df[cis] <- fillDFNAsWithNearestInRow(df,cis);
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 23 23
## 2 02 21 21 21 21
## 3 03 22 22 25 25
## 4 04 NA NA NA NA
## 5 05 29 29 20 20
## 6 06 15 15 15 22
## 7 07 11 11 11 11
## 8 08 1 1 2 2
使用 data.table 的滚动连接和 set
:
library(data.table)
good = as.data.table( which(!is.na(df[-1]), arr.ind = TRUE) )
all = CJ(row = seq(nrow(df)), col = seq(2L, ncol(df)))
good$col = good$col + 1L
good$col_src = good$col
changes = good[all, on = c("row", "col"), roll="nearest"][ col != col_src ]
changes[, {
set(df, i = row, j = col, value = df[[ col_src ]][row])
NULL
}, by=.(col,col_src)]
# based on input from bgoldst's answer
ID 1 2 3 4
1: 01 23 23 23 23
2: 02 21 21 21 21
3: 03 22 22 25 25
4: 04 NA NA NA NA
5: 05 29 29 20 20
6: 06 15 15 15 22
7: 07 11 11 11 11
8: 08 1 1 2 2
我们找到所有条目进行切换,然后参照set
进行修改。我不确定 roll="nearest"
如何处理关系,但我确定可以对其进行调整。
这是一个简单的解决方案:
x <-read.table(text="ID Time1 Time2 Time3 Time4
01 23 23 NA NA
02 21 21 21 NA
03 22 22 25 NA
04 29 29 20 NA
05 NA NA 15 22
06 NA NA 11 NA", header=TRUE)
x <- as.matrix(x[,-1])
dofill <- function(r){
PREV <- c(NA, suppressWarnings(head(r, -1)))
NEXT <- c(tail(r,-1), NA)
r[is.na(r)] <- PREV[is.na(r)]
r[is.na(r)] <- NEXT[is.na(r)]
r
}
rlefill <- function(r){
r[is.na(r)] <- "NA"
rle1 <- rle(r)
rle1$values <- dofill(as.numeric(rle1$values))
inverse.rle(rle1)
}
t(apply(x, 1, rlefill))
dofill
只是将所有 NA 替换为先前的值,并将所有剩余的 NA 替换为下一个值。
rlefill
需要将 NA 序列转换为 "one big NA"。
当然,如果你有更多的时间点,你可能需要类似...
cis <- grep('^Time',names(df))
timemat <- as.matrix(df[cis]);
...即从数据框中提取相关部分的更通用的解决方案。
(现在我意识到这不完全是你问的 - 我的解决方案总是使用前面的值,如果它可用,即使下面的值在时间上更接近。它对你的例子没有影响数据集,但它可能会对实际数据产生影响。)
又一次尝试。尽可能分解:(1)从左到右循环一次,向前携带最后一个非 NA 值,并且记录被替换的每个 NA 的非 NA 在哪里,(2)从右到左再次循环( a) 向后替换携带非 NA 的 NA 和 (b) 比较替换每个 NA 的非 NA 与当前非 NA 的距离,然后保留或替换。尽管有两个显式循环,计算涉及 length == nrow(x)
.
的向量
ff = function(x)
{
taken_from = lapply(seq_along(x), rep_len, nrow(x))
nas = lapply(x, is.na)
#loop left -> right
# carry forward non-NAs and record which non-NA replaced NA
last_nona = !nas[[1L]]
for(j in 2:length(x)) {
i = which(nas[[j]] & last_nona)
x[[j]][i] = x[[j - 1L]][i]
taken_from[[j]][i] = taken_from[[j - 1L]][i]
last_nona = !is.na(x[[j]])
}
#loop right -> left
#if NA and not replace carry the previous non-NA backward
#else compare which non-NA is nearer and replace appropriately
last_nona = !nas[[length(x)]]
for(j in (length(x) - 1L):1L) {
i1 = which(nas[[j]] & last_nona)
i = i1[(j - taken_from[[j]][i1]) > (taken_from[[(j + 1L)]][i1] - j)]
ii = i1[j == taken_from[[j]][i1]]
taken_from[[j]][i] = taken_from[[j + 1L]][i]
taken_from[[j]][ii] = taken_from[[j + 1L]][ii]
x[[j]][i] = x[[j + 1L]][i]
x[[j]][ii] = x[[j + 1L]][ii]
last_nona = !is.na(x[[j]])
}
return(x)
}
使用 bgoldst 的数据:
ff(df[-1L])
# Time1 Time2 Time3 Time4
#1 23 23 23 23
#2 21 21 21 21
#3 22 22 25 25
#4 NA NA NA NA
#5 29 29 20 20
#6 15 15 15 22
#7 11 11 11 11
#8 1 1 2 2
以及必要的基准测试:
set.seed(911)
DAT = as.data.frame(matrix(sample(c(NA, 0:10), 1e7, TRUE), 1e6, 10))
system.time({ ansff = ff(DAT) })
# user system elapsed
# 0.82 0.38 1.75
system.time({ ansbgoldst1 = bgoldst1(DAT) })
# user system elapsed
# 20.96 7.53 42.04
system.time({ ansbgoldst2 = bgoldst2(DAT) })
# user system elapsed
# 0.97 0.25 1.64
sf1 = system.time({ ansfrank = frank(DAT) }); sf2 = system.time( copy(DAT) )
sf1 - sf2
# user system elapsed
# 5.84 1.46 8.59
all.equal(ansff, ansbgoldst1)
#[1] TRUE
all.equal(ansbgoldst1, ansbgoldst2)
#[1] TRUE
all.equal(ansbgoldst2, ansfrank)
#[1] TRUE
函数:
bgoldst1 = function(x)
{
ris = seq_len(nrow(x))
xm = as.matrix(x)
nacols = as.data.frame(lapply(seq_along(x), function(i) { x[[i]][!is.na(x[[i]])] = i; x[[i]] }))
for(ci in seq_along(x)) {
off = abs(nacols - ci)
best = which(off == do.call(pmin, c(off, na.rm = TRUE)), arr.ind = TRUE)
x[ci] = xm[matrix(c(ris, best[match(ris, best[, "row"]), "col"]), nrow(x))]
}
x
}
bgoldst2 = function(x)
{
ans = as.data.frame(fillDFNAsWithNearestInRow(x, seq_along(x)))
names(ans) = names(x)
ans
}
frank = function(x)
{
x = copy(x)
good = as.data.table(which(!is.na(x), arr.ind = TRUE))
all = CJ(row = seq_len(nrow(x)), col = seq_len(ncol(x)))
good$col = good$col
good$col_src = good$col
changes = good[all, on = c("row", "col"), roll = "nearest"][col != col_src]
changes[, {
set(x, i = row, j = col, value = x[[col_src]][row])
NULL
}, by = .(col, col_src)]
x
}
简单的问题,但我不知道如何执行以下操作。这是我的数据:
ID Time1 Time2 Time3 Time4
01 23 23 NA NA
02 21 21 21 NA
03 22 22 25 NA
04 29 29 20 NA
05 NA NA 15 22
06 NA NA 11 NA
现在,我想用其他变量中可用的数据替换缺失值 (NA)。重要的是,我需要 r 将 'closest' 的值取到缺失的数据点。例如,对于 ID 5,Time1 和 Time2 应该是“15”(而不是“22”)。
像这样:
ID Time1 Time2 Time3 Time4
01 23 23 23 23
02 21 21 21 21
03 22 22 25 25
04 29 29 20 20
05 15 15 15 22
06 11 11 11 11
我试过 ifelse 语句,但没有成功。
谢谢!
这比看起来要难得多。我构建了一个一次在一列上工作的解决方案,采用所有时间列索引和当前列索引之间的绝对距离的 pmin()
,使用 na.rm=T
参数剥离 NA。然后可以使用结果使用索引矩阵对原始时间列进行索引,然后可以将其分配给目标 data.frame.
这种设计的一个优点是它对行进行了完全矢量化。换句话说,它不会一次迭代一行。这对于行数非常多的输入来说可能是一个优势。另一方面,该解决方案确实涉及构建并行所有时间列(timemat
、nacols
和 off
)的矩阵,这对于大输入来说可能代价高昂。它基本上是在牺牲内存来节省 CPU.
我添加了几行来测试 OP 样本未涵盖的其他案例 data.frame;特别是 (1) 一个全 NA 行,和 (2) 一个在 NA 值的两边有候选非 NA 值的行。
输入:
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 NA NA
## 2 02 21 21 21 NA
## 3 03 22 22 25 NA
## 4 04 NA NA NA NA
## 5 05 29 29 20 NA
## 6 06 NA NA 15 22
## 7 07 NA NA 11 NA
## 8 08 1 NA NA 2
解决方案:
ris <- seq_len(nrow(df));
cis <- grep('^Time',names(df));
timemat <- as.matrix(df[cis]);
nacols <- as.data.frame(ifelse(is.na(timemat),NA,col(timemat)));
nacols;
## Time1 Time2 Time3 Time4
## 1 1 2 NA NA
## 2 1 2 3 NA
## 3 1 2 3 NA
## 4 NA NA NA NA
## 5 1 2 3 NA
## 6 NA NA 3 4
## 7 NA NA 3 NA
## 8 1 NA NA 4
for (ci in seq_len(ncol(timemat))) {
off <- abs(nacols-ci);
best <- which(off==do.call(pmin,c(off,na.rm=T)),arr.ind=T);
df[cis[ci]] <- timemat[matrix(c(ris,best[match(ris,best[,'row']),'col']),nrow(df))];
};
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 23 23
## 2 02 21 21 21 21
## 3 03 22 22 25 25
## 4 04 NA NA NA NA
## 5 05 29 29 20 20
## 6 06 15 15 15 22
## 7 07 11 11 11 11
## 8 08 1 1 2 2
Rcpp解决方案:
library(Rcpp);
cppFunction('
IntegerMatrix fillDFNAsWithNearestInRow(DataFrame df, IntegerVector cis ) {
IntegerMatrix res(df.nrows(),cis.size());
if (df.nrows()==0 || cis.size()==0) return res;
IntegerVector cis0 = clone(cis); for (int cisi = 0; cisi < cis0.size(); ++cisi) --cis0[cisi]; // correct from R 1-based to Rcpp 0-based
for (int cisi = 0; cisi < cis0.size(); ++cisi) {
IntegerVector colCur = df[cis0[cisi]];
for (int ri = 0; ri < colCur.size(); ++ri) {
if (!IntegerVector::is_na(colCur[ri])) {
res(ri,cisi) = colCur[ri];
continue;
}
int leftOk;
int rightOk;
IntegerVector colLeft;
IntegerVector colRight;
bool set = false; // assumption
for (int off = 1; (leftOk = cisi-off>=0, rightOk = cisi+off<cis0.size(), leftOk ) || rightOk; ++off) {
if (leftOk && (colLeft = df[cis0[cisi-off]], !IntegerVector::is_na(colLeft[ri]))) {
res(ri,cisi) = colLeft[ri];
set = true;
break;
} else if (rightOk && (colRight = df[cis0[cisi+off]], !IntegerVector::is_na(colRight[ri]))) {
res(ri,cisi) = colRight[ri];
set = true;
break;
}
}
if (!set) res(ri,cisi) = NA_INTEGER;
}
}
return res;
}
');
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
cis <- grep('^Time',names(df));
df[cis] <- fillDFNAsWithNearestInRow(df,cis);
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 23 23
## 2 02 21 21 21 21
## 3 03 22 22 25 25
## 4 04 NA NA NA NA
## 5 05 29 29 20 20
## 6 06 15 15 15 22
## 7 07 11 11 11 11
## 8 08 1 1 2 2
使用 data.table 的滚动连接和 set
:
library(data.table)
good = as.data.table( which(!is.na(df[-1]), arr.ind = TRUE) )
all = CJ(row = seq(nrow(df)), col = seq(2L, ncol(df)))
good$col = good$col + 1L
good$col_src = good$col
changes = good[all, on = c("row", "col"), roll="nearest"][ col != col_src ]
changes[, {
set(df, i = row, j = col, value = df[[ col_src ]][row])
NULL
}, by=.(col,col_src)]
# based on input from bgoldst's answer
ID 1 2 3 4
1: 01 23 23 23 23
2: 02 21 21 21 21
3: 03 22 22 25 25
4: 04 NA NA NA NA
5: 05 29 29 20 20
6: 06 15 15 15 22
7: 07 11 11 11 11
8: 08 1 1 2 2
我们找到所有条目进行切换,然后参照set
进行修改。我不确定 roll="nearest"
如何处理关系,但我确定可以对其进行调整。
这是一个简单的解决方案:
x <-read.table(text="ID Time1 Time2 Time3 Time4
01 23 23 NA NA
02 21 21 21 NA
03 22 22 25 NA
04 29 29 20 NA
05 NA NA 15 22
06 NA NA 11 NA", header=TRUE)
x <- as.matrix(x[,-1])
dofill <- function(r){
PREV <- c(NA, suppressWarnings(head(r, -1)))
NEXT <- c(tail(r,-1), NA)
r[is.na(r)] <- PREV[is.na(r)]
r[is.na(r)] <- NEXT[is.na(r)]
r
}
rlefill <- function(r){
r[is.na(r)] <- "NA"
rle1 <- rle(r)
rle1$values <- dofill(as.numeric(rle1$values))
inverse.rle(rle1)
}
t(apply(x, 1, rlefill))
dofill
只是将所有 NA 替换为先前的值,并将所有剩余的 NA 替换为下一个值。
rlefill
需要将 NA 序列转换为 "one big NA"。
当然,如果你有更多的时间点,你可能需要类似...
cis <- grep('^Time',names(df))
timemat <- as.matrix(df[cis]);
...即从数据框中提取相关部分的更通用的解决方案。
(现在我意识到这不完全是你问的 - 我的解决方案总是使用前面的值,如果它可用,即使下面的值在时间上更接近。它对你的例子没有影响数据集,但它可能会对实际数据产生影响。)
又一次尝试。尽可能分解:(1)从左到右循环一次,向前携带最后一个非 NA 值,并且记录被替换的每个 NA 的非 NA 在哪里,(2)从右到左再次循环( a) 向后替换携带非 NA 的 NA 和 (b) 比较替换每个 NA 的非 NA 与当前非 NA 的距离,然后保留或替换。尽管有两个显式循环,计算涉及 length == nrow(x)
.
ff = function(x)
{
taken_from = lapply(seq_along(x), rep_len, nrow(x))
nas = lapply(x, is.na)
#loop left -> right
# carry forward non-NAs and record which non-NA replaced NA
last_nona = !nas[[1L]]
for(j in 2:length(x)) {
i = which(nas[[j]] & last_nona)
x[[j]][i] = x[[j - 1L]][i]
taken_from[[j]][i] = taken_from[[j - 1L]][i]
last_nona = !is.na(x[[j]])
}
#loop right -> left
#if NA and not replace carry the previous non-NA backward
#else compare which non-NA is nearer and replace appropriately
last_nona = !nas[[length(x)]]
for(j in (length(x) - 1L):1L) {
i1 = which(nas[[j]] & last_nona)
i = i1[(j - taken_from[[j]][i1]) > (taken_from[[(j + 1L)]][i1] - j)]
ii = i1[j == taken_from[[j]][i1]]
taken_from[[j]][i] = taken_from[[j + 1L]][i]
taken_from[[j]][ii] = taken_from[[j + 1L]][ii]
x[[j]][i] = x[[j + 1L]][i]
x[[j]][ii] = x[[j + 1L]][ii]
last_nona = !is.na(x[[j]])
}
return(x)
}
使用 bgoldst 的数据:
ff(df[-1L])
# Time1 Time2 Time3 Time4
#1 23 23 23 23
#2 21 21 21 21
#3 22 22 25 25
#4 NA NA NA NA
#5 29 29 20 20
#6 15 15 15 22
#7 11 11 11 11
#8 1 1 2 2
以及必要的基准测试:
set.seed(911)
DAT = as.data.frame(matrix(sample(c(NA, 0:10), 1e7, TRUE), 1e6, 10))
system.time({ ansff = ff(DAT) })
# user system elapsed
# 0.82 0.38 1.75
system.time({ ansbgoldst1 = bgoldst1(DAT) })
# user system elapsed
# 20.96 7.53 42.04
system.time({ ansbgoldst2 = bgoldst2(DAT) })
# user system elapsed
# 0.97 0.25 1.64
sf1 = system.time({ ansfrank = frank(DAT) }); sf2 = system.time( copy(DAT) )
sf1 - sf2
# user system elapsed
# 5.84 1.46 8.59
all.equal(ansff, ansbgoldst1)
#[1] TRUE
all.equal(ansbgoldst1, ansbgoldst2)
#[1] TRUE
all.equal(ansbgoldst2, ansfrank)
#[1] TRUE
函数:
bgoldst1 = function(x)
{
ris = seq_len(nrow(x))
xm = as.matrix(x)
nacols = as.data.frame(lapply(seq_along(x), function(i) { x[[i]][!is.na(x[[i]])] = i; x[[i]] }))
for(ci in seq_along(x)) {
off = abs(nacols - ci)
best = which(off == do.call(pmin, c(off, na.rm = TRUE)), arr.ind = TRUE)
x[ci] = xm[matrix(c(ris, best[match(ris, best[, "row"]), "col"]), nrow(x))]
}
x
}
bgoldst2 = function(x)
{
ans = as.data.frame(fillDFNAsWithNearestInRow(x, seq_along(x)))
names(ans) = names(x)
ans
}
frank = function(x)
{
x = copy(x)
good = as.data.table(which(!is.na(x), arr.ind = TRUE))
all = CJ(row = seq_len(nrow(x)), col = seq_len(ncol(x)))
good$col = good$col
good$col_src = good$col
changes = good[all, on = c("row", "col"), roll = "nearest"][col != col_src]
changes[, {
set(x, i = row, j = col, value = x[[col_src]][row])
NULL
}, by = .(col, col_src)]
x
}