用尾随行值乘以增长率填充 NA 值?
Fill NA values with the trailing row value times a growth rate?
用以前的值乘以 (1 + growth)
来填充 NA
值的好方法是什么?
df <- data.frame(
year = 0:6,
price1 = c(1.1, 2.1, 3.2, 4.8, NA, NA, NA),
price2 = c(1.1, 2.1, 3.2, NA, NA, NA, NA)
)
growth <- .02
在这种情况下,我希望 price1
中的缺失值用 4.8*1.02
、4.8*1.02^2
和 4.8*1.02^3
填充。同样,我希望 price2
中的缺失值用 3.2*1.02
、3.2*1.02^2
、3.2*1.02^3
和 3.2*1.02^4
.
填充
我试过了,但我认为它需要设置为以某种方式重复(apply
?):
library(dplyr)
df %>%
mutate(price1 = ifelse(is.na(price1),
lag(price1) * (1 + growth), price1
))
我还没有将 dplyr
用于任何其他用途,因此将不胜感激来自基础 R 或 plyr
或类似内容的内容。
你可以试试这个功能
test <- function(x,n) {
if (!is.na(df[x,n])) return (df[x,n])
else return (test(x-1,n)*(1+growth))
}
a=1:nrow(df)
lapply(a, FUN=function(i) test(i,2))
unlist(lapply(a, FUN=function(i) test(i,2)))
[1] 1.100000 2.100000 3.200000 4.800000 4.896000 4.993920 5.093798
假设只有尾随 NA:
NAgrow <- function(x,growth=0.02) {
isna <- is.na(x)
lastval <- tail(x[!isna],1)
x[isna] <- lastval*(1+growth)^seq(sum(isna))
return(x)
}
如果还有内部 NA
值,这会变得有点棘手。
应用于除第一列以外的所有列:
df[-1] <- lapply(df[-1],NAgrow)
## year price1 price2
## 1 0 1.100000 1.100000
## 2 1 2.100000 2.100000
## 3 2 3.200000 3.200000
## 4 3 4.800000 3.264000
## 5 4 4.896000 3.329280
## 6 5 4.993920 3.395866
## 7 6 5.093798 3.463783
看起来 dplyr
无法处理访问新分配的滞后值。这是一个解决方案,即使 NA
位于列的中间也应该有效。
df <- apply(
df, 2, function(x){
if(sum(is.na(x)) == 0){return(x)}
## updated with optimized portion from @josilber
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
lastValIs <- 1:length(x)
lastValI[is.na(x)] <- b
x[is.na(x)] <-
sapply(which(is.na(x)), function(i){
return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
})
return(x)
})
以下基于rle
的解决方案适用于任何位置的NA并且不依赖循环来填充缺失值:
NAgrow.rle <- function(x) {
if (is.na(x[1])) stop("Can't have NA at beginning")
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
x[is.na(x)] <- ave(x[b], b, FUN=function(y) y[1]*(1+growth)^seq_along(y))
x
}
df[,-1] <- lapply(df[,-1], NAgrow.rle)
# year price1 price2
# 1 0 1.100000 1.100000
# 2 1 2.100000 2.100000
# 3 2 3.200000 3.200000
# 4 3 4.800000 3.264000
# 5 4 4.896000 3.329280
# 6 5 4.993920 3.395866
# 7 6 5.093798 3.463783
我将添加另外两个使用 for 循环的解决方案,一个在 base R 中,一个在 Rcpp 中:
NAgrow.for <- function(x) {
for (i in which(is.na(x))) {
x[i] <- x[i-1] * (1+growth)
}
x
}
library(Rcpp)
cppFunction(
"NumericVector NAgrowRcpp(NumericVector x, double growth) {
const int n = x.size();
NumericVector y(x);
for (int i=1; i < n; ++i) {
if (R_IsNA(x[i])) {
y[i] = (1.0 + growth) * y[i-1];
}
}
return y;
}")
基于rle
(crimson
和josilber.rle
)的解决方案比基于for循环(josilber.for
)的简单解决方案花费大约两倍的时间,并且正如预期的那样,Rcpp 解决方案是最快的,运行 大约 0.002 秒。
set.seed(144)
big.df <- data.frame(ID=1:100000,
price1=sample(c(1:10, NA), 100000, replace=TRUE),
price2=sample(c(1:10, NA), 100000, replace=TRUE))
crimson <- function(df) apply(df[,-1], 2, function(x){
if(sum(is.na(x)) == 0){return(x)}
## updated with optimized portion from @josilber
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
lastValIs <- 1:length(x)
lastValIs[is.na(x)] <- b
x[is.na(x)] <-
sapply(which(is.na(x)), function(i){
return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
})
return(x)
})
ggrothendieck <- function(df) {
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
lapply(df[,-1], Reduce, f = growthfun, acc = TRUE)
}
josilber.rle <- function(df) lapply(df[,-1], NAgrow.rle)
josilber.for <- function(df) lapply(df[,-1], NAgrow.for)
josilber.rcpp <- function(df) lapply(df[,-1], NAgrowRcpp, growth=growth)
library(microbenchmark)
microbenchmark(crimson(big.df), ggrothendieck(big.df), josilber.rle(big.df), josilber.for(big.df), josilber.rcpp(big.df))
# Unit: milliseconds
# expr min lq mean median uq max neval
# crimson(big.df) 98.447546 131.063713 161.494366 152.477661 183.175840 379.643222 100
# ggrothendieck(big.df) 437.015693 667.760401 822.530745 817.864707 925.974019 1607.352929 100
# josilber.rle(big.df) 59.678527 115.220519 132.874030 127.476340 151.665657 262.003756 100
# josilber.for(big.df) 21.076516 57.479169 73.860913 72.959536 84.846912 178.412591 100
# josilber.rcpp(big.df) 1.248793 1.894723 2.373469 2.190545 2.697246 5.646878 100
可以使用 Reduce
:
获得紧凑的基础 R 解决方案
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
replace(df, TRUE, lapply(df, Reduce, f = growthfun, acc = TRUE))
给予:
year price1 price2
1 0 1.100000 1.100000
2 1 2.100000 2.100000
3 2 3.200000 3.200000
4 3 4.800000 3.264000
5 4 4.896000 3.329280
6 5 4.993920 3.395866
7 6 5.093798 3.463783
注意:问题中的数据没有非尾随的 NA 值,但如果有的话,我们可以使用 na.fill
from zoo 来首先替换尾随的 NA 值具有特殊值的NA,比如NaN,寻找它而不是NA:
library(zoo)
DF <- as.data.frame(na.fill(df, c(NA, NA, NaN)))
growthfun <- function(x, y) if (is.nan(y)) (1+growth)*x else y
replace(DF, TRUE, lapply(DF, Reduce, f = growthfun, acc = TRUE))
用以前的值乘以 (1 + growth)
来填充 NA
值的好方法是什么?
df <- data.frame(
year = 0:6,
price1 = c(1.1, 2.1, 3.2, 4.8, NA, NA, NA),
price2 = c(1.1, 2.1, 3.2, NA, NA, NA, NA)
)
growth <- .02
在这种情况下,我希望 price1
中的缺失值用 4.8*1.02
、4.8*1.02^2
和 4.8*1.02^3
填充。同样,我希望 price2
中的缺失值用 3.2*1.02
、3.2*1.02^2
、3.2*1.02^3
和 3.2*1.02^4
.
我试过了,但我认为它需要设置为以某种方式重复(apply
?):
library(dplyr)
df %>%
mutate(price1 = ifelse(is.na(price1),
lag(price1) * (1 + growth), price1
))
我还没有将 dplyr
用于任何其他用途,因此将不胜感激来自基础 R 或 plyr
或类似内容的内容。
你可以试试这个功能
test <- function(x,n) {
if (!is.na(df[x,n])) return (df[x,n])
else return (test(x-1,n)*(1+growth))
}
a=1:nrow(df)
lapply(a, FUN=function(i) test(i,2))
unlist(lapply(a, FUN=function(i) test(i,2)))
[1] 1.100000 2.100000 3.200000 4.800000 4.896000 4.993920 5.093798
假设只有尾随 NA:
NAgrow <- function(x,growth=0.02) {
isna <- is.na(x)
lastval <- tail(x[!isna],1)
x[isna] <- lastval*(1+growth)^seq(sum(isna))
return(x)
}
如果还有内部 NA
值,这会变得有点棘手。
应用于除第一列以外的所有列:
df[-1] <- lapply(df[-1],NAgrow)
## year price1 price2
## 1 0 1.100000 1.100000
## 2 1 2.100000 2.100000
## 3 2 3.200000 3.200000
## 4 3 4.800000 3.264000
## 5 4 4.896000 3.329280
## 6 5 4.993920 3.395866
## 7 6 5.093798 3.463783
看起来 dplyr
无法处理访问新分配的滞后值。这是一个解决方案,即使 NA
位于列的中间也应该有效。
df <- apply(
df, 2, function(x){
if(sum(is.na(x)) == 0){return(x)}
## updated with optimized portion from @josilber
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
lastValIs <- 1:length(x)
lastValI[is.na(x)] <- b
x[is.na(x)] <-
sapply(which(is.na(x)), function(i){
return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
})
return(x)
})
以下基于rle
的解决方案适用于任何位置的NA并且不依赖循环来填充缺失值:
NAgrow.rle <- function(x) {
if (is.na(x[1])) stop("Can't have NA at beginning")
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
x[is.na(x)] <- ave(x[b], b, FUN=function(y) y[1]*(1+growth)^seq_along(y))
x
}
df[,-1] <- lapply(df[,-1], NAgrow.rle)
# year price1 price2
# 1 0 1.100000 1.100000
# 2 1 2.100000 2.100000
# 3 2 3.200000 3.200000
# 4 3 4.800000 3.264000
# 5 4 4.896000 3.329280
# 6 5 4.993920 3.395866
# 7 6 5.093798 3.463783
我将添加另外两个使用 for 循环的解决方案,一个在 base R 中,一个在 Rcpp 中:
NAgrow.for <- function(x) {
for (i in which(is.na(x))) {
x[i] <- x[i-1] * (1+growth)
}
x
}
library(Rcpp)
cppFunction(
"NumericVector NAgrowRcpp(NumericVector x, double growth) {
const int n = x.size();
NumericVector y(x);
for (int i=1; i < n; ++i) {
if (R_IsNA(x[i])) {
y[i] = (1.0 + growth) * y[i-1];
}
}
return y;
}")
基于rle
(crimson
和josilber.rle
)的解决方案比基于for循环(josilber.for
)的简单解决方案花费大约两倍的时间,并且正如预期的那样,Rcpp 解决方案是最快的,运行 大约 0.002 秒。
set.seed(144)
big.df <- data.frame(ID=1:100000,
price1=sample(c(1:10, NA), 100000, replace=TRUE),
price2=sample(c(1:10, NA), 100000, replace=TRUE))
crimson <- function(df) apply(df[,-1], 2, function(x){
if(sum(is.na(x)) == 0){return(x)}
## updated with optimized portion from @josilber
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
lastValIs <- 1:length(x)
lastValIs[is.na(x)] <- b
x[is.na(x)] <-
sapply(which(is.na(x)), function(i){
return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
})
return(x)
})
ggrothendieck <- function(df) {
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
lapply(df[,-1], Reduce, f = growthfun, acc = TRUE)
}
josilber.rle <- function(df) lapply(df[,-1], NAgrow.rle)
josilber.for <- function(df) lapply(df[,-1], NAgrow.for)
josilber.rcpp <- function(df) lapply(df[,-1], NAgrowRcpp, growth=growth)
library(microbenchmark)
microbenchmark(crimson(big.df), ggrothendieck(big.df), josilber.rle(big.df), josilber.for(big.df), josilber.rcpp(big.df))
# Unit: milliseconds
# expr min lq mean median uq max neval
# crimson(big.df) 98.447546 131.063713 161.494366 152.477661 183.175840 379.643222 100
# ggrothendieck(big.df) 437.015693 667.760401 822.530745 817.864707 925.974019 1607.352929 100
# josilber.rle(big.df) 59.678527 115.220519 132.874030 127.476340 151.665657 262.003756 100
# josilber.for(big.df) 21.076516 57.479169 73.860913 72.959536 84.846912 178.412591 100
# josilber.rcpp(big.df) 1.248793 1.894723 2.373469 2.190545 2.697246 5.646878 100
可以使用 Reduce
:
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
replace(df, TRUE, lapply(df, Reduce, f = growthfun, acc = TRUE))
给予:
year price1 price2
1 0 1.100000 1.100000
2 1 2.100000 2.100000
3 2 3.200000 3.200000
4 3 4.800000 3.264000
5 4 4.896000 3.329280
6 5 4.993920 3.395866
7 6 5.093798 3.463783
注意:问题中的数据没有非尾随的 NA 值,但如果有的话,我们可以使用 na.fill
from zoo 来首先替换尾随的 NA 值具有特殊值的NA,比如NaN,寻找它而不是NA:
library(zoo)
DF <- as.data.frame(na.fill(df, c(NA, NA, NaN)))
growthfun <- function(x, y) if (is.nan(y)) (1+growth)*x else y
replace(DF, TRUE, lapply(DF, Reduce, f = growthfun, acc = TRUE))