在 R 中计算带有可变权重和缺失值的加权平均值
Calculate weighted average with varying weights and missing values in R
我正在尝试计算 3 列的加权平均值,其中权重是根据每行缺失值的数量决定的。
一个可重现的例子:
# Some simulated data
library(data.table)

N <- 50
df <- data.table(
  int_1 = runif(N, 1000, 5000),
  int_2 = runif(N, 1000, 5000),
  int_3 = runif(N, 1000, 5000)
)
# Blank out floor(N / 10) random entries in every column except the first.
# On a data.table, `df[-1]` drops the first ROW rather than the first column,
# so the target columns are selected by name and updated by reference.
na_cols <- names(df)[-1]
df[, (na_cols) := lapply(.SD, function(x) {
  x[sample(seq_along(x), floor(N / 10))] <- NA
  x
}), .SDcols = na_cols]
# Weights for the weighted average; they are flexible and supplied by the user.
# a, b and c weight the first, second and third value respectively.
a <- 5
b <- 3
c <- 2
# Denominator applied to the weights (here equal to a + b + c).
i <- 10
# Weighted average of x, y and z for a single row. The weights a, b, c and the
# denominator i are not arguments; they are looked up outside the function.
wa_func <- function(x,y,z){
# NOTE(review): this condition is TRUE unless ALL three values are NA, so a row
# with only some values missing still takes this branch, and the arithmetic
# below then evaluates to NA.
if(!(is.na(x) & is.na(y) & is.na(z))){
wt_avg <- (a/i)* x + (b/i) * y + (c/i) * z
# NOTE(review): the two branches below are only tried when all three values are
# NA, yet both require x to be present, so neither can ever run.
# x and y present, z missing: weight c is removed from the denominator.
} else if(!is.na(x) & !is.na(y) & is.na(z)){
wt_avg <- (a/(i-c))* x + (b/(i-c)) * y
# Only x present: weights b and c are removed from the denominator.
} else if(!is.na(x) & is.na(y) & is.na(z)){
wt_avg <- a/(i-(b+c))* x
}
# NOTE(review): no branch assigns wt_avg when all three values are NA.
return(wt_avg)
}
# Call wa_func once per row on the three columns and store the results in the
# new column weighted_avg_int.
df[, weighted_avg_int := mapply(wa_func,int_1,int_2,int_3)]
但是对于一行中的任何缺失值,该函数都输出 NA。我在这里错过了什么?
提前致谢。
您需要更改函数中第一个 `if` 的条件:把 `&` 改为 `|`,这样只有在三个值都不缺失时才会进入第一个分支:
# Weighted average of three scalar values that skips missing ones.
#
# The weights a, b, c and the denominator i are read from the calling
# environment (they are not arguments). The weight of every missing value is
# removed from the denominator, so the remaining weights are rescaled.
#
# Args:
#   x, y, z: single numeric values, any of which may be NA.
# Returns:
#   A single numeric value, or NA_real_ when x, y and z are all missing.
wa_func <- function(x, y, z) {
  # `||` / `&&` are the scalar operators meant for `if` conditions.
  if (!(is.na(x) || is.na(y) || is.na(z))) {
    # Nothing is missing: use the full weights.
    wt_avg <- (a / i) * x + (b / i) * y + (c / i) * z
  } else if (!is.na(x) && !is.na(y) && is.na(z)) {
    # Only z is missing.
    wt_avg <- (a / (i - c)) * x + (b / (i - c)) * y
  } else if (!is.na(x) && is.na(y) && is.na(z)) {
    # Only x is present.
    wt_avg <- a / (i - (b + c)) * x
  } else {
    # Every other pattern (e.g. only y missing, or x missing) used to leave
    # wt_avg unassigned. Apply the same rule generically instead.
    # Note: c(...) still calls base::c even though a numeric `c` exists,
    # because R skips non-function objects when resolving a call.
    values <- c(x, y, z)
    weights <- c(a, b, c)
    present <- !is.na(values)
    wt_avg <- if (any(present)) {
      sum(weights[present] * values[present]) / (i - sum(weights[!present]))
    } else {
      NA_real_
    }
  }
  wt_avg
}
您还可以使用 `Vectorize()` 包装您的函数来改进它,从而不再需要 `mapply`:
# Weighted average of x, y and z that skips missing values. The function is
# wrapped in Vectorize(), so it can be called on whole columns directly.
#
# Args:
#   x, y, z: numeric vectors; each element may be NA.
#   weights: numeric vector of length 3 holding the weights of x, y and z.
#     Defaults to the weights of the example (5, 3, 2). It is passed through
#     unchanged, i.e. it is not vectorised over.
# Returns:
#   A numeric vector with one weighted average per element. The weights of the
#   non-missing values are rescaled to sum to one; the result is NA where x, y
#   and z are all missing.
wa_func <- Vectorize(
  function(x, y, z, weights = c(5, 3, 2)) {
    values <- c(x, y, z)
    present <- !is.na(values)
    # Every branch yields a length-one numeric. A branch that yielded NULL
    # would make the vectorised result a list instead of a numeric vector.
    if (any(present)) {
      sum(weights[present] * values[present]) / sum(weights[present])
    } else {
      NA_real_
    }
    # no need for return()
  },
  vectorize.args = c("x", "y", "z")
)
我正在尝试计算 3 列的加权平均值,其中权重是根据每行缺失值的数量决定的。
一个可重现的例子:
# Some simulated data
library(data.table)

N <- 50
df <- data.table(
  int_1 = runif(N, 1000, 5000),
  int_2 = runif(N, 1000, 5000),
  int_3 = runif(N, 1000, 5000)
)
# Blank out floor(N / 10) random entries in every column except the first.
# On a data.table, `df[-1]` drops the first ROW rather than the first column,
# so the target columns are selected by name and updated by reference.
na_cols <- names(df)[-1]
df[, (na_cols) := lapply(.SD, function(x) {
  x[sample(seq_along(x), floor(N / 10))] <- NA
  x
}), .SDcols = na_cols]
# Weights for the weighted average; they are flexible and supplied by the user.
# a, b and c weight the first, second and third value respectively.
a <- 5
b <- 3
c <- 2
# Denominator applied to the weights (here equal to a + b + c).
i <- 10
# Weighted average of x, y and z for a single row. The weights a, b, c and the
# denominator i are not arguments; they are looked up outside the function.
wa_func <- function(x,y,z){
# NOTE(review): this condition is TRUE unless ALL three values are NA, so a row
# with only some values missing still takes this branch, and the arithmetic
# below then evaluates to NA.
if(!(is.na(x) & is.na(y) & is.na(z))){
wt_avg <- (a/i)* x + (b/i) * y + (c/i) * z
# NOTE(review): the two branches below are only tried when all three values are
# NA, yet both require x to be present, so neither can ever run.
# x and y present, z missing: weight c is removed from the denominator.
} else if(!is.na(x) & !is.na(y) & is.na(z)){
wt_avg <- (a/(i-c))* x + (b/(i-c)) * y
# Only x present: weights b and c are removed from the denominator.
} else if(!is.na(x) & is.na(y) & is.na(z)){
wt_avg <- a/(i-(b+c))* x
}
# NOTE(review): no branch assigns wt_avg when all three values are NA.
return(wt_avg)
}
# Call wa_func once per row on the three columns and store the results in the
# new column weighted_avg_int.
df[, weighted_avg_int := mapply(wa_func,int_1,int_2,int_3)]
但是对于一行中的任何缺失值,该函数都输出 NA。我在这里错过了什么?
提前致谢。
您需要更改函数中第一个 `if` 的条件:把 `&` 改为 `|`,这样只有在三个值都不缺失时才会进入第一个分支:
# Weighted average of three scalar values that skips missing ones.
#
# The weights a, b, c and the denominator i are read from the calling
# environment (they are not arguments). The weight of every missing value is
# removed from the denominator, so the remaining weights are rescaled.
#
# Args:
#   x, y, z: single numeric values, any of which may be NA.
# Returns:
#   A single numeric value, or NA_real_ when x, y and z are all missing.
wa_func <- function(x, y, z) {
  # `||` / `&&` are the scalar operators meant for `if` conditions.
  if (!(is.na(x) || is.na(y) || is.na(z))) {
    # Nothing is missing: use the full weights.
    wt_avg <- (a / i) * x + (b / i) * y + (c / i) * z
  } else if (!is.na(x) && !is.na(y) && is.na(z)) {
    # Only z is missing.
    wt_avg <- (a / (i - c)) * x + (b / (i - c)) * y
  } else if (!is.na(x) && is.na(y) && is.na(z)) {
    # Only x is present.
    wt_avg <- a / (i - (b + c)) * x
  } else {
    # Every other pattern (e.g. only y missing, or x missing) used to leave
    # wt_avg unassigned. Apply the same rule generically instead.
    # Note: c(...) still calls base::c even though a numeric `c` exists,
    # because R skips non-function objects when resolving a call.
    values <- c(x, y, z)
    weights <- c(a, b, c)
    present <- !is.na(values)
    wt_avg <- if (any(present)) {
      sum(weights[present] * values[present]) / (i - sum(weights[!present]))
    } else {
      NA_real_
    }
  }
  wt_avg
}
您还可以使用 `Vectorize()` 包装您的函数来改进它,从而不再需要 `mapply`:
# Weighted average of x, y and z that skips missing values. The function is
# wrapped in Vectorize(), so it can be called on whole columns directly.
#
# Args:
#   x, y, z: numeric vectors; each element may be NA.
#   weights: numeric vector of length 3 holding the weights of x, y and z.
#     Defaults to the weights of the example (5, 3, 2). It is passed through
#     unchanged, i.e. it is not vectorised over.
# Returns:
#   A numeric vector with one weighted average per element. The weights of the
#   non-missing values are rescaled to sum to one; the result is NA where x, y
#   and z are all missing.
wa_func <- Vectorize(
  function(x, y, z, weights = c(5, 3, 2)) {
    values <- c(x, y, z)
    present <- !is.na(values)
    # Every branch yields a length-one numeric. A branch that yielded NULL
    # would make the vectorised result a list instead of a numeric vector.
    if (any(present)) {
      sum(weights[present] * values[present]) / sum(weights[present])
    } else {
      NA_real_
    }
    # no need for return()
  },
  vectorize.args = c("x", "y", "z")
)