如果 value 等于 NA,则根据 R 中的份额进行估计
If value is equal to NA, then estimate based on share in R
考虑以下两个数据框:
# Example data with gaps: two regional values are missing (NA), while the
# REGSUM row carries the known total for the industry.
df <- data.frame(
  REGION = c("REG01", "REG02", "REG03", "REGSUM"),
  INDUSTRY = rep("INDU01", 4),
  VALUE = c(NA, 10, NA, 30)
)
和:
# Reference data without gaps; its values supply the shares used to split
# the missing amount in df.
df2 <- data.frame(
  REGION = c("REG01", "REG02", "REG03", "REGSUM"),
  INDUSTRY = rep("INDU01", 4),
  VALUE = c(5, 15, 20, 40)
)
我想做以下计算:如果 df 中的值等于 NA,那么我想根据 df2 中的份额进行估算。因为我知道 df 中的总和,所以我知道必须在 df 中两个为 NA 的元素之间分配的值是:
df[REGSUM,INDU01] - df[REG02,INDU01] = 30 - 10 = 20
那么它应该将df2
中相同的元素除以NA的元素之和:
df2_share[REG01,INDU01] = 5 / (5 + 20) = 0.2
df2_share[REG03,INDU01] = 20 / (5 + 20) = 0.8
此份额用于估计 df 中的 NA。所以我最终会得到以下数据框:
REGION INDUSTRY VALUE
1 REG01 INDU01 0.2 * 20 = 4
2 REG02 INDU01 10
3 REG03 INDU01 0.8 * 20 = 16
4 REGSUM INDU01 30
我可以在 R 中做到这一点吗(我的数据框中有很多地区和行业)。
这是一个方法。
对 df
中既不是 NA
也不是 "REGSUM"
的值求和。使用此值计算分配给 NA
值的总数。然后获取 df2
中与 NA
条目对应的值并计算要分配的比例。
# Amount still to be distributed: the REGSUM total minus the known regions.
is_total <- df$REGION == "REGSUM"
known_sum <- sum(df$VALUE[!is_total], na.rm = TRUE)
remaining <- df$VALUE[is_total] - known_sum
# The shares come from the df2 rows at the positions where df is missing.
missing_rows <- which(is.na(df$VALUE))
weights <- df2$VALUE[missing_rows]
df$VALUE[missing_rows] <- weights / sum(weights) * remaining
df
# REGION INDUSTRY VALUE
#1 REG01 INDU01 4
#2 REG02 INDU01 10
#3 REG03 INDU01 16
#4 REGSUM INDU01 30
下面的函数将上面的代码推广到包含多个行业的 data.frame。它的工作原理是按行业拆分输入的 data.frame,并用 lapply 对每个拆分出的子数据框应用前面的代码(写成一个函数)。最后它将这些子数据框重新合并,并返回给调用者。
# Fill the NA values of `x`, industry by industry, using shares taken from `y`.
#
# Within each industry the amount to distribute is the value of the total row
# (the row whose region equals `regsum`) minus the sum of the known regional
# values. That amount is split among the NA regions in proportion to the
# values `y` holds for the same regions of the same industry.
#
# Arguments:
#   x            data.frame containing the NA values to estimate.
#   y            data.frame providing the reference values for the shares.
#   region_col   name of the region column (present in both x and y).
#   industry_col name of the industry column (present in both x and y).
#   value_col    name of the value column (present in both x and y).
#   regsum       label in `region_col` that marks the per-industry total row.
#
# Returns `x` with the NA regional values filled in; rows come back grouped
# by industry and row names are reset.
assign_na_values <- function(x, y,
                             region_col = "REGION",
                             industry_col = "INDUSTRY",
                             value_col = "VALUE",
                             regsum = "REGSUM") {
  # Nothing to split or fill for an empty input.
  if (nrow(x) == 0L) {
    return(x)
  }

  fill_one <- function(x, y, region_col, value_col, regsum) {
    is_region <- x[[region_col]] != regsum
    if (sum(!is_region, na.rm = TRUE) != 1L) {
      stop("each industry needs exactly one '", regsum, "' row", call. = FALSE)
    }
    known_total <- sum(x[[value_col]][is_region], na.rm = TRUE)
    to_assign <- x[[value_col]][!is_region] - known_total

    # Only regional rows are estimated; the total row is never overwritten.
    na <- is.na(x[[value_col]]) & is_region
    # Align y to x by region label rather than relying on identical row order.
    pos <- match(x[[region_col]][na], y[[region_col]])
    if (anyNA(pos)) {
      stop("`y` lacks a region that is NA in `x`", call. = FALSE)
    }
    numer <- y[[value_col]][pos]
    denom <- sum(numer)
    x[[value_col]][na] <- numer / denom * to_assign
    x
  }

  # drop = TRUE skips unused factor levels, which would yield empty groups.
  sp_x <- split(x, x[[industry_col]], drop = TRUE)
  sp_y <- split(y, y[[industry_col]], drop = TRUE)
  res <- lapply(names(sp_x), function(ind) {
    # Pair the groups by industry name, not by their position in the lists.
    if (is.null(sp_y[[ind]])) {
      stop("industry '", ind, "' not found in `y`", call. = FALSE)
    }
    fill_one(sp_x[[ind]], sp_y[[ind]], region_col, value_col, regsum)
  })
  res <- do.call(rbind, res)
  row.names(res) <- NULL
  res
}
# Fill the gaps of df using the shares from df2.
assign_na_values(x = df, y = df2)
# REGION INDUSTRY VALUE
#1 REG01 INDU01 4
#2 REG02 INDU01 10
#3 REG03 INDU01 16
#4 REGSUM INDU01 30
#5 REG01 INDU02 30
#6 REG02 INDU02 6
#7 REG03 INDU02 4
#8 REGSUM INDU02 40
新测试数据(上面的输出是用这组包含两个行业的数据得到的)
# Two-industry test data: df has the gaps, df2 provides the reference values.
region_labels <- c("REG01", "REG02", "REG03", "REGSUM")
industry_labels <- c("INDU01", "INDU02")
df <- data.frame(
  REGION = rep(region_labels, times = 2),
  INDUSTRY = rep(industry_labels, each = 4),
  VALUE = c(NA, 10, NA, 30, 30, NA, NA, 40)
)
df2 <- data.frame(
  REGION = rep(region_labels, times = 2),
  INDUSTRY = rep(industry_labels, each = 4),
  VALUE = c(5, 15, 20, 40, 10, 30, 20, 60)
)
这是另一个 base R 解决方案
# Base R alternative. NOTE: this handles a single industry only, because idx
# must point at exactly one REGSUM row.
# REGION lives inside df, so it has to be referenced as df$REGION here; only
# the expressions inside transform() can use bare column names.
idx <- which(df$REGION == "REGSUM")
df <- transform(
  df,
  VALUE = replace(
    VALUE,
    is.na(VALUE),
    # prop.table() turns the df2 values of the missing rows into shares,
    # which are then scaled by the total that is still undistributed.
    prop.table(df2$VALUE[is.na(VALUE)]) *
      (VALUE[idx] - sum(VALUE[-idx], na.rm = TRUE))
  )
)
这给出了
REGION INDUSTRY VALUE
1 REG01 INDU01 4
2 REG02 INDU01 10
3 REG03 INDU01 16
4 REGSUM INDU01 30
如果有多个 'INDUSTRY',我们可以先连接两个数据框,再使用分组操作
library(dplyr)
# Join the reference values onto df, then, per industry, replace each NA with
# its df2-based share of that industry's undistributed total.
# The total row is located by its "REGSUM" label instead of assuming it is
# the last row of the group.
df %>%
  left_join(df2, by = c("REGION", "INDUSTRY")) %>%
  group_by(INDUSTRY) %>%
  transmute(
    REGION, INDUSTRY,
    VALUE = case_when(
      is.na(VALUE.x) ~
        VALUE.y / sum(VALUE.y[is.na(VALUE.x)]) *
          (VALUE.x[REGION == "REGSUM"] -
            sum(VALUE.x[REGION != "REGSUM"], na.rm = TRUE)),
      TRUE ~ VALUE.x
    )
  ) %>%
  ungroup()
-输出
# A tibble: 4 x 3
# REGION INDUSTRY VALUE
# <chr> <chr> <dbl>
#1 REG01 INDU01 4
#2 REG02 INDU01 10
#3 REG03 INDU01 16
#4 REGSUM INDU01 30
考虑以下两个数据框:
# Example data with gaps: two regional values are missing (NA), while the
# REGSUM row carries the known total for the industry.
df <- data.frame(
  REGION = c("REG01", "REG02", "REG03", "REGSUM"),
  INDUSTRY = rep("INDU01", 4),
  VALUE = c(NA, 10, NA, 30)
)
和:
# Reference data without gaps; its values supply the shares used to split
# the missing amount in df.
df2 <- data.frame(
  REGION = c("REG01", "REG02", "REG03", "REGSUM"),
  INDUSTRY = rep("INDU01", 4),
  VALUE = c(5, 15, 20, 40)
)
我想做以下计算:如果 df 中的值等于 NA,那么我想根据 df2 中的份额进行估算。因为我知道 df 中的总和,所以我知道必须在 df 中两个为 NA 的元素之间分配的值是:
df[REGSUM,INDU01] - df[REG02,INDU01] = 30 - 10 = 20
那么它应该将df2
中相同的元素除以NA的元素之和:
df2_share[REG01,INDU01] = 5 / (5 + 20) = 0.2
df2_share[REG03,INDU01] = 20 / (5 + 20) = 0.8
此份额用于估计 df 中的 NA。所以我最终会得到以下数据框:
REGION INDUSTRY VALUE
1 REG01 INDU01 0.2 * 20 = 4
2 REG02 INDU01 10
3 REG03 INDU01 0.8 * 20 = 16
4 REGSUM INDU01 30
我可以在 R 中做到这一点吗(我的数据框中有很多地区和行业)。
这是一个方法。
对 df
中既不是 NA
也不是 "REGSUM"
的值求和。使用此值计算分配给 NA
值的总数。然后获取 df2
中与 NA
条目对应的值并计算要分配的比例。
# Amount still to be distributed: the REGSUM total minus the known regions.
is_total <- df$REGION == "REGSUM"
known_sum <- sum(df$VALUE[!is_total], na.rm = TRUE)
remaining <- df$VALUE[is_total] - known_sum
# The shares come from the df2 rows at the positions where df is missing.
missing_rows <- which(is.na(df$VALUE))
weights <- df2$VALUE[missing_rows]
df$VALUE[missing_rows] <- weights / sum(weights) * remaining
df
# REGION INDUSTRY VALUE
#1 REG01 INDU01 4
#2 REG02 INDU01 10
#3 REG03 INDU01 16
#4 REGSUM INDU01 30
下面的函数将上面的代码推广到包含多个行业的 data.frame。它的工作原理是按行业拆分输入的 data.frame,并用 lapply 对每个拆分出的子数据框应用前面的代码(写成一个函数)。最后它将这些子数据框重新合并,并返回给调用者。
# Fill the NA values of `x`, industry by industry, using shares taken from `y`.
#
# Within each industry the amount to distribute is the value of the total row
# (the row whose region equals `regsum`) minus the sum of the known regional
# values. That amount is split among the NA regions in proportion to the
# values `y` holds for the same regions of the same industry.
#
# Arguments:
#   x            data.frame containing the NA values to estimate.
#   y            data.frame providing the reference values for the shares.
#   region_col   name of the region column (present in both x and y).
#   industry_col name of the industry column (present in both x and y).
#   value_col    name of the value column (present in both x and y).
#   regsum       label in `region_col` that marks the per-industry total row.
#
# Returns `x` with the NA regional values filled in; rows come back grouped
# by industry and row names are reset.
assign_na_values <- function(x, y,
                             region_col = "REGION",
                             industry_col = "INDUSTRY",
                             value_col = "VALUE",
                             regsum = "REGSUM") {
  # Nothing to split or fill for an empty input.
  if (nrow(x) == 0L) {
    return(x)
  }

  fill_one <- function(x, y, region_col, value_col, regsum) {
    is_region <- x[[region_col]] != regsum
    if (sum(!is_region, na.rm = TRUE) != 1L) {
      stop("each industry needs exactly one '", regsum, "' row", call. = FALSE)
    }
    known_total <- sum(x[[value_col]][is_region], na.rm = TRUE)
    to_assign <- x[[value_col]][!is_region] - known_total

    # Only regional rows are estimated; the total row is never overwritten.
    na <- is.na(x[[value_col]]) & is_region
    # Align y to x by region label rather than relying on identical row order.
    pos <- match(x[[region_col]][na], y[[region_col]])
    if (anyNA(pos)) {
      stop("`y` lacks a region that is NA in `x`", call. = FALSE)
    }
    numer <- y[[value_col]][pos]
    denom <- sum(numer)
    x[[value_col]][na] <- numer / denom * to_assign
    x
  }

  # drop = TRUE skips unused factor levels, which would yield empty groups.
  sp_x <- split(x, x[[industry_col]], drop = TRUE)
  sp_y <- split(y, y[[industry_col]], drop = TRUE)
  res <- lapply(names(sp_x), function(ind) {
    # Pair the groups by industry name, not by their position in the lists.
    if (is.null(sp_y[[ind]])) {
      stop("industry '", ind, "' not found in `y`", call. = FALSE)
    }
    fill_one(sp_x[[ind]], sp_y[[ind]], region_col, value_col, regsum)
  })
  res <- do.call(rbind, res)
  row.names(res) <- NULL
  res
}
# Fill the gaps of df using the shares from df2.
assign_na_values(x = df, y = df2)
# REGION INDUSTRY VALUE
#1 REG01 INDU01 4
#2 REG02 INDU01 10
#3 REG03 INDU01 16
#4 REGSUM INDU01 30
#5 REG01 INDU02 30
#6 REG02 INDU02 6
#7 REG03 INDU02 4
#8 REGSUM INDU02 40
新测试数据(上面的输出是用这组包含两个行业的数据得到的)
# Two-industry test data: df has the gaps, df2 provides the reference values.
region_labels <- c("REG01", "REG02", "REG03", "REGSUM")
industry_labels <- c("INDU01", "INDU02")
df <- data.frame(
  REGION = rep(region_labels, times = 2),
  INDUSTRY = rep(industry_labels, each = 4),
  VALUE = c(NA, 10, NA, 30, 30, NA, NA, 40)
)
df2 <- data.frame(
  REGION = rep(region_labels, times = 2),
  INDUSTRY = rep(industry_labels, each = 4),
  VALUE = c(5, 15, 20, 40, 10, 30, 20, 60)
)
这是另一个 base R 解决方案
# Base R alternative. NOTE: this handles a single industry only, because idx
# must point at exactly one REGSUM row.
# REGION lives inside df, so it has to be referenced as df$REGION here; only
# the expressions inside transform() can use bare column names.
idx <- which(df$REGION == "REGSUM")
df <- transform(
  df,
  VALUE = replace(
    VALUE,
    is.na(VALUE),
    # prop.table() turns the df2 values of the missing rows into shares,
    # which are then scaled by the total that is still undistributed.
    prop.table(df2$VALUE[is.na(VALUE)]) *
      (VALUE[idx] - sum(VALUE[-idx], na.rm = TRUE))
  )
)
这给出了
REGION INDUSTRY VALUE
1 REG01 INDU01 4
2 REG02 INDU01 10
3 REG03 INDU01 16
4 REGSUM INDU01 30
如果有多个 'INDUSTRY',我们可以先连接两个数据框,再使用分组操作
library(dplyr)
# Join the reference values onto df, then, per industry, replace each NA with
# its df2-based share of that industry's undistributed total.
# The total row is located by its "REGSUM" label instead of assuming it is
# the last row of the group.
df %>%
  left_join(df2, by = c("REGION", "INDUSTRY")) %>%
  group_by(INDUSTRY) %>%
  transmute(
    REGION, INDUSTRY,
    VALUE = case_when(
      is.na(VALUE.x) ~
        VALUE.y / sum(VALUE.y[is.na(VALUE.x)]) *
          (VALUE.x[REGION == "REGSUM"] -
            sum(VALUE.x[REGION != "REGSUM"], na.rm = TRUE)),
      TRUE ~ VALUE.x
    )
  ) %>%
  ungroup()
-输出
# A tibble: 4 x 3
# REGION INDUSTRY VALUE
# <chr> <chr> <dbl>
#1 REG01 INDU01 4
#2 REG02 INDU01 10
#3 REG03 INDU01 16
#4 REGSUM INDU01 30