R:通过匹配另一个数据帧的列来插入和推断数据帧中的值
R: Inter- and extrapolate values in dataframe by matching column of another dataframe
我有两个数据框:
df1 <- data.frame(levels = c(1, 3, 5, 7, 9),
values = c(2.2, 5.3, 7.9, 5.4, 8.7))
df2 <- data.frame(levels = c(1, 4, 8, 12)) # other columns not necessary
我希望根据 df1$levels 中的数字将 df1$values 插入到 df2$levels 中。所以有一些插值,但也外推到第二个数据帧中的 12 级。
也许,根据两个数据集的 levels
的 union
做一个 complete
,然后使用 na.approx
(来自 zoo
)和 rule = 2
(用于外推)
library(dplyr)
library(tidyr)
library(zoo)
df1 <- df1 %>%
complete(levels = union(levels, df2$levels)) %>%
mutate(values = na.approx(values, maxgap = Inf, rule = 2))
-输出
df1
# A tibble: 8 x 2
# levels values
# <dbl> <dbl>
#1 1 2.2
#2 3 5.3
#3 4 6.6
#4 5 7.9
#5 7 5.4
#6 8 7.05
#7 9 8.7
#8 12 8.7
我确定这可以压缩,这是我很久以前写的一些代码,它处理必须在有序向量的 head/tail 处进行外推:
# Function to interpolate / extrapolate: l_estimate => function()
l_estimate <- function(vec){
# Function to perform-linear interpolation and return vector:
# .l_interp_vec => function()
.l_interp_vec <- function(vec){
interped_values <-
approx(x = vec, method = "linear", ties = "constant", n = length(vec))$y
return(ifelse(is.na(vec), interped_values[is.na(vec)], vec))
}
# Store a vector denoting the indices of the vector that are NA:
# na_idx => integer vector
na_idx <- is.na(vec)
# Store a scalar of min row where x isn't NA: min_non_na => integer vector
min_non_na <- min(which(!(na_idx)))
# Store a scalar of max row where x isn't NA: max_non_na => integer vector
max_non_na <- max(which(!(na_idx)))
# Store scalar of the number of rows needed to impute prior
# to first NA value: ru_lower => integer vector
ru_lower <- ifelse(min_non_na > 1, min_non_na - 1, min_non_na)
# Store scalar of the number of rows needed to impute after
# the last non-NA value: ru_upper => integer vector
ru_upper <- ifelse(
max_non_na == length(vec),
length(vec) - 1,
(length(vec) - (max_non_na + 1))
)
# Store a vector of the ramp to function: ramp_up => numeric vector
ramp_up <- as.numeric(
cumsum(rep(vec[min_non_na]/(min_non_na), ru_lower))
)
# Apply the interpolation function on vector: y => numeric vector
y <- as.numeric(.l_interp_vec(as.numeric(vec[min_non_na:max_non_na])))
# Create a vector that combines the ramp_up vector
# and y if the first NA is at row 1:
if(length(ramp_up) >= 1 & max_non_na != length(vec)){
# Create a vector interpolations if there are
# multiple NA values after the last value: lower_l_int => numeric vector
lower_l_int <- as.numeric(
cumsum(rep(mean(diff(c(ramp_up, y))), ru_upper+1)) +
as.numeric(vec[max_non_na])
)
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(c(ramp_up, y, lower_l_int))
}else if(length(ramp_up) > 1 & max_non_na == length(vec)){
# Store the linear interpolations in a vector: z => numeric
z <- as.numeric(c(ramp_up, y))
}else if(min_non_na == 1 & max_non_na != length(vec)){
# Create a vector interpolations if there are
# multiple NA values after the last value: lower_l_int => numeric vector
lower_l_int <- as.numeric(
cumsum(rep(mean(diff(c(ramp_up, y))), ru_upper+1)) +
as.numeric(vec[max_non_na])
)
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(c(y, lower_l_int))
}else{
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(y)
}
# Interpolate between points in x, return new x:
return(as.numeric(ifelse(is.na(vec), z, vec)))
}
# Apply the function on ordered data: data.frame => stdout(console)
transform(full_df[order(full_df$levels),],
values = l_estimate(values)
)
我有两个数据框:
df1 <- data.frame(levels = c(1, 3, 5, 7, 9),
values = c(2.2, 5.3, 7.9, 5.4, 8.7))
df2 <- data.frame(levels = c(1, 4, 8, 12)) # other columns not necessary
我希望根据 df1$levels 中的数字将 df1$values 插入到 df2$levels 中。所以有一些插值,但也外推到第二个数据帧中的 12 级。
也许,根据两个数据集的 levels
的 union
做一个 complete
,然后使用 na.approx
(来自 zoo
)和 rule = 2
(用于外推)
library(dplyr)
library(tidyr)
library(zoo)
df1 <- df1 %>%
complete(levels = union(levels, df2$levels)) %>%
mutate(values = na.approx(values, maxgap = Inf, rule = 2))
-输出
df1
# A tibble: 8 x 2
# levels values
# <dbl> <dbl>
#1 1 2.2
#2 3 5.3
#3 4 6.6
#4 5 7.9
#5 7 5.4
#6 8 7.05
#7 9 8.7
#8 12 8.7
我确定这可以压缩,这是我很久以前写的一些代码,它处理必须在有序向量的 head/tail 处进行外推:
# Function to interpolate / extrapolate: l_estimate => function()
l_estimate <- function(vec){
# Function to perform-linear interpolation and return vector:
# .l_interp_vec => function()
.l_interp_vec <- function(vec){
interped_values <-
approx(x = vec, method = "linear", ties = "constant", n = length(vec))$y
return(ifelse(is.na(vec), interped_values[is.na(vec)], vec))
}
# Store a vector denoting the indices of the vector that are NA:
# na_idx => integer vector
na_idx <- is.na(vec)
# Store a scalar of min row where x isn't NA: min_non_na => integer vector
min_non_na <- min(which(!(na_idx)))
# Store a scalar of max row where x isn't NA: max_non_na => integer vector
max_non_na <- max(which(!(na_idx)))
# Store scalar of the number of rows needed to impute prior
# to first NA value: ru_lower => integer vector
ru_lower <- ifelse(min_non_na > 1, min_non_na - 1, min_non_na)
# Store scalar of the number of rows needed to impute after
# the last non-NA value: ru_upper => integer vector
ru_upper <- ifelse(
max_non_na == length(vec),
length(vec) - 1,
(length(vec) - (max_non_na + 1))
)
# Store a vector of the ramp to function: ramp_up => numeric vector
ramp_up <- as.numeric(
cumsum(rep(vec[min_non_na]/(min_non_na), ru_lower))
)
# Apply the interpolation function on vector: y => numeric vector
y <- as.numeric(.l_interp_vec(as.numeric(vec[min_non_na:max_non_na])))
# Create a vector that combines the ramp_up vector
# and y if the first NA is at row 1:
if(length(ramp_up) >= 1 & max_non_na != length(vec)){
# Create a vector interpolations if there are
# multiple NA values after the last value: lower_l_int => numeric vector
lower_l_int <- as.numeric(
cumsum(rep(mean(diff(c(ramp_up, y))), ru_upper+1)) +
as.numeric(vec[max_non_na])
)
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(c(ramp_up, y, lower_l_int))
}else if(length(ramp_up) > 1 & max_non_na == length(vec)){
# Store the linear interpolations in a vector: z => numeric
z <- as.numeric(c(ramp_up, y))
}else if(min_non_na == 1 & max_non_na != length(vec)){
# Create a vector interpolations if there are
# multiple NA values after the last value: lower_l_int => numeric vector
lower_l_int <- as.numeric(
cumsum(rep(mean(diff(c(ramp_up, y))), ru_upper+1)) +
as.numeric(vec[max_non_na])
)
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(c(y, lower_l_int))
}else{
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(y)
}
# Interpolate between points in x, return new x:
return(as.numeric(ifelse(is.na(vec), z, vec)))
}
# Apply the function on ordered data: data.frame => stdout(console)
transform(full_df[order(full_df$levels),],
values = l_estimate(values)
)