用分组变量中存在的值替换 NA
Substitute NAs by value present in grouped variables
示例数据
# Example input: four observations of two fruits, with NAs that should be
# filled from other rows of the same fruit.
# The data frame is built in a single call so that no temporary vectors are
# created in (or removed from) the calling environment.
dd <- data.frame(
  fruit = c("Orange", "Banana", "Orange", "Banana"),
  flavour = c("Bitter", NA, NA, "Sweet"),
  geo = c(NA, NA, NA, "France"),
  value = c(1, NA, NA, 4)
)
我想按 'fruit' 对数据集进行分组,并在所有变量中用同一分组内已有的值替换缺失值。
期望的输出
# Expected result: every NA is replaced by the value known for the same
# fruit; geo stays NA for Orange because no Orange row has a geo value.
# The data frame is built in a single call so that no temporary vectors are
# created in (or removed from) the calling environment.
dd2 <- data.frame(
  fruit = c("Orange", "Banana", "Orange", "Banana"),
  flavour = c("Bitter", "Sweet", "Bitter", "Sweet"),
  geo = c(NA, "France", NA, "France"),
  value = c(1, 4, 1, 4)
)
代码尝试
# Asker's attempt: group the rows by fruit, then summarise all other columns.
# NOTE(review): summarise_all() is called without any summary function, so
# this attempt is not expected to run as written; it is kept unchanged
# because the surrounding text discusses it.
tt <- dd %>%
group_by(fruit) %>%
summarise_all()
我们需要在 group_by 之后使用 mutate_all(summarise/summarise_all 用于将多行汇总为单行)。使用 zoo 中的 na.locf,对每一列用相邻的非 NA 元素填充 NA 元素。
library(zoo)
library(dplyr)
# Within each fruit group, fill the NAs of every column from the nearest
# known value: the inner na.locf() carries the last observation forward and
# the outer one (fromLast = TRUE) then back-fills any leading NAs.
# na.rm = FALSE keeps NAs that cannot be filled instead of dropping them, so
# each column keeps its length.
# A formula lambda is used instead of the deprecated funs() wrapper.
dd %>%
  group_by(fruit) %>%
  mutate_all(
    ~ na.locf(
      na.locf(., na.rm = FALSE),
      fromLast = TRUE,
      na.rm = FALSE
    )
  )
# A tibble: 4 x 4
# Groups: fruit [2]
# fruit flavour geo value
# <fct> <fct> <fct> <dbl>
#1 Orange Bitter <NA> 1
#2 Banana Sweet France 4
#3 Orange Bitter <NA> 1
#4 Banana Sweet France 4
我创建了一个函数来执行此操作。它依赖于 dplyr::coalesce。
只有当组内所有已知值都相等时,它才会填充缺失值。也可以指定已知值的最小数量(个数 n 或占比 p),以防止仅凭单个已知值就填满组内的所有缺失值。
library(dplyr)
# Fill the NAs of a vector with its single known value.
#
# x            Vector that may contain NAs.
# min_known_n  Optional minimum number of non-NA values required before any
#              filling happens (NULL disables the check).
# min_known_p  Optional minimum proportion of non-NA values required before
#              any filling happens (NULL disables the check).
#
# Returns `x` unchanged unless it contains NA, passes both thresholds, and
# its non-NA values are all identical; in that case the NAs are replaced by
# that value via dplyr::coalesce().
fill_missing <- function(x, min_known_n = NULL, min_known_p = NULL) {
  # Nothing to do when there is no NA to fill.
  if (!(NA %in% x)) {
    return(x)
  }

  known <- na.omit(x)
  n_known <- length(known)

  # Too few known values (absolute count): leave the vector alone.
  if (!is.null(min_known_n) && n_known < min_known_n) {
    return(x)
  }

  # Too few known values (share of all values): leave the vector alone.
  if (!is.null(min_known_p) && n_known / length(x) < min_known_p) {
    return(x)
  }

  # Only fill when the known values agree on exactly one value.
  if (length(unique(known)) != 1) {
    return(x)
  }

  dplyr::coalesce(x, known[1])
}
# Run fill_missing() over every non-grouping column, one fruit group at a
# time; the (still grouped) result is printed.
dd %>%
  group_by(fruit) %>%
  mutate_all(.funs = fill_missing)
示例数据
# Example input: four observations of two fruits, with NAs that should be
# filled from other rows of the same fruit.
# The data frame is built in a single call so that no temporary vectors are
# created in (or removed from) the calling environment.
dd <- data.frame(
  fruit = c("Orange", "Banana", "Orange", "Banana"),
  flavour = c("Bitter", NA, NA, "Sweet"),
  geo = c(NA, NA, NA, "France"),
  value = c(1, NA, NA, 4)
)
我想按 'fruit' 对数据集进行分组,并在所有变量中用同一分组内已有的值替换缺失值。
期望的输出
# Expected result: every NA is replaced by the value known for the same
# fruit; geo stays NA for Orange because no Orange row has a geo value.
# The data frame is built in a single call so that no temporary vectors are
# created in (or removed from) the calling environment.
dd2 <- data.frame(
  fruit = c("Orange", "Banana", "Orange", "Banana"),
  flavour = c("Bitter", "Sweet", "Bitter", "Sweet"),
  geo = c(NA, "France", NA, "France"),
  value = c(1, 4, 1, 4)
)
代码尝试
# Asker's attempt: group the rows by fruit, then summarise all other columns.
# NOTE(review): summarise_all() is called without any summary function, so
# this attempt is not expected to run as written; it is kept unchanged
# because the surrounding text discusses it.
tt <- dd %>%
group_by(fruit) %>%
summarise_all()
我们需要在 group_by 之后使用 mutate_all(summarise/summarise_all 用于将多行汇总为单行)。使用 zoo 中的 na.locf,对每一列用相邻的非 NA 元素填充 NA 元素。
library(zoo)
library(dplyr)
# Within each fruit group, fill the NAs of every column from the nearest
# known value: the inner na.locf() carries the last observation forward and
# the outer one (fromLast = TRUE) then back-fills any leading NAs.
# na.rm = FALSE keeps NAs that cannot be filled instead of dropping them, so
# each column keeps its length.
# A formula lambda is used instead of the deprecated funs() wrapper.
dd %>%
  group_by(fruit) %>%
  mutate_all(
    ~ na.locf(
      na.locf(., na.rm = FALSE),
      fromLast = TRUE,
      na.rm = FALSE
    )
  )
# A tibble: 4 x 4
# Groups: fruit [2]
# fruit flavour geo value
# <fct> <fct> <fct> <dbl>
#1 Orange Bitter <NA> 1
#2 Banana Sweet France 4
#3 Orange Bitter <NA> 1
#4 Banana Sweet France 4
我创建了一个函数来执行此操作。它依赖于 dplyr::coalesce。只有当组内所有已知值都相等时,它才会填充缺失值。也可以指定已知值的最小数量(个数 n 或占比 p),以防止仅凭单个已知值就填满组内的所有缺失值。
library(dplyr)
# Fill the NAs of a vector with its single known value.
#
# x            Vector that may contain NAs.
# min_known_n  Optional minimum number of non-NA values required before any
#              filling happens (NULL disables the check).
# min_known_p  Optional minimum proportion of non-NA values required before
#              any filling happens (NULL disables the check).
#
# Returns `x` unchanged unless it contains NA, passes both thresholds, and
# its non-NA values are all identical; in that case the NAs are replaced by
# that value via dplyr::coalesce().
fill_missing <- function(x, min_known_n = NULL, min_known_p = NULL) {
  # Nothing to do when there is no NA to fill.
  if (!(NA %in% x)) {
    return(x)
  }

  known <- na.omit(x)
  n_known <- length(known)

  # Too few known values (absolute count): leave the vector alone.
  if (!is.null(min_known_n) && n_known < min_known_n) {
    return(x)
  }

  # Too few known values (share of all values): leave the vector alone.
  if (!is.null(min_known_p) && n_known / length(x) < min_known_p) {
    return(x)
  }

  # Only fill when the known values agree on exactly one value.
  if (length(unique(known)) != 1) {
    return(x)
  }

  dplyr::coalesce(x, known[1])
}
# Run fill_missing() over every non-grouping column, one fruit group at a
# time; the (still grouped) result is printed.
dd %>%
  group_by(fruit) %>%
  mutate_all(.funs = fill_missing)