用行平均值替换多列中一定数量的 NA
Replace a set number of NA's across multiple columns with a row mean
在处理调查数据时,我需要对不同测量问题中偶尔出现的 NA 进行一些插补。我想用行平均值替换 NA,但前提是该行中的 NA 不超过 2 个。如果有任何关于如何实现这一目标的建议,我将不胜感激。
我试过使用下面的代码(带有一些示例数据),但这不允许我控制每行中可接受的 NA 的数量。
# Example survey responses: 8 respondents (rows) by 7 items (columns).
# Row 1 has three NAs, row 3 has two, row 4 has one and row 7 is entirely NA.
data <- data.frame(
  var_1 = c(2, 3, NA, 2, 3, 5, NA, 3),
  var_2 = c(3, 4, 2, 3, 1, 3, NA, 2),
  var_3 = c(NA, 3, 2, 5, 4, 2, NA, 2),
  var_4 = c(NA, 3, NA, 4, 1, 2, NA, 1),
  var_5 = c(NA, 4, 2, 3, 2, 3, NA, 2),
  var_6 = c(4, 2, 1, NA, 2, 5, NA, 3),
  var_7 = c(3, 2, 1, 2, 2, 4, NA, 3)
)
# First attempt: fill every NA with the mean of the non-missing values in its
# row. This imputes unconditionally -- it places no limit on how many NAs a
# row may contain, which is the shortcoming the question is about.
# Requires dplyr to be attached for `%>%`, `mutate()` and `across()`.
data_fix <- data %>%
  mutate(
    across(
      var_1:var_7,
      # rowMeans() is evaluated on the original `data`, so columns that were
      # already filled do not influence the means used for later columns.
      ~ ifelse(is.na(.x), rowMeans(data, na.rm = TRUE), .x)
    )
  )
这里有一个选项:
# Largest number of NAs a row may contain and still be imputed.
max_na <- 2

# Logical matrix marking the missing cells. It is computed once, before `data`
# is modified, and reused for the row counts and for the replacement below.
na_mask <- is.na(data)

# Mean of the observed values in each row. Rows with more than `max_na` NAs
# get NA instead, so their missing cells stay missing.
rmeans <- rowMeans(data, na.rm = TRUE)
rmeans[rowSums(na_mask) > max_na] <- NA

# data2 has the shape of `data`, with every cell holding its row's mean
# (rmeans has one entry per row and fills each column in turn).
data2 <- data
data2[] <- rmeans

# Overwrite only the missing cells with the matching row mean.
# Note: this modifies `data` in place.
data[na_mask] <- data2[na_mask]
data
#> var_1 var_2 var_3 var_4 var_5 var_6 var_7
#> 1 2.0 3 NA NA NA 4.000000 3
#> 2 3.0 4 3 3.0 4 2.000000 2
#> 3 1.6 2 2 1.6 2 1.000000 1
#> 4 2.0 3 5 4.0 3 3.166667 2
#> 5 3.0 1 4 1.0 2 2.000000 2
#> 6 5.0 3 2 2.0 3 5.000000 4
#> 7 NA NA NA NA NA NA NA
#> 8 3.0 2 2 1.0 2 3.000000 3
由 reprex package (v2.0.1) 于 2022-04-20 创建
一个 dplyr 解决方案:
library(dplyr)
# Largest number of NAs a row may contain and still be imputed.
max_na <- 2

# Impute with dplyr: add two helper columns (per-row NA count and per-row mean
# of the observed values), fill the NAs of qualifying rows, then drop the
# helpers again. The result is printed, not assigned.
data %>%
  mutate(
    # `.` is the data frame piped in, so the helpers are always computed from
    # the same rows that are being modified.
    na_count = rowSums(is.na(.)),
    row_mean = rowMeans(., na.rm = TRUE),
    across(
      var_1:var_7,
      ~ ifelse(is.na(.x) & na_count <= max_na, row_mean, .x)
    )
  ) %>%
  select(-c(na_count, row_mean))
var_1 var_2 var_3 var_4 var_5 var_6 var_7
1 2.0 3 NA NA NA 4.000000 3
2 3.0 4 3 3.0 4 2.000000 2
3 1.6 2 2 1.6 2 1.000000 1
4 2.0 3 5 4.0 3 3.166667 2
5 3.0 1 4 1.0 2 2.000000 2
6 5.0 3 2 2.0 3 5.000000 4
7 NA NA NA NA NA NA NA
8 3.0 2 2 1.0 2 3.000000 3
其 base R 等价写法:
# Largest number of NAs a row may contain and still be imputed.
max_na <- 2

# Per-row NA count and per-row mean of the observed values.
na.count <- rowSums(is.na(data))
row.mean <- rowMeans(data, na.rm = TRUE)

# Work on a copy so `data` itself is left untouched. Assigning into data2[]
# replaces every column while keeping the data-frame structure.
data2 <- data
data2[] <- lapply(
  data,
  # Element i of a column lines up with element i of na.count and row.mean.
  \(x) ifelse(is.na(x) & na.count <= max_na, row.mean, x)
)
data2
在处理调查数据时,我需要对不同测量问题中偶尔出现的 NA 进行一些插补。我想用行平均值替换 NA,但前提是该行中的 NA 不超过 2 个。如果有任何关于如何实现这一目标的建议,我将不胜感激。
我试过使用下面的代码(带有一些示例数据),但这不允许我控制每行中可接受的 NA 的数量。
# Example survey responses: 8 respondents (rows) by 7 items (columns).
# Row 1 has three NAs, row 3 has two, row 4 has one and row 7 is entirely NA.
data <- data.frame(
  var_1 = c(2, 3, NA, 2, 3, 5, NA, 3),
  var_2 = c(3, 4, 2, 3, 1, 3, NA, 2),
  var_3 = c(NA, 3, 2, 5, 4, 2, NA, 2),
  var_4 = c(NA, 3, NA, 4, 1, 2, NA, 1),
  var_5 = c(NA, 4, 2, 3, 2, 3, NA, 2),
  var_6 = c(4, 2, 1, NA, 2, 5, NA, 3),
  var_7 = c(3, 2, 1, 2, 2, 4, NA, 3)
)
# First attempt: fill every NA with the mean of the non-missing values in its
# row. This imputes unconditionally -- it places no limit on how many NAs a
# row may contain, which is the shortcoming the question is about.
# Requires dplyr to be attached for `%>%`, `mutate()` and `across()`.
data_fix <- data %>%
  mutate(
    across(
      var_1:var_7,
      # rowMeans() is evaluated on the original `data`, so columns that were
      # already filled do not influence the means used for later columns.
      ~ ifelse(is.na(.x), rowMeans(data, na.rm = TRUE), .x)
    )
  )
这里有一个选项:
# Largest number of NAs a row may contain and still be imputed.
max_na <- 2

# Logical matrix marking the missing cells. It is computed once, before `data`
# is modified, and reused for the row counts and for the replacement below.
na_mask <- is.na(data)

# Mean of the observed values in each row. Rows with more than `max_na` NAs
# get NA instead, so their missing cells stay missing.
rmeans <- rowMeans(data, na.rm = TRUE)
rmeans[rowSums(na_mask) > max_na] <- NA

# data2 has the shape of `data`, with every cell holding its row's mean
# (rmeans has one entry per row and fills each column in turn).
data2 <- data
data2[] <- rmeans

# Overwrite only the missing cells with the matching row mean.
# Note: this modifies `data` in place.
data[na_mask] <- data2[na_mask]
data
#> var_1 var_2 var_3 var_4 var_5 var_6 var_7
#> 1 2.0 3 NA NA NA 4.000000 3
#> 2 3.0 4 3 3.0 4 2.000000 2
#> 3 1.6 2 2 1.6 2 1.000000 1
#> 4 2.0 3 5 4.0 3 3.166667 2
#> 5 3.0 1 4 1.0 2 2.000000 2
#> 6 5.0 3 2 2.0 3 5.000000 4
#> 7 NA NA NA NA NA NA NA
#> 8 3.0 2 2 1.0 2 3.000000 3
由 reprex package (v2.0.1) 于 2022-04-20 创建
一个 dplyr 解决方案:
library(dplyr)
# Largest number of NAs a row may contain and still be imputed.
max_na <- 2

# Impute with dplyr: add two helper columns (per-row NA count and per-row mean
# of the observed values), fill the NAs of qualifying rows, then drop the
# helpers again. The result is printed, not assigned.
data %>%
  mutate(
    # `.` is the data frame piped in, so the helpers are always computed from
    # the same rows that are being modified.
    na_count = rowSums(is.na(.)),
    row_mean = rowMeans(., na.rm = TRUE),
    across(
      var_1:var_7,
      ~ ifelse(is.na(.x) & na_count <= max_na, row_mean, .x)
    )
  ) %>%
  select(-c(na_count, row_mean))
var_1 var_2 var_3 var_4 var_5 var_6 var_7
1 2.0 3 NA NA NA 4.000000 3
2 3.0 4 3 3.0 4 2.000000 2
3 1.6 2 2 1.6 2 1.000000 1
4 2.0 3 5 4.0 3 3.166667 2
5 3.0 1 4 1.0 2 2.000000 2
6 5.0 3 2 2.0 3 5.000000 4
7 NA NA NA NA NA NA NA
8 3.0 2 2 1.0 2 3.000000 3
其 base R 等价写法:
# Largest number of NAs a row may contain and still be imputed.
max_na <- 2

# Per-row NA count and per-row mean of the observed values.
na.count <- rowSums(is.na(data))
row.mean <- rowMeans(data, na.rm = TRUE)

# Work on a copy so `data` itself is left untouched. Assigning into data2[]
# replaces every column while keeping the data-frame structure.
data2 <- data
data2[] <- lapply(
  data,
  # Element i of a column lines up with element i of na.count and row.mean.
  \(x) ifelse(is.na(x) & na.count <= max_na, row.mean, x)
)
data2