用行平均值替换多列中的一组 NA

Replace a set number of NA's across multiple columns with a row mean

在处理调查数据时,我需要对不同测量问题中偶尔出现的 NA 进行一些估算。我想用行平均值替换 NA,但前提是行中的 NA 不超过 2 个。关于如何实现这一目标的任何提示都将是惊人的。

我试过使用下面的代码(带有一些示例数据),但这不允许我控制可接受的连续 NA 的数量。

data <- data.frame(
  var_1 = c(2,3,NA,2,3,5,NA,3),
  var_2 = c(3,4,2,3,1,3,NA,2),
  var_3 = c(NA,3,2,5,4,2,NA,2),
  var_4 = c(NA,3,NA,4,1,2,NA,1),
  var_5 = c(NA,4,2,3,2,3,NA,2),
  var_6 = c(4,2,1,NA,2,5,NA,3),
  var_7 = c(3,2,1,2,2,4,NA,3))

data_fix <- data %>%
  mutate(var_1 = ifelse(is.na(var_1),rowMeans(data[row_number(),], na.rm = T),var_1),
         var_2 = ifelse(is.na(var_2),rowMeans(data[row_number(),], na.rm = T),var_2),
         var_3 = ifelse(is.na(var_3),rowMeans(data[row_number(),], na.rm = T),var_3),
         var_4 = ifelse(is.na(var_4),rowMeans(data[row_number(),], na.rm = T),var_4),
         var_5 = ifelse(is.na(var_5),rowMeans(data[row_number(),], na.rm = T),var_5),
         var_6 = ifelse(is.na(var_6),rowMeans(data[row_number(),], na.rm = T),var_6),
         var_7 = ifelse(is.na(var_7),rowMeans(data[row_number(),], na.rm = T),var_7))

这里有一个选项:

rmeans <- rowMeans(data, na.rm = TRUE)
rmeans[rowSums(is.na(data)) > 2] <- NA
data2 <- data
data2[] <- rmeans
data[is.na(data)] <- data2[is.na(data)]
data
#>   var_1 var_2 var_3 var_4 var_5    var_6 var_7
#> 1   2.0     3    NA    NA    NA 4.000000     3
#> 2   3.0     4     3   3.0     4 2.000000     2
#> 3   1.6     2     2   1.6     2 1.000000     1
#> 4   2.0     3     5   4.0     3 3.166667     2
#> 5   3.0     1     4   1.0     2 2.000000     2
#> 6   5.0     3     2   2.0     3 5.000000     4
#> 7    NA    NA    NA    NA    NA       NA    NA
#> 8   3.0     2     2   1.0     2 3.000000     3

reprex package (v2.0.1)

于 2022-04-20 创建

一个dplyr解决方案:

library(dplyr)

data %>%
  mutate(na.count = rowSums(is.na(data)),
         row.mean = rowMeans(data, na.rm = TRUE),
         across(var_1:var_7, ~ ifelse(is.na(.x) & na.count <= 2, row.mean, .x))) %>%
  select(-c(na.count, row.mean))

  var_1 var_2 var_3 var_4 var_5    var_6 var_7
1   2.0     3    NA    NA    NA 4.000000     3
2   3.0     4     3   3.0     4 2.000000     2
3   1.6     2     2   1.6     2 1.000000     1
4   2.0     3     5   4.0     3 3.166667     2
5   3.0     1     4   1.0     2 2.000000     2
6   5.0     3     2   2.0     3 5.000000     4
7    NA    NA    NA    NA    NA       NA    NA
8   3.0     2     2   1.0     2 3.000000     3

base等价于:

na.count <- rowSums(is.na(data))
row.mean <- rowMeans(data, na.rm = TRUE)
data2 <- data
data2[] <- lapply(data, \(x) ifelse(is.na(x) & na.count <= 2, row.mean, x))
data2