根据 R 中的条件替换 NA
Replace NAs based on conditions in R
我有一个数据集,我想在那些缺失值数量大于或等于 `n` 的列中,用空字符串替换 NA。例如,`n = 500`。
# Reproducible example: three uniform columns of 1000 rows each, with 500
# randomly chosen rows of the first column set to NA.
set.seed(2022)
synthetic <- tibble(
  col1 = runif(1000),
  col2 = runif(1000),
  col3 = runif(1000)
)
na_insert <- sample(nrow(synthetic), 500, replace = FALSE)
synthetic[na_insert, "col1"] <- NA
我正在尝试做但最终失败的事情:
# NOTE(review): the first argument handed to replace_na() below is the
# length-one logical `sum(is.na(.x)) >= 500`, not the column `.x` itself,
# so the NA entries of the column are never the values being replaced.
synthetic %>%
mutate(across(everything(), ~ replace_na(sum(is.na(.x)) >= 500, "")))
我在这个简单的练习中做错了什么?
您可以使用 `where()` 配合 purrr 风格的匿名函数(lambda):
library(dplyr)

# Select only the columns holding at least 500 NAs, cast them to character,
# and fill their NAs with ""; every other column is left untouched.
synthetic %>%
  mutate(
    across(
      where(function(col) sum(is.na(col)) >= 500),
      function(col) coalesce(as.character(col), "")
    )
  )
返回结果如下:
# A tibble: 1,000 x 3
col1 col2 col3
<chr> <dbl> <dbl>
1 "" 0.479 0.139
2 "0.647259329678491" 0.410 0.770
3 "" 0.696 0.805
4 "" 0.863 0.803
5 "0.184729989385232" 0.146 0.652
6 "0.635790845612064" 0.634 0.0830
7 "" 0.305 0.527
8 "0.0419759317301214" 0.297 0.275
9 "" 0.883 0.698
10 "0.757252902723849" 0.115 0.933
# ... with 990 more rows
使用 `ifelse` 函数:
library(dplyr)

# A value becomes "" only when it is NA and its column holds at least 500
# NAs: the length-one column-level check is combined element-wise with the
# per-value is.na() mask.
synthetic |>
  mutate_all(~ ifelse(sum(is.na(.)) >= 500 & is.na(.), "", .))
输出:
# A tibble: 1,000 x 3
col1 col2 col3
<chr> <dbl> <dbl>
1 "" 0.479 0.139
2 "0.647259329678491" 0.410 0.770
3 "" 0.696 0.805
4 "" 0.863 0.803
5 "0.184729989385232" 0.146 0.652
6 "0.635790845612064" 0.634 0.0830
7 "" 0.305 0.527
8 "0.0419759317301214" 0.297 0.275
9 "" 0.883 0.698
10 "0.757252902723849" 0.115 0.933
编辑:使用 `across` 而不是 `mutate_all`:
# Same replacement as the mutate_all() variant, expressed through across():
# a value becomes "" when it is NA and its column holds at least 500 NAs.
synthetic |>
  mutate(
    across(
      everything(),
      function(col) ifelse(sum(is.na(col)) >= 500 & is.na(col), "", col)
    )
  )
library(data.table)

# Minimum number of NAs a column must contain before its NAs are blanked.
n <- 500

# Convert to a data.table by reference so the updates below happen in place.
setDT(synthetic)

# Names of the columns with >= n NA's.
cols <- names(synthetic)[colSums(is.na(synthetic)) >= n]

# Only the qualifying columns are converted to character (needed so they can
# hold ""); columns below the threshold keep their original type instead of
# being coerced along with the rest.
if (length(cols) > 0L) {
  synthetic[, (cols) := lapply(.SD, as.character), .SDcols = cols]
}

# Replace the NAs in the qualifying columns with "" by reference via set().
for (col in cols) {
  set(
    synthetic,
    i = which(is.na(synthetic[[col]])),
    j = col,
    value = ""
  )
}
我有一个数据集,我想在那些缺失值数量大于或等于 `n` 的列中,用空字符串替换 NA。例如,`n = 500`。
# Reproducible example: three uniform columns of 1000 rows each, with 500
# randomly chosen rows of the first column set to NA.
set.seed(2022)
synthetic <- tibble(
  col1 = runif(1000),
  col2 = runif(1000),
  col3 = runif(1000)
)
na_insert <- sample(nrow(synthetic), 500, replace = FALSE)
synthetic[na_insert, "col1"] <- NA
我正在尝试做但最终失败的事情:
# NOTE(review): the first argument handed to replace_na() below is the
# length-one logical `sum(is.na(.x)) >= 500`, not the column `.x` itself,
# so the NA entries of the column are never the values being replaced.
synthetic %>%
mutate(across(everything(), ~ replace_na(sum(is.na(.x)) >= 500, "")))
我在这个简单的练习中做错了什么?
您可以使用 `where()` 配合 purrr 风格的匿名函数(lambda):
library(dplyr)

# Select only the columns holding at least 500 NAs, cast them to character,
# and fill their NAs with ""; every other column is left untouched.
synthetic %>%
  mutate(
    across(
      where(function(col) sum(is.na(col)) >= 500),
      function(col) coalesce(as.character(col), "")
    )
  )
返回结果如下:
# A tibble: 1,000 x 3
col1 col2 col3
<chr> <dbl> <dbl>
1 "" 0.479 0.139
2 "0.647259329678491" 0.410 0.770
3 "" 0.696 0.805
4 "" 0.863 0.803
5 "0.184729989385232" 0.146 0.652
6 "0.635790845612064" 0.634 0.0830
7 "" 0.305 0.527
8 "0.0419759317301214" 0.297 0.275
9 "" 0.883 0.698
10 "0.757252902723849" 0.115 0.933
# ... with 990 more rows
使用 `ifelse` 函数:
library(dplyr)

# A value becomes "" only when it is NA and its column holds at least 500
# NAs: the length-one column-level check is combined element-wise with the
# per-value is.na() mask.
synthetic |>
  mutate_all(~ ifelse(sum(is.na(.)) >= 500 & is.na(.), "", .))
输出:
# A tibble: 1,000 x 3
col1 col2 col3
<chr> <dbl> <dbl>
1 "" 0.479 0.139
2 "0.647259329678491" 0.410 0.770
3 "" 0.696 0.805
4 "" 0.863 0.803
5 "0.184729989385232" 0.146 0.652
6 "0.635790845612064" 0.634 0.0830
7 "" 0.305 0.527
8 "0.0419759317301214" 0.297 0.275
9 "" 0.883 0.698
10 "0.757252902723849" 0.115 0.933
编辑:使用 `across` 而不是 `mutate_all`:
# Same replacement as the mutate_all() variant, expressed through across():
# a value becomes "" when it is NA and its column holds at least 500 NAs.
synthetic |>
  mutate(
    across(
      everything(),
      function(col) ifelse(sum(is.na(col)) >= 500 & is.na(col), "", col)
    )
  )
library(data.table)

# Minimum number of NAs a column must contain before its NAs are blanked.
n <- 500

# Convert to a data.table by reference so the updates below happen in place.
setDT(synthetic)

# Names of the columns with >= n NA's.
cols <- names(synthetic)[colSums(is.na(synthetic)) >= n]

# Only the qualifying columns are converted to character (needed so they can
# hold ""); columns below the threshold keep their original type instead of
# being coerced along with the rest.
if (length(cols) > 0L) {
  synthetic[, (cols) := lapply(.SD, as.character), .SDcols = cols]
}

# Replace the NAs in the qualifying columns with "" by reference via set().
for (col in cols) {
  set(
    synthetic,
    i = which(is.na(synthetic[[col]])),
    j = col,
    value = ""
  )
}