R Tidyverse:按组(group_by)统计所有列的空值数量
R Tidyverse Group by and count of nulls for all columns
我找到了一些类似的问题,但没有一个能完全解答我的疑问:
我想按地点统计人们在大约 100 个参数上填写数据的频率。
这个问题的答案几乎可以满足我的需求。但是该如何加上一行 group_by,让结果按组细分?
试图得到类似于:
我要用来 group_by 的列是 group_id。下面是我目前的解决方案,并不优雅:
# Share of missing values per column within each `group_id`, built by
# splitting the data frame and summarising each piece.
library(tidyverse)

df1 <- structure(list(
  SCR9 = c(
    50.5, NA, NA, 25.75, 100, NA, NA, NA,
    100, NA, NA, NA, 75.25, NA, NA
  ),
  SCR10 = c(
    25.75, NA, NA, NA,
    NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA
  ),
  SCR12 = c(
    75.25, 75.25, 50.5, NA, 75.25, 75.25, 100, 100,
    75.25, NA, 75.25, 75.25, 50.5, 100, 50.5
  ),
  ID = 1:15,
  group_id = c(
    "a", "b", "b", "c", "a", "b", "c", "c",
    "a", "b", "a", "a", "c", "b", "b"
  )
), row.names = c(NA, 15L), class = "data.frame")

# Split on the column taken from `df1` itself; attach() is avoided because it
# puts every column on the search path, where it can be silently masked.
df2 <- split(df1, df1$group_id)

# Fraction of NA values per column for the rows of one group.
#   group_df:    rows of `df1` belonging to a single group
#   group_label: that group's `group_id` value
# Returns a tibble with columns `feature`, `percent_null` and `group_id`.
summarise_nulls <- function(group_df, group_label) {
  group_df |>
    # Drop the grouping key by name rather than slicing rows off the sorted
    # result, which only removes the right rows for one particular sort order.
    select(-group_id) |>
    summarise(across(everything(), \(x) mean(is.na(x)))) |>
    pivot_longer(
      everything(),
      names_to = "feature",
      values_to = "percent_null"
    ) |>
    mutate(group_id = group_label)
}

# imap() passes each piece together with its name (the group label).
d_all <- df2 |>
  imap(summarise_nulls) |>
  bind_rows()

d_all |>
  arrange(feature, group_id) |>
  select(feature, group_id, percent_null)
感谢您的帮助!
看看这是否适合你...
library(tidyverse)

df1 <- structure(list(
  SCR9 = c(
    50.5, NA, NA, 25.75, 100, NA, NA, NA,
    100, NA, NA, NA, 75.25, NA, NA
  ),
  SCR10 = c(
    25.75, NA, NA, NA,
    NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA
  ),
  SCR12 = c(
    75.25, 75.25, 50.5, NA, 75.25, 75.25, 100, 100,
    75.25, NA, 75.25, 75.25, 50.5, 100, 50.5
  ),
  ID = 1:15,
  group_id = c(
    "a", "b", "b", "c", "a", "b", "c", "c",
    "a", "b", "a", "a", "c", "b", "b"
  )
), row.names = c(NA, 15L), class = "data.frame")

# Fraction of missing values in every column, per `group_id`.
# across() is left unnamed so each input column becomes its own output column;
# assigning it to a single name would pack all results into one nested
# data-frame column instead.
pct_na <- df1 |>
  group_by(group_id) |>
  summarise(across(everything(), \(x) mean(is.na(x))))

pct_na
#> # A tibble: 3 × 5
#>   group_id  SCR9 SCR10 SCR12    ID
#>   <chr>    <dbl> <dbl> <dbl> <dbl>
#> 1 a          0.4 0.8   0         0
#> 2 b          1   0.833 0.167     0
#> 3 c          0.5 1     0.25      0
由 reprex 包 (v2.0.1) 创建于 2022-04-21
我找到了一些类似的问题,但没有一个能完全解答我的疑问:
我想按地点统计人们在大约 100 个参数上填写数据的频率。
这个问题的答案几乎可以满足我的需求。但是该如何加上一行 group_by,让结果按组细分?
试图得到类似于:
我要用来 group_by 的列是 group_id。下面是我目前的解决方案,并不优雅:
# Share of missing values per column within each `group_id`, built by
# splitting the data frame and summarising each piece.
library(tidyverse)

df1 <- structure(list(
  SCR9 = c(
    50.5, NA, NA, 25.75, 100, NA, NA, NA,
    100, NA, NA, NA, 75.25, NA, NA
  ),
  SCR10 = c(
    25.75, NA, NA, NA,
    NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA
  ),
  SCR12 = c(
    75.25, 75.25, 50.5, NA, 75.25, 75.25, 100, 100,
    75.25, NA, 75.25, 75.25, 50.5, 100, 50.5
  ),
  ID = 1:15,
  group_id = c(
    "a", "b", "b", "c", "a", "b", "c", "c",
    "a", "b", "a", "a", "c", "b", "b"
  )
), row.names = c(NA, 15L), class = "data.frame")

# Split on the column taken from `df1` itself; attach() is avoided because it
# puts every column on the search path, where it can be silently masked.
df2 <- split(df1, df1$group_id)

# Fraction of NA values per column for the rows of one group.
#   group_df:    rows of `df1` belonging to a single group
#   group_label: that group's `group_id` value
# Returns a tibble with columns `feature`, `percent_null` and `group_id`.
summarise_nulls <- function(group_df, group_label) {
  group_df |>
    # Drop the grouping key by name rather than slicing rows off the sorted
    # result, which only removes the right rows for one particular sort order.
    select(-group_id) |>
    summarise(across(everything(), \(x) mean(is.na(x)))) |>
    pivot_longer(
      everything(),
      names_to = "feature",
      values_to = "percent_null"
    ) |>
    mutate(group_id = group_label)
}

# imap() passes each piece together with its name (the group label).
d_all <- df2 |>
  imap(summarise_nulls) |>
  bind_rows()

d_all |>
  arrange(feature, group_id) |>
  select(feature, group_id, percent_null)
感谢您的帮助!
看看这是否适合你...
library(tidyverse)

df1 <- structure(list(
  SCR9 = c(
    50.5, NA, NA, 25.75, 100, NA, NA, NA,
    100, NA, NA, NA, 75.25, NA, NA
  ),
  SCR10 = c(
    25.75, NA, NA, NA,
    NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA
  ),
  SCR12 = c(
    75.25, 75.25, 50.5, NA, 75.25, 75.25, 100, 100,
    75.25, NA, 75.25, 75.25, 50.5, 100, 50.5
  ),
  ID = 1:15,
  group_id = c(
    "a", "b", "b", "c", "a", "b", "c", "c",
    "a", "b", "a", "a", "c", "b", "b"
  )
), row.names = c(NA, 15L), class = "data.frame")

# Fraction of missing values in every column, per `group_id`.
# across() is left unnamed so each input column becomes its own output column;
# assigning it to a single name would pack all results into one nested
# data-frame column instead.
pct_na <- df1 |>
  group_by(group_id) |>
  summarise(across(everything(), \(x) mean(is.na(x))))

pct_na
#> # A tibble: 3 × 5
#>   group_id  SCR9 SCR10 SCR12    ID
#>   <chr>    <dbl> <dbl> <dbl> <dbl>
#> 1 a          0.4 0.8   0         0
#> 2 b          1   0.833 0.167     0
#> 3 c          0.5 1     0.25      0
由 reprex 包 (v2.0.1) 创建于 2022-04-21