R Tidyverse:按组统计所有列的空值数量

R Tidyverse Group by and count of nulls for all columns

我找到了一些类似的问题,但没有一个能完全解答我的疑问:我想按地点统计人们在大约 100 个参数中填写数据的频率。这个问题几乎给出了答案,但要怎样加入 group_by 这一步,让结果按组细分呢?

试图得到类似于:

我要用来 group_by 的列是 group_id。下面是我目前的解决方案,不够优雅:

# Example data: three score columns containing missing values, a row ID and
# the grouping key `group_id`.
df1 <- structure(
  list(
    SCR9 = c(
      50.5, NA, NA, 25.75, 100, NA, NA, NA, 100, NA, NA, NA, 75.25, NA, NA
    ),
    SCR10 = c(
      25.75, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA
    ),
    SCR12 = c(
      75.25, 75.25, 50.5, NA, 75.25, 75.25, 100, 100, 75.25, NA, 75.25,
      75.25, 50.5, 100, 50.5
    ),
    ID = 1:15,
    group_id = c(
      "a", "b", "b", "c", "a", "b", "c", "c", "a", "b", "a", "a", "c", "b",
      "b"
    )
  ),
  row.names = c(NA, 15L),
  class = "data.frame"
)

# Split into a named list with one data frame per group ("a", "b", "c").
# The grouping column is referenced explicitly as df1$group_id, so attach()
# is not needed: attach() would leave a copy of the columns on the search
# path, where it can mask other objects or go stale when df1 changes.
df2 <- split(df1, df1$group_id)

# Share of missing values per column for group "a".
# Needs purrr, tidyr and dplyr attached (e.g. via library(tidyverse)).
d_a <- df2$a |>
  # One-row tibble holding the NA count of every column.
  map_df(\(col) sum(is.na(col))) |>
  # Long format, one row per column name; pivot_longer() replaces the
  # superseded gather().
  pivot_longer(everything(), names_to = "feature", values_to = "num_nulls") |>
  dplyr::arrange(desc(num_nulls)) |>
  mutate(
    # Fraction (0-1) of this group's rows that are NA in the column.
    percent_null = num_nulls / nrow(df2$a),
    group_id = "a"
  ) |>
  select(-num_nulls)


# Share of missing values per column for group "b".
# Needs purrr, tidyr and dplyr attached (e.g. via library(tidyverse)).
d_b <- df2$b |>
  # One-row tibble holding the NA count of every column.
  map_df(\(col) sum(is.na(col))) |>
  # Long format, one row per column name; pivot_longer() replaces the
  # superseded gather().
  pivot_longer(everything(), names_to = "feature", values_to = "num_nulls") |>
  dplyr::arrange(desc(num_nulls)) |>
  mutate(
    # Fraction (0-1) of this group's rows that are NA in the column.
    percent_null = num_nulls / nrow(df2$b),
    group_id = "b"
  ) |>
  select(-num_nulls)

# Share of missing values per column for group "c".
# Needs purrr, tidyr and dplyr attached (e.g. via library(tidyverse)).
d_c <- df2$c |>
  # One-row tibble holding the NA count of every column.
  map_df(\(col) sum(is.na(col))) |>
  # Long format, one row per column name; pivot_longer() replaces the
  # superseded gather().
  pivot_longer(everything(), names_to = "feature", values_to = "num_nulls") |>
  dplyr::arrange(desc(num_nulls)) |>
  mutate(
    # Fraction (0-1) of this group's rows that are NA in the column.
    percent_null = num_nulls / nrow(df2$c),
    group_id = "c"
  ) |>
  select(-num_nulls)

# Stack the three per-group summaries into one long table, in a, b, c order.
d_all <- bind_rows(d_a, d_b, d_c)

# Final long table: one row per (feature, group_id) with its share of NAs.
d_all |>
  # Drop the rows that describe the grouping key itself. They are selected by
  # name rather than by position (the first three rows after sorting), because
  # which feature sorts first depends on the collation arrange() uses for
  # character columns.
  dplyr::filter(feature != "group_id") |>
  dplyr::arrange(feature, group_id) |>
  select(feature, group_id, percent_null)

感谢您的帮助!

看看这是否适合你...

library(tidyverse)

# Example data: three score columns with missing values, a row ID and the
# grouping key `group_id`.
df1 <- structure(
  list(
    SCR9 = c(
      50.5, NA, NA, 25.75, 100, NA, NA, NA, 100, NA, NA, NA, 75.25, NA, NA
    ),
    SCR10 = c(
      25.75, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA
    ),
    SCR12 = c(
      75.25, 75.25, 50.5, NA, 75.25, 75.25, 100, 100, 75.25, NA, 75.25,
      75.25, 50.5, 100, 50.5
    ),
    ID = 1:15,
    group_id = c(
      "a", "b", "b", "c", "a", "b", "c", "c", "a", "b", "a", "a", "c", "b",
      "b"
    )
  ),
  row.names = c(NA, 15L),
  class = "data.frame"
)
              
# Share of NA values in every column, computed separately for each group_id.
# across() is called without naming its result, so each input column becomes
# an ordinary top-level column of the summary instead of being packed into a
# single data-frame column (`pct_na$SCR9`, `pct_na$SCR10`, ...).
df1 |>
  group_by(group_id) |>
  # mean() of a logical vector is the fraction of TRUE values, i.e. the
  # fraction of rows in the group that are NA.
  summarise(across(everything(), \(col) mean(is.na(col))))
#> # A tibble: 3 × 5
#>   group_id  SCR9 SCR10 SCR12    ID
#>   <chr>    <dbl> <dbl> <dbl> <dbl>
#> 1 a          0.4 0.8   0         0
#> 2 b          1   0.833 0.167     0
#> 3 c          0.5 1     0.25      0

由 reprex 包 (v2.0.1) 创建于 2022-04-21