group_by 使用函数 "for" 无效 (R)

group_by using the function "for" didn't work (R)

我知道有几个主题与此有关,但是没有一个对我有用。我真的试过了。所以,我有这个文件:

Group   Weight  Size
A   4   1.7
A   5   1.8
A   6   1.9
B   7   1.75
B   4   1.73
B   4   1.77
C   5   1.5
C   6   2.3
C   7   1.7

考虑到 Group,我想对 Weight 和 Size 进行描述性统计。所以我使用了这段代码:

library(dplyr)
    # Empty (0-row) data frame with the seven summary columns; the result of
    # each loop iteration below is merged into it.
    desc_group <- data.frame(matrix(ncol=7,nrow=0, dimnames=list(NULL, c("Trait",
                                                                               "Mean","SD","N", "Min","Max","Coeff.Variation"))))
    
 # Detach plyr from the search path.
 # NOTE(review): presumably done so plyr does not mask dplyr verbs - confirm.
 detach("package:plyr")  
    # Iterate over column positions 2 and 3 of df (Weight and Size in the
    # sample data).
    for (i in c(2,3)) {
      
      # NOTE(review): df[[i]] extracts the complete column from the original
      # data frame, not the rows of the current group, so group_by(Group) has
      # no effect on these statistics: every group receives the whole-column
      # value (the output shows N = 9 for each group).
      descriptive<- df %>%  dplyr::group_by(Group) %>%
        dplyr::summarise(Trait=colnames(df[i]), Mean= mean(df[[i]], na.rm = T),
                  SD= sd(df[[i]], na.rm = T)   ,N=length(na.omit(df[[i]])),
                  Min=min(df[[i]], na.rm = T),Max =max(df[[i]], na.rm = T), 
                  Coeff.Variation=sd(df[[i]], na.rm=TRUE)/mean(df[[i]], na.rm=TRUE)*100)
      # Full outer merge (all = T) appends this trait's rows to the
      # accumulated result.
      desc_group <- merge(descriptive,desc_group, all = T)
      
    }

但我得到的是这样的结果:

  Group  Trait     Mean       SD N Min Max Coeff.Variation
1     A   Size 1.794444 0.217377 9 1.5 2.3        12.11389
2     A Weight 5.333333 1.224745 9 4.0 7.0        22.96397
3     B   Size 1.794444 0.217377 9 1.5 2.3        12.11389
4     B Weight 5.333333 1.224745 9 4.0 7.0        22.96397
5     C   Size 1.794444 0.217377 9 1.5 2.3        12.11389
6     C Weight 5.333333 1.224745 9 4.0 7.0        22.96397 

这是重复的总体结果:每个组得到的都是整列数据的统计值,而不是各组自己的统计值。有什么问题吗?

这是一种使用 summarise 和 across 的方法,可以在多个列上应用相同的函数。通过使结果成为 data.frame,结果将添加为命名列。然后你可以使用 pivot_longer 和 unpack 来得到想要的结果。

library(dplyr)
library(tidyr)

# Per Group, summarise Weight and Size with the same set of statistics.
# The anonymous function returns a one-row data frame, so each summarised
# column becomes a data-frame column whose inner columns are the statistics.
# pivot_longer() then stacks those columns under "Trait", and unpack()
# spreads the inner statistics back out into ordinary columns.
df %>%
  group_by(Group) %>%
  summarise(
    across(
      Weight:Size,
      function(x) {
        data.frame(
          Mean = mean(x, na.rm = TRUE),
          SD = sd(x, na.rm = TRUE),
          Min = min(x, na.rm = TRUE),
          Max = max(x, na.rm = TRUE),
          Coeff.Variation = sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE) * 100
        )
      }
    )
  ) %>%
  pivot_longer(cols = -Group, names_to = "Trait") %>%
  unpack(cols = value)
## A tibble: 6 x 7
#  Group Trait   Mean    SD   Min   Max Coeff.Variation
#  <chr> <chr>  <dbl> <dbl> <dbl> <dbl>           <dbl>
#1 A     Weight  5    1      4     6              20   
#2 A     Size    1.8  0.100  1.7   1.9             5.56
#3 B     Weight  5    1.73   4     7              34.6 
#4 B     Size    1.75 0.02   1.73  1.77            1.14
#5 C     Weight  6    1      5     7              16.7 
#6 C     Size    1.83 0.416  1.5   2.3            22.7 

此解决方案无需使用 for 循环,而是在创建分组摘要统计信息之前将数据重塑为长格式。

    # Reshape every non-Group column into long format (Type = original column
    # name, Value = its value), then compute the descriptive statistics once
    # per Group/Type pair. The coefficient of variation is derived afterwards
    # from the summarised SD and Mean and is expressed as a percentage.
    df %>%
      tidyr::pivot_longer(-Group, names_to = "Type", values_to = "Value") %>%
      dplyr::group_by(Group, Type) %>%
      dplyr::summarise(
        Mean = mean(Value, na.rm = TRUE),
        SD = sd(Value, na.rm = TRUE),
        N = sum(!is.na(Value)),  # number of non-missing observations
        Min = min(Value, na.rm = TRUE),
        Max = max(Value, na.rm = TRUE)
      ) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(Coef.Variation = (SD / Mean) * 100)

我不清楚您的解决方案中 for 循环部分的作用。在 tidyverse 中有一种直接的方法可以做到这一点:

library(tidyverse)

# Nine example rows: the labels A/B/C repeated three times, integer weights
# sampled with replacement from 4..7, and sizes drawn uniformly from [1, 3].
# No seed is set, so the values differ from run to run.
test_data <- tibble(
  group = rep(c("A", "B", "C"), times = 3),
  weight = sample(4:7, size = 9, replace = TRUE),
  size = runif(n = 9, min = 1, max = 3)
)

test_data

# A tibble: 9 x 3
group weight  size
<chr>  <int> <dbl>
1 A          6  1.50
2 B          4  2.38
3 C          6  2.02
4 A          4  1.51
5 B          6  2.41
6 C          7  1.31
7 A          4  2.19
8 B          5  1.20
9 C          6  2.22

这执行基本的 group_by() 和 summarize():

# One row per group with the minimum, maximum and standard deviation of
# weight, followed by the same three statistics for size.
test_data %>%
  dplyr::group_by(group) %>%
  dplyr::summarise(
    min_weight = min(weight),
    max_weight = max(weight),
    sd_weight = sd(weight),
    min_size = min(size),
    max_size = max(size),
    sd_size = sd(size)
  )

group min_weight max_weight sd_weight min_size max_size sd_size
<chr>      <int>      <int>     <dbl>    <dbl>    <dbl>   <dbl>
1 A              4          6     1.15      1.50     2.19   0.394
2 B              4          6     1         1.20     2.41   0.690
3 C              6          7     0.577     1.31     2.22   0.478

这个更高级,使用across将多个汇总函数应用于多个列:

# Named list of summary functions. The names supply the `{.fn}` part of each
# output column name, so the results are weight_min, weight_max, ... size_median.
# Function objects are passed directly instead of function-name strings, which
# is the documented form for `across(.fns = )`.
summarize_funs <- list(min = min, max = max, sd = sd, median = median)

test_data %>%
  group_by(group) %>%
  summarize(
    across(
      .cols = c(weight, size),
      .fns = summarize_funs,
      .names = "{.col}_{.fn}"
    )
  )

# A tibble: 3 x 9
group weight_min weight_max weight_sd weight_median size_min size_max size_sd size_median
<chr>      <int>      <int>     <dbl>         <int>    <dbl>    <dbl>   <dbl>       <dbl>
1 A              4          6     1.15              4     1.50     2.19   0.394        1.51
2 B              4          6     1                 5     1.20     2.41   0.690        2.38
3 C              6          7     0.577             6     1.31     2.22   0.478        2.02

试试这个:

  library(dplyr)
  library(data.table)
    # Reshape df to long format with data.table::melt (one row per
    # Group/variable/value), then compute the descriptive statistics for each
    # Group/variable pair. Missing values are excluded from every statistic
    # and N counts only the non-missing observations.
    melt(as.data.table(df), id.vars = "Group",
         measure.vars = c("Weight", "Size")) %>%
      group_by(Group, variable) %>%
      summarise(Mean = mean(value, na.rm = TRUE),
                SD = sd(value, na.rm = TRUE),
                N = sum(!is.na(value)),
                Min = min(value, na.rm = TRUE),
                Max = max(value, na.rm = TRUE),
                # Coefficient of variation as a percentage: sd / |mean| * 100.
                Coeff.Variation = sd(value, na.rm = TRUE) /
                  abs(mean(value, na.rm = TRUE)) * 100,
                .groups = "drop")