在 R 中使用均值、总和、长度和 sd 实现频率计数的更简单方法

A simpler way to achieve a frequency count with mean, sum, length and sd in R

我的任务是创建包含统计摘要的频率表。我的目标是创建一个可以方便地导出到 Excel 的数据框。其中大部分工作也许可以在 SQL 中用存储过程完成,但我决定在 R 中完成。我还在学习 R,所以我的做法可能绕了远路。这是我之前一个问题的后续问题。

给定

    # Example input: ten rows keyed by Id, two grouping columns (ClassA,
    # ClassB) and two answer columns (R, S). ClassA, R and S contain NA.
    Id <- c(1,2,3,4,5,6,7,8,9,10)
    ClassA <- c(1,NA,3,1,1,2,1,4,5,3)
    ClassB <- c(2,1,1,3,3,2,1,1,3,3)
    R <- c(1,2,3,NA,9,2,4,5,6,7)
    S <- c(3,7,NA,9,5,8,7,NA,7,6)
    df <- data.frame(Id,ClassA,ClassB,R,S)

    # Factor levels for the frequency tables: the values 0..10 plus NA, so
    # every scale value and the missing bucket get a row even when the
    # count is 0.
    ZeroTenNAScale <- c(0:10,NA);

    # Frequency table for R: table() counts each level, data.frame() turns
    # the counts into two columns, setNames() renames them to answer/value.
    R.freq <- setNames(nm=c('answer','value'),data.frame(table(factor(df$R,levels=ZeroTenNAScale,exclude=NULL))));
    # Convert the answer column from factor labels back to numbers.
    R.freq[, 1] <- as.numeric(as.character( R.freq[, 1] ))
    # Tag every row with the question it belongs to.
    R.freq <- cbind(question='R',R.freq)

    # Same three steps for S.
    S.freq <- setNames(nm=c('answer','value'),data.frame(table(factor(df$S,levels=ZeroTenNAScale,exclude=NULL))));
    S.freq[, 1] <- as.numeric(as.character( S.freq[, 1] ))
    S.freq <- cbind(question='S',S.freq)

    # Summary statistics for R, computed on the non-missing values only;
    # "length" is the number of non-missing answers.
    R.mean = mean(df$R, na.rm = TRUE) 
    R.length = sum(!is.na(df$R)) 
    R.sd = sd(df$R, na.rm = TRUE) 
    R.sum = sum(df$R, na.rm = TRUE)

    # Same statistics for S.
    S.mean = mean(df$S, na.rm = TRUE) 
    S.length = sum(!is.na(df$S)) 
    S.sd = sd(df$S, na.rm = TRUE) 
    S.sum = sum(df$S, na.rm = TRUE)

    # Append one (question, answer, value) row per statistic to S.freq.
    # NOTE(review): cbind() of two strings and a number builds a character
    # matrix, so the appended values appear to be stored as text (the printed
    # result shows unrounded values such as 4.33333333333333) - confirm the
    # column types before exporting.
    S.row <- cbind('S','sum',as.numeric(S.sum))
    S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
    S.freq = rbind(S.freq, S.row )

    S.row <- cbind('S','length',as.numeric(S.length))
    S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
    S.freq = rbind(S.freq, S.row )

    S.row <- cbind('S','mean',as.numeric(S.mean))
    S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
    S.freq = rbind(S.freq, S.row )

    S.row <- cbind('S','sd',as.numeric(S.sd))
    S.row <- setNames(nm=c('question','answer','value'),data.frame(S.row))
    S.freq = rbind(S.freq, S.row )

    # Append the same four statistic rows to R.freq.
    R.row <- cbind('R','sum',as.numeric(R.sum))
    R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
    R.freq = rbind(R.freq, R.row )

    R.row <- cbind('R','length',as.numeric(R.length))
    R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
    R.freq = rbind(R.freq, R.row )

    R.row <- cbind('R','mean',as.numeric(R.mean))
    R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
    R.freq = rbind(R.freq, R.row )

    R.row <- cbind('R','sd',as.numeric(R.sd))
    R.row <- setNames(nm=c('question','answer','value'),data.frame(R.row))
    R.freq = rbind(R.freq, R.row )

    # Stack the R rows above the S rows and label the whole set with the
    # filter that was applied ('None' here).
    result <- rbind(R.freq,S.freq)
    result <- cbind(filter='None',result)
    # Print the combined table.
    result  

我得到的结果是

   filter question answer            value
1    None        R      0                0
2    None        R      1                1
3    None        R      2                2
4    None        R      3                1
5    None        R      4                1
6    None        R      5                1
7    None        R      6                1
8    None        R      7                1
9    None        R      8                0
10   None        R      9                1
11   None        R     10                0
12   None        R   <NA>                1
13   None        R    sum               39
14   None        R length                9
15   None        R   mean 4.33333333333333
16   None        R     sd 2.64575131106459
17   None        S      0                0
18   None        S      1                0
19   None        S      2                0
20   None        S      3                1
21   None        S      4                0
22   None        S      5                1
23   None        S      6                1
24   None        S      7                3
25   None        S      8                1
26   None        S      9                1
27   None        S     10                0
28   None        S   <NA>                2
29   None        S    sum               52
30   None        S length                8
31   None        S   mean              6.5
32   None        S     sd  1.8516401995451

这正是我要找的。我认为下一步是先把部分代码封装成函数以简化代码,然后再加入按 ClassA=1、ClassA=n+1 ... ClassA=NA、ClassB=1、ClassB=2 ... ClassB=NA 筛选得到的类似结果集。有更简单的方法吗?

研究了 Ernest A 和 Imo 的答案之后,新代码如下

    # 

    # Summarise one answer column as a single named numeric vector.
    #
    # Args:
    #   x:      vector of answers; may contain NA.
    #   levels: answer values that each get a frequency entry, even when
    #           their count is 0. Defaults to the 0-10 scale.
    #
    # Returns:
    #   A named numeric vector holding, in order: one count per entry of
    #   `levels`, the count of missing answers (this entry's name is NA),
    #   then sum, length, mean and sd.
    #
    # sum, length, mean and sd are all computed on the non-missing answers,
    # so `length` is the number of valid answers rather than the number of
    # rows (counting the NAs here would disagree with the NA bucket above
    # and with the mean/sd denominators).
    summaryStatistics <- function(x, levels = 0:10) {
        observed <- x[!is.na(x)]
        # useNA = "always" keeps the NA bucket even when nothing is missing,
        # so the result has the same length for every column.
        counts <- table(factor(x, levels = levels), useNA = "always")
        c(counts,
          sum = sum(observed),
          length = length(observed),
          mean = mean(observed),
          sd = sd(observed))
    }

    # Example input: ten rows keyed by Id, two grouping columns (ClassA,
    # ClassB) and two answer columns (R, S); ClassA, R and S contain NA.
    Id     <- as.numeric(1:10)
    ClassA <- c(1, NA, 3, 1, 1, 2, 1, 4, 5, 3)
    ClassB <- c(2, 1, 1, 3, 3, 2, 1, 1, 3, 3)
    R      <- c(1, 2, 3, NA, 9, 2, 4, 5, 6, 7)
    S      <- c(3, 7, NA, 9, 5, 8, 7, NA, 7, 6)
    df <- data.frame(Id = Id, ClassA = ClassA, ClassB = ClassB, R = R, S = S)

    # Summarise the R and S columns one at a time; each call yields a named
    # vector of the same length.
    result <- lapply(df[c('R', 'S')], summaryStatistics)

    # Bind the per-column vectors into a matrix (one row per statistic, one
    # column per question) and unroll it into long format.
    result <- simplify2array(result)
    result <- as.data.frame(as.table(result))
    names(result) <- c('answer', 'question', 'value')

    # Reorder the columns to question, answer, value.
    result <- result[, c(2, 1, 3)]

    # Label every row with the filter that was applied.
    result <- cbind(filter = 'None', result)

    # Print the result.
    result

这样就简单多了,也让我训练团队的其他任务简单多了。感谢 Ernest A and Imo.

关于我对 R 的理解的下一个问题是

减少代码大小的一个方法是将摘要统计信息包装在一个函数中:

# Summary statistics of the non-missing values of x.
#
# Args:
#   x: numeric vector; NA entries are ignored.
#
# Returns:
#   A named numeric vector with elements sum, length, mean and sd, where
#   length is the number of non-missing values.
myStats <- function(x) {
  observed <- x[!is.na(x)]
  c(
    "sum" = sum(observed),
    "length" = length(observed),
    "mean" = mean(observed),
    "sd" = sd(observed)
  )
}

该函数返回一个带名称的摘要统计向量,其元素顺序与您输出中的顺序一致。然后,您可以用 rbind 把返回值及其名称追加到您的频率表中:

# Compute the summary statistics for question R, then append them to the
# frequency table as one (question, answer, value) row per statistic.
R.stats <- myStats(df[["R"]])
rbind(
  R.freq,
  data.frame(question = "R", answer = names(R.stats), value = R.stats)
)

是的,绝对可以简化。通常您会使用汇总函数,例如

# Frequency counts plus summary statistics for one answer column.
#
# Args:
#   x:      vector of answers; may contain NA.
#   levels: answer values that each get a frequency entry, even when their
#           count is 0.
#
# Returns:
#   A named numeric vector: one count per entry of `levels`, the count of
#   missing answers (named NA), then sum, length, mean and sd. sum, mean and
#   sd use the non-missing answers; length is the length of x including NAs.
smry <- function(x, levels) {
    non_missing <- na.omit(x)
    counts <- table(factor(x, levels = levels), useNA = "always", exclude = NULL)
    c(
        counts,
        sum = sum(non_missing),
        length = length(x),
        mean = mean(non_missing),
        sd = sqrt(var(non_missing))
    )
}

然后将其应用于数据的不同子集

> lapply(df[c('R', 'S')], smry, 0:10)
$R
        0         1         2         3         4         5         6         7 
 0.000000  1.000000  2.000000  1.000000  1.000000  1.000000  1.000000  1.000000 
        8         9        10      <NA>       sum    length      mean        sd 
 0.000000  1.000000  0.000000  1.000000 39.000000 10.000000  4.333333  2.645751 

$S
       0        1        2        3        4        5        6        7 
 0.00000  0.00000  0.00000  1.00000  0.00000  1.00000  1.00000  3.00000 
       8        9       10     <NA>      sum   length     mean       sd 
 1.00000  1.00000  0.00000  2.00000 52.00000 10.00000  6.50000  1.85164 

如果您绝对必须将所有内容放入数据框中

> as.data.frame(as.table(simplify2array(lapply(df[c('R', 'S')], smry, 0:10))))
     Var1 Var2      Freq
1       0    R  0.000000
2       1    R  1.000000
3       2    R  2.000000
4       3    R  1.000000
5       4    R  1.000000
6       5    R  1.000000
7       6    R  1.000000
8       7    R  1.000000
9       8    R  0.000000
10      9    R  1.000000
11     10    R  0.000000
12   <NA>    R  1.000000
13    sum    R 39.000000
14 length    R 10.000000
15   mean    R  4.333333
16     sd    R  2.645751
17      0    S  0.000000
18      1    S  0.000000
19      2    S  0.000000
20      3    S  1.000000
21      4    S  0.000000
22      5    S  1.000000
23      6    S  1.000000
24      7    S  3.000000
25      8    S  1.000000
26      9    S  1.000000
27     10    S  0.000000
28   <NA>    S  2.000000
29    sum    S 52.000000
30 length    S 10.000000
31   mean    S  6.500000
32     sd    S  1.851640

然后根据需要更改列名/添加列。