摘要统计节省时间

Time Saver on Summary Stats

我的目标不是交叉制表,而是生成一张包含数据集中所有变量描述性统计信息(本例中为计数和百分比)的表,这样在处理新数据集时,就不必把 table(变量名称) 重复键入大约 20 次。我计划用 xtable + knitr 导出这张表。不幸的是,我的循环有问题,任何帮助都将不胜感激。这个问题对那些需要反复提供汇总统计数据的人很有意义,因为它能节省大量时间。很抱歉第三次问这个问题 :) 我知道 table() 函数——事实上,我在下面多次用到它。简而言之,下面这个函数应当显示各变量的名称、水平、计数和百分比,而无需对数据集中的每个变量分别调用 table():

# Example data: three two-level ("Y"/"N") categorical variables with
# 20 observations each, collected into one data frame.
ESRD <- rep(c("Y", "N"), times = c(10, 10))
DIABETES <- rep(rep(c("Y", "N"), each = 5), times = 2)
BLAH <- rep(c("Y", "N"), times = c(10, 10))
categoricalvariables <- data.frame(
  ESRD = ESRD,
  DIABETES = DIABETES,
  BLAH = BLAH
)

#' Build a count/percentage summary for every column of a data set.
#'
#' For each column a header row holding the column name is emitted, followed
#' by one row per level with the level label, its count and its percentage
#' (rounded to two decimals and suffixed with " %").
#'
#' Two defects of the earlier loop-based version are fixed here: its inner
#' loops reused the outer index `i`, and `return()` sat inside the outer loop,
#' so only a single variable was ever written and every other row was left as
#' "0". The result is also no longer limited to variables with exactly two
#' levels.
#'
#' @param VARIABLEMATRIX A data frame or matrix of categorical variables,
#'   one variable per column.
#' @return A character matrix with four columns (variable name, level, count,
#'   percentage) and no dimnames; it has zero rows when there are no columns.
descriptives <- function(VARIABLEMATRIX) {
  n_vars <- ncol(VARIABLEMATRIX)
  var_names <- colnames(VARIABLEMATRIX)
  if (is.null(var_names)) {
    # Unnamed matrix input: fall back to positional labels.
    var_names <- paste0("V", seq_len(n_vars))
  }

  # Header row plus one row per level for column `j`.
  summarise_column <- function(j) {
    counts <- table(VARIABLEMATRIX[, j, drop = TRUE])
    header <- c(var_names[j], "", "", "")
    if (length(counts) == 0L) {
      # No observed levels (e.g. an all-NA column): keep just the header row.
      return(matrix(header, nrow = 1L))
    }
    percents <- paste(round(as.vector(prop.table(counts)) * 100, 2), "%")
    level_rows <- cbind(
      "", names(counts), as.vector(counts), percents,
      deparse.level = 0
    )
    rbind(header, level_rows, deparse.level = 0)
  }

  blocks <- lapply(seq_len(n_vars), summarise_column)
  if (length(blocks) == 0L) {
    return(matrix(character(0), nrow = 0L, ncol = 4L))
  }

  desc <- do.call(rbind, blocks)
  dimnames(desc) <- NULL
  desc
}
# Run the summary on the example data frame; called at top level, the
# returned matrix is printed to the console.
descriptives(categoricalvariables)

我得到的输出是(显然有一个错误,但我不确定哪里出了问题):

     [,1]       [,2] [,3] [,4]  
 [1,] "0"        "0"  "0"  "0"   
 [2,] "0"        "0"  "0"  "0"   
 [3,] "0"        "0"  "0"  "0"   
 [4,] "DIABETES" ""   ""   ""    
 [5,] ""         "N"  "10" "50 %"
 [6,] ""         "Y"  "10" "50 %"
 [7,] "0"        "0"  "0"  "0"   
 [8,] "0"        "0"  "0"  "0"   
 [9,] "0"        "0"  "0"  "0"  

预期的输出应该是:

     [,1]       [,2] [,3] [,4]  
 [1,] "ESRD"     ""   ""   ""     
 [2,] ""         "N"  "10" "50 %" 
 [3,] ""         "Y"  "10" "50 %"   
 [4,] "DIABETES" ""   ""   ""    
 [5,] ""         "N"  "10" "50 %"
 [6,] ""         "Y"  "10" "50 %"
 [7,] "BLAH"     ""   ""   ""     
 [8,] ""         "N"  "10" "50 %"  
 [9,] ""         "Y"  "10" "50 %"

以下是一些使用 tidyverse 函数的选项:

library(tidyverse)

# Reshape to long format (one row per variable/value observation), count each
# variable/value pair, and express every count as a share of its variable's
# total. `pivot_longer()` replaces the superseded `gather()`, and the
# per-variable grouping is spelled out instead of relying on `tally()` to
# drop one grouping level implicitly.
categoricalvariables %>%
  pivot_longer(everything(), names_to = "Measure", values_to = "Value") %>%
  count(Measure, Value) %>%
  group_by(Measure) %>%
  mutate(Percent = n / sum(n)) %>%
  ungroup()
   Measure Value     n Percent
1     BLAH     N    10     0.5
2     BLAH     Y    10     0.5
3 DIABETES     N    10     0.5
4 DIABETES     Y    10     0.5
5     ESRD     N    10     0.5
6     ESRD     Y    10     0.5
# Same counts and per-variable shares as above, but spread to one row per
# variable with columns n_<value> and Percent_<value>. A single
# `pivot_wider()` replaces the gather/unite/spread round trip, so the counts
# stay integer instead of being coerced to double alongside the percentages.
categoricalvariables %>%
  pivot_longer(everything(), names_to = "Measure", values_to = "Value") %>%
  count(Measure, Value) %>%
  group_by(Measure) %>%
  mutate(Percent = n / sum(n)) %>%
  ungroup() %>%
  pivot_wider(names_from = Value, values_from = c(n, Percent))
   Measure   n_N   n_Y Percent_N Percent_Y
1     BLAH    10    10       0.5       0.5
2 DIABETES    10    10       0.5       0.5
3     ESRD    10    10       0.5       0.5

我保留了数字格式的数据,以备您进行进一步处理。

要整理出一张便于导出的表,可以这样做:

# Export-ready table: one row per variable/value pair with its count and a
# formatted percentage of the variable's total. The variable name is blanked
# on repeated rows so that it appears only once per variable.
tab <- categoricalvariables %>%
  pivot_longer(everything(), names_to = "Measure", values_to = "Value") %>%
  count(Measure, Value, name = "Count") %>%
  group_by(Measure) %>%
  # "%%" is a literal percent sign inside the sprintf format.
  mutate(Percent = sprintf("%1.1f%%", Count / sum(Count) * 100)) %>%
  ungroup() %>%
  mutate(Measure = if_else(duplicated(Measure), "", Measure))
   Measure Value Count Percent
1     BLAH     N    10   50.0%
2              Y    10   50.0%
3 DIABETES     N    10   50.0%
4              Y    10   50.0%
5     ESRD     N    10   50.0%
6              Y    10   50.0%

现在您可以对 tab 运行 xtable:

library(xtable)

# Print the xtable rendering of `tab` without the row-name column. `align`
# carries one letter per column plus a leading one for the row names, hence
# five letters for the four columns of `tab`.
print(xtable(tab, align="llcrr"), include.rownames=FALSE)

rmarkdown 文档输出到 PDF 时看起来像这样:

如果你有想要汇总的数字列,你可以这样做,例如(使用内置的 iris 数据框):

# Mean, min and max of every numeric column of iris per species, reshaped to
# one row per species/measurement with one column per statistic.
# `across()` replaces the deprecated `summarise_all(funs(...))`; it names the
# summary columns "<column>_<stat>", which `pivot_longer()` splits on "_".
iris %>%
  group_by(Species) %>%
  summarise(
    across(everything(), list(mean = mean, min = min, max = max)),
    .groups = "drop"
  ) %>%
  pivot_longer(
    -Species,
    names_to = c("Measure", "Stat"),
    names_sep = "_"
  ) %>%
  # Sort the statistic columns and the rows so the layout matches the
  # alphabetical order that `spread()` used to produce.
  pivot_wider(names_from = Stat, values_from = value, names_sort = TRUE) %>%
  arrange(Species, Measure)

您可能还需要进一步重塑或重新格式化结果,以便输出成表格,但这个例子可以让您了解能做到什么。

      Species      Measure   max  mean   min
1      setosa Petal.Length   1.9 1.462   1.0
2      setosa  Petal.Width   0.6 0.246   0.1
3      setosa Sepal.Length   5.8 5.006   4.3
4      setosa  Sepal.Width   4.4 3.428   2.3
5  versicolor Petal.Length   5.1 4.260   3.0
6  versicolor  Petal.Width   1.8 1.326   1.0
7  versicolor Sepal.Length   7.0 5.936   4.9
8  versicolor  Sepal.Width   3.4 2.770   2.0
9   virginica Petal.Length   6.9 5.552   4.5
10  virginica  Petal.Width   2.5 2.026   1.4
11  virginica Sepal.Length   7.9 6.588   4.9
12  virginica  Sepal.Width   3.8 2.974   2.2

descriptr 包中的 oway_tables 函数创建多个单向表。以下是您示例的输出:

> ESRD <- rep(c("Y", "N"), each=10)
> DIABETES <- rep(c("Y", "N", "Y", "N"), c(5, 5, 5, 5))
> BLAH <- rep(c("Y", "N"), each=10)
> categoricalvariables <- data.frame(ESRD, DIABETES, BLAH)
> descriptr::oway_tables(categoricalvariables)

                           Variable: ESRD                                
|--------------------------------------------------------------------------|
|                                Cumulative                    Cumulative  |
|    Levels    |  Frequency   |   Frequency  |   Percent    |    Percent   |
|--------------------------------------------------------------------------|
|       N      |      10      |      10      |      50      |      50      |
|--------------------------------------------------------------------------|
|       Y      |      10      |      20      |      50      |      100     |
|--------------------------------------------------------------------------|


                         Variable: DIABETES                              
|--------------------------------------------------------------------------|
|                                Cumulative                    Cumulative  |
|    Levels    |  Frequency   |   Frequency  |   Percent    |    Percent   |
|--------------------------------------------------------------------------|
|       N      |      10      |      10      |      50      |      50      |
|--------------------------------------------------------------------------|
|       Y      |      10      |      20      |      50      |      100     |
|--------------------------------------------------------------------------|


                           Variable: BLAH                                
|--------------------------------------------------------------------------|
|                                Cumulative                    Cumulative  |
|    Levels    |  Frequency   |   Frequency  |   Percent    |    Percent   |
|--------------------------------------------------------------------------|
|       N      |      10      |      10      |      50      |      50      |
|--------------------------------------------------------------------------|
|       Y      |      10      |      20      |      50      |      100     |
|--------------------------------------------------------------------------|

函数文档链接:oway_tables。