如何为带有计数的汇总表创建箱线图?

How to create a boxplot for summary table with counts?

我有一个汇总表,其中包含两组在每个分数上的计数。我想创建一个箱线图来查看分数的分布、均值和标准差。

我见过的示例使用的都是逐条列出原始值的数据集,而我使用的数据集是分数的汇总计数。我该如何创建箱线图,以便查看这两组的分数分布情况?

示例数据如下:

    score  count_new count_prev
 1:     0    1   48
 2:     1   NA   13
 3:     2    1  412
 4:     3   NA  237
 5:     4   NA  169
 6:     5    2   88
 7:     6 1026   60
 8:     7  587   50
 9:     8  471   28
10:     9  266   22
11:    10  181   10
12:    11  115    5
13:    12   67    2
14:    13   34   NA
15:    14   15   NA
16:    15    8    1
17:    16    6   NA
18:    17    2   NA

# structure() literal reproducing the example table above: `score` runs 0-17,
# and `count_new` / `count_prev` hold the per-group count for each score
# (NA where no count is listed). The expression is not assigned to a name here.
structure(list(score = 0:17, count_new = c(1L, NA, 1L, NA, NA, 2L, 
1026L, 587L, 471L, 266L, 181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L
), count_prev = c(48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L, 22L, 
10L, 5L, 2L, NA, NA, 1L, NA, NA)), class = c("data.table", "data.frame"
), row.names = c(NA, -18L))

这是每组的平均值和标准差。我想看看分数是如何分布的。

     sd avg grp
1: 1.88   7  new
2: 2.11   3  prev

这是一种使用 pivot_longer 以正确的绘图格式呈现数据框的方法:

library(tidyverse)

# Reshape the wide summary table (one count column per group) into long
# format: one row per score/group pair, with the group label in `name` and
# the count in `value`.
df <- structure(list(score = 0:17, 
                     new = c(1L, NA, 1L, NA, NA, 2L, 1026L, 587L, 471L, 266L, 
                             181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L), 
                     prev = c(48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L, 
                              22L, 10L, 5L, 2L, NA, NA, 1L, NA, NA)), 
                class = c("data.table", "data.frame"), 
                row.names = c(NA, -18L)) %>%
  tidyr::pivot_longer(cols = c("new", "prev"))

# `value` holds counts, so plotting it directly would show the distribution
# of the counts rather than of the scores. Repeat each score `value` times
# (treating a missing count as zero occurrences) to recover one row per
# observation before plotting.
scores <- df %>%
  dplyr::mutate(value = tidyr::replace_na(value, 0L)) %>%
  tidyr::uncount(value)

# One box per group, showing the distribution of the scores themselves.
boxplot(score ~ name, data = scores)

新答案

我不完全确定你到底想要什么。也许你想要这样的东西:

# Horizontal boxplot of the `score` column, annotated with its five-number
# summary (each value is drawn at its own x position, at y = 1.25).
score_summary <- fivenum(df$score)
boxplot(
  df$score,
  horizontal = TRUE,
  staplewex = 1,
  xlab = "Scores",
  main = "Boxplot"
)
text(x = score_summary, y = 1.25, labels = score_summary)

输出:

您的均值与中位数相同:

> mean(df$score)
[1] 8.5

旧答案

您可以像这样创建两个 geom_boxplot

library(tidyverse)

# NOTE(review): assumes `df` is a wide table with one column per group
# (`new`, `prev`) -- confirm against where `df` is built.
# Draw one box per group column, each at its own fixed x position.
ggplot(data = df) +
  geom_boxplot(mapping = aes(x = "new", y = new)) +
  geom_boxplot(mapping = aes(x = "prev", y = prev)) +
  labs(x = "groups", y = "count") +
  theme_minimal()

输出:

您可以根据每个类别的计数重新生成原始数据:

# Replace NAs by 0s (a missing count is treated as zero occurrences);
# 0L keeps the count columns integer.
data$count_new[is.na(data$count_new)] <- 0L
data$count_prev[is.na(data$count_prev)] <- 0L

# Regenerate the individual scores by repeating each score `count` times.
# Calling rep() on the score column directly avoids indexing through
# 1:nrow(data), which yields c(1, 0) when the table has no rows.
new <- rep(data$score, times = data$count_new)
prev <- rep(data$score, times = data$count_prev)

# Long-format table: one row per regenerated observation, labelled by group.
regenerated <- data.frame(
  type = rep(c("prev", "new"), times = c(length(prev), length(new))),
  value = c(prev, new)
)

sd(prev)
#[1] 2.110576
sd(new)
#[1] 1.881819
mean(prev)
#[1] 3.484716
mean(new)
#[1] 7.627606

boxplot(value ~ type, data = regenerated)

使用 uncount() 重新生成原始分数:

library(tidyr)
library(dplyr)
library(ggplot2)

# Example summary table: one row per score, with the count for that score in
# each group (NA where no count is listed).
counts <- structure(
  list(
    score = 0:17,
    count_new = c(
      1L, NA, 1L, NA, NA, 2L, 1026L, 587L, 471L,
      266L, 181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L
    ),
    count_prev = c(
      48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L,
      22L, 10L, 5L, 2L, NA, NA, 1L, NA, NA
    )
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -18L)
)

# Expand each group's counts back into one row per observation; a missing
# count is treated as zero rows. uncount() drops the weight column, leaving
# only `score`.
count_new <- counts %>%
  select(score, count_new) %>%
  mutate(count_new = replace_na(count_new, 0L)) %>%
  uncount(count_new)

count_prev <- counts %>%
  select(score, count_prev) %>%
  mutate(count_prev = replace_na(count_prev, 0L)) %>%
  uncount(count_prev)

# Stack both groups, tagging each row with its group in `when`, then draw one
# box per group.
list(new = count_new, prev = count_prev) %>%
  bind_rows(.id = "when") %>%
  mutate(when = factor(when, levels = c("new", "prev"))) %>%
  ggplot(aes(x = when, y = score)) +
  geom_boxplot()

由 reprex package (v2.0.1) 于 2022-05-24 创建