如何为带有计数的汇总表创建箱线图?
How to create a boxplot for summary table with counts?
我有一个汇总表,其中包含两组的分数计数。我想创建一个箱形图来查看分数的分布、均值和标准差 (SD)。
我见过的示例都使用逐条列出原始值的数据集,而我使用的数据集是各分数的汇总计数。我该如何创建箱形图,以便查看两组各自的分数分布?
示例数据如下:
score count_new count_prev
1: 0 1 48
2: 1 NA 13
3: 2 1 412
4: 3 NA 237
5: 4 NA 169
6: 5 2 88
7: 6 1026 60
8: 7 587 50
9: 8 471 28
10: 9 266 22
11: 10 181 10
12: 11 115 5
13: 12 67 2
14: 13 34 NA
15: 14 15 NA
16: 15 8 1
17: 16 6 NA
18: 17 2 NA
# dput() of the example table: one row per score (0-17) with the number of
# observations in the "new" and "prev" groups; NA marks a score with no count.
structure(
  list(
    score = 0:17,
    count_new = c(
      1L, NA, 1L, NA, NA, 2L, 1026L, 587L, 471L,
      266L, 181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L
    ),
    count_prev = c(
      48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L,
      22L, 10L, 5L, 2L, NA, NA, 1L, NA, NA
    )
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -18L)
)
这是每组的平均值和标准差。我想看看分数是如何分布的。
sd avg grp
1: 1.88 7 new
2: 2.11 3 prev
下面是一种使用 pivot_longer 将数据框整理成适合绘图的格式的方法:
library(tidyverse)

# Summary table in long format: one row per (score, group) pair, with the
# group label in `name` and the number of observations in `value`.
df <- structure(
  list(
    score = 0:17,
    new = c(
      1L, NA, 1L, NA, NA, 2L, 1026L, 587L, 471L, 266L,
      181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L
    ),
    prev = c(
      48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L,
      22L, 10L, 5L, 2L, NA, NA, 1L, NA, NA
    )
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -18L)
) %>%
  tidyr::pivot_longer(cols = c("new", "prev"))

# `value` holds counts, so plotting it directly would show the distribution of
# the counts rather than of the scores. Rebuild one row per observation first:
# drop the rows with a missing count (they contribute no observations) and
# repeat every remaining row `value` times.
scores <- df %>%
  tidyr::drop_na(value) %>%
  tidyr::uncount(value)

# One box per group, summarising the scores weighted by their counts.
boxplot(score ~ name, data = scores)
新答案
我不完全确定你到底想要什么。也许你想要这样的东西:
# Tukey's five-number summary (min, lower hinge, median, upper hinge, max)
# of the score column; computed once and reused for positions and labels.
score_summary <- fivenum(df$score)
# Single horizontal boxplot of the score column with full-width staples.
boxplot(
  df$score,
  main = "Boxplot",
  xlab = "Scores",
  horizontal = TRUE,
  staplewex = 1
)
# Print each summary value above the box (the box itself sits at y = 1).
text(x = score_summary, y = 1.25, labels = score_summary)
输出:
您的均值与中位数相同:
> mean(df$score)
[1] 8.5
旧答案
您可以像这样创建两个 geom_boxplot:
library(tidyverse)

# Draws one box per count column, side by side on a shared axis.
# NOTE(review): assumes `df` is the wide table with `new` and `prev` columns
# holding the counts -- confirm against where `df` is defined.
ggplot(data = df) +
  geom_boxplot(mapping = aes(x = "new", y = new)) +
  geom_boxplot(mapping = aes(x = "prev", y = prev)) +
  labs(x = "groups", y = "count") +
  theme_minimal()
输出:
您可以根据每个类别的计数重新生成原始数据:
# Replace NAs by 0s: a missing count is treated as zero observations.
# Assigning the integer 0L keeps the count columns integer-typed.
data$count_new[is.na(data$count_new)] <- 0L
data$count_prev[is.na(data$count_prev)] <- 0L
# Regenerate data according to counts: repeat every score as many times as
# it was counted. rep(x, times = n) also works when the table has no rows,
# which indexing through 1:nrow(data) does not.
new <- rep(data$score, times = data$count_new)
prev <- rep(data$score, times = data$count_prev)
# Long table with one row per regenerated observation and its group label.
regenerated <- data.frame(
  type = rep(c("prev", "new"), times = c(length(prev), length(new))),
  value = c(prev, new)
)
sd(prev)
#[1] 2.110576
sd(new)
#[1] 1.881819
mean(prev)
#[1] 3.484716
mean(new)
#[1] 7.627606
boxplot(value ~ type, data = regenerated)
使用 uncount() 重新生成原始分数:
library(tidyr)
library(dplyr)
library(ggplot2)

# Summary table: for each score, the number of observations in each group.
counts <- structure(
  list(
    score = 0:17,
    count_new = c(
      1L, NA, 1L, NA, NA, 2L, 1026L, 587L, 471L,
      266L, 181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L
    ),
    count_prev = c(
      48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L,
      22L, 10L, 5L, 2L, NA, NA, 1L, NA, NA
    )
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -18L)
)

# One row per "new" observation: a missing count is treated as zero, then
# uncount() repeats each score `count_new` times and drops the count column.
count_new <- counts %>%
  select(score, count_new) %>%
  mutate(count_new = coalesce(count_new, 0L)) %>%
  uncount(weights = count_new)

# Same expansion for the "prev" group.
count_prev <- counts %>%
  select(score, count_prev) %>%
  mutate(count_prev = coalesce(count_prev, 0L)) %>%
  uncount(weights = count_prev)

# Stack both groups; `.id` tags the rows "1" (new) and "2" (prev) by input
# position, and the factor labels turn those tags into group names.
stacked <- bind_rows(count_new, count_prev, .id = "when") %>%
  mutate(when = factor(when, labels = c("new", "prev")))

# One box per group over the regenerated scores.
ggplot(stacked, aes(x = when, y = score)) +
  geom_boxplot()
由 reprex 包 (v2.0.1) 于 2022-05-24 创建
我有一个汇总表,其中包含两组的分数计数。我想创建一个箱形图来查看分数的分布、均值和标准差 (SD)。
我见过的示例都使用逐条列出原始值的数据集,而我使用的数据集是各分数的汇总计数。我该如何创建箱形图,以便查看两组各自的分数分布?
示例数据如下:
score count_new count_prev
1: 0 1 48
2: 1 NA 13
3: 2 1 412
4: 3 NA 237
5: 4 NA 169
6: 5 2 88
7: 6 1026 60
8: 7 587 50
9: 8 471 28
10: 9 266 22
11: 10 181 10
12: 11 115 5
13: 12 67 2
14: 13 34 NA
15: 14 15 NA
16: 15 8 1
17: 16 6 NA
18: 17 2 NA
# dput() of the example table: one row per score (0-17) with the number of
# observations in the "new" and "prev" groups; NA marks a score with no count.
structure(
  list(
    score = 0:17,
    count_new = c(
      1L, NA, 1L, NA, NA, 2L, 1026L, 587L, 471L,
      266L, 181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L
    ),
    count_prev = c(
      48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L,
      22L, 10L, 5L, 2L, NA, NA, 1L, NA, NA
    )
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -18L)
)
这是每组的平均值和标准差。我想看看分数是如何分布的。
sd avg grp
1: 1.88 7 new
2: 2.11 3 prev
下面是一种使用 pivot_longer 将数据框整理成适合绘图的格式的方法:
library(tidyverse)

# Summary table in long format: one row per (score, group) pair, with the
# group label in `name` and the number of observations in `value`.
df <- structure(
  list(
    score = 0:17,
    new = c(
      1L, NA, 1L, NA, NA, 2L, 1026L, 587L, 471L, 266L,
      181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L
    ),
    prev = c(
      48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L,
      22L, 10L, 5L, 2L, NA, NA, 1L, NA, NA
    )
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -18L)
) %>%
  tidyr::pivot_longer(cols = c("new", "prev"))

# `value` holds counts, so plotting it directly would show the distribution of
# the counts rather than of the scores. Rebuild one row per observation first:
# drop the rows with a missing count (they contribute no observations) and
# repeat every remaining row `value` times.
scores <- df %>%
  tidyr::drop_na(value) %>%
  tidyr::uncount(value)

# One box per group, summarising the scores weighted by their counts.
boxplot(score ~ name, data = scores)
新答案
我不完全确定你到底想要什么。也许你想要这样的东西:
# Tukey's five-number summary (min, lower hinge, median, upper hinge, max)
# of the score column; computed once and reused for positions and labels.
score_summary <- fivenum(df$score)
# Single horizontal boxplot of the score column with full-width staples.
boxplot(
  df$score,
  main = "Boxplot",
  xlab = "Scores",
  horizontal = TRUE,
  staplewex = 1
)
# Print each summary value above the box (the box itself sits at y = 1).
text(x = score_summary, y = 1.25, labels = score_summary)
输出:
您的均值与中位数相同:
> mean(df$score)
[1] 8.5
旧答案
您可以像这样创建两个 geom_boxplot:
library(tidyverse)

# Draws one box per count column, side by side on a shared axis.
# NOTE(review): assumes `df` is the wide table with `new` and `prev` columns
# holding the counts -- confirm against where `df` is defined.
ggplot(data = df) +
  geom_boxplot(mapping = aes(x = "new", y = new)) +
  geom_boxplot(mapping = aes(x = "prev", y = prev)) +
  labs(x = "groups", y = "count") +
  theme_minimal()
输出:
您可以根据每个类别的计数重新生成原始数据:
# Replace NAs by 0s: a missing count is treated as zero observations.
# Assigning the integer 0L keeps the count columns integer-typed.
data$count_new[is.na(data$count_new)] <- 0L
data$count_prev[is.na(data$count_prev)] <- 0L
# Regenerate data according to counts: repeat every score as many times as
# it was counted. rep(x, times = n) also works when the table has no rows,
# which indexing through 1:nrow(data) does not.
new <- rep(data$score, times = data$count_new)
prev <- rep(data$score, times = data$count_prev)
# Long table with one row per regenerated observation and its group label.
regenerated <- data.frame(
  type = rep(c("prev", "new"), times = c(length(prev), length(new))),
  value = c(prev, new)
)
sd(prev)
#[1] 2.110576
sd(new)
#[1] 1.881819
mean(prev)
#[1] 3.484716
mean(new)
#[1] 7.627606
boxplot(value ~ type, data = regenerated)
使用 uncount() 重新生成原始分数:
library(tidyr)
library(dplyr)
library(ggplot2)

# Summary table: for each score, the number of observations in each group.
counts <- structure(
  list(
    score = 0:17,
    count_new = c(
      1L, NA, 1L, NA, NA, 2L, 1026L, 587L, 471L,
      266L, 181L, 115L, 67L, 34L, 15L, 8L, 6L, 2L
    ),
    count_prev = c(
      48L, 13L, 412L, 237L, 169L, 88L, 60L, 50L, 28L,
      22L, 10L, 5L, 2L, NA, NA, 1L, NA, NA
    )
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -18L)
)

# One row per "new" observation: a missing count is treated as zero, then
# uncount() repeats each score `count_new` times and drops the count column.
count_new <- counts %>%
  select(score, count_new) %>%
  mutate(count_new = coalesce(count_new, 0L)) %>%
  uncount(weights = count_new)

# Same expansion for the "prev" group.
count_prev <- counts %>%
  select(score, count_prev) %>%
  mutate(count_prev = coalesce(count_prev, 0L)) %>%
  uncount(weights = count_prev)

# Stack both groups; `.id` tags the rows "1" (new) and "2" (prev) by input
# position, and the factor labels turn those tags into group names.
stacked <- bind_rows(count_new, count_prev, .id = "when") %>%
  mutate(when = factor(when, labels = c("new", "prev")))

# One box per group over the regenerated scores.
ggplot(stacked, aes(x = when, y = score)) +
  geom_boxplot()
由 reprex 包 (v2.0.1) 于 2022-05-24 创建