如何在 R 中显示箱线图的各个点?
How do I show individual points of a boxplot in R?
我有 df1:
Name Y_N FIPS score1 score2
1: Alabama 0 1 2633 8
2: Alaska 0 2 382 1
3: Arizona 1 4 2695 41
4: Arkansas 1 5 2039 10
5: California 1 6 27813 524
6: Colorado 0 8 8609 133
7: Connecticut 1 9 5390 111
8: Delaware 0 10 858 3
9: Florida 1 12 14172 215
10: Georgia 1 13 9847 308
11: Hawaii 0 15 720 0
12: Idaho 1 16 845 7
我想执行 T 检验以查看 score1
是否与 Y_N
不同。然后我想把这两组数据画在一起进行对比。我制作了一个如下所示的箱线图:
不过,我希望我的图表看起来像下面这样(置信区间条除外):我现在想把箱线图改成显示所有单个数据点的图,再加上一条表示平均值的水平线和 95% 置信区间。这要怎么做?我还想在图表的一角添加 p 值的文本。
我可能会尝试:
# Add the p-value as a text label on the current plot.
# NOTE(review): `coef_lm` is not defined in this snippet; presumably it is the
# coefficient table of a fitted linear model -- confirm before running.
text(
  x = max(df1$Y_N) + 1,
  y = min(df1$score1) + 20000,
  # paste0() must be closed here; otherwise `pos` is swallowed as one of its
  # arguments and the text() call is never terminated.
  labels = paste0("\np-value = ", round(coef_lm[2, 4], 5)),
  pos = 4 # draw the label to the right of (x, y)
)
但我意识到 coef_lm[2,4]
是线性模型的检验结果,而不是 t 检验的结果。如何访问 t 检验的输出?
我不确定您为什么要在代码中添加那个额外的点。但是在您的原始数据上,您可能会使用 ggplot2
和 ggpubr
.
编辑
现在更接近你画的示意图了。
library(ggplot2)
library(ggpubr) # provides stat_compare_means()

# Jittered raw points per Y_N group, an error bar from mean_cl_normal, a red
# horizontal line at the group mean, and the t-test p-value as a label.
ggplot(df1, aes(x = as.factor(Y_N), y = score1)) +
  geom_jitter(position = position_jitter(0.1)) +
  # "mean_cl_normal" needs the Hmisc package to be installed.
  stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = 0.3) +
  # A zero-height error bar (ymin == ymax == mean) draws the mean line.
  # after_stat(y) replaces the deprecated `..y..` notation.
  stat_summary(
    fun = "mean", geom = "errorbar",
    aes(ymax = after_stat(y), ymin = after_stat(y)),
    col = "red", width = 0.5
  ) +
  stat_compare_means(method = "t.test") +
  xlab("Group") +
  ylab("Score 1")
原始数据
# Example data: 12 US states with a binary flag (Y_N), FIPS code and two
# scores. Name is a factor; row names are the labels "1:" to "12:".
df1 <- data.frame(
  Name = factor(
    seq_len(12L),
    labels = c(
      "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
      "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho"
    )
  ),
  Y_N = c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L),
  FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 15L, 16L),
  score1 = c(
    2633L, 382L, 2695L, 2039L, 27813L, 8609L,
    5390L, 858L, 14172L, 9847L, 720L, 845L
  ),
  score2 = c(8L, 1L, 41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L),
  row.names = paste0(seq_len(12L), ":")
)
或者,在不安装 ggpubr
的情况下,您可以在 ggplot2
之外计算 p 值,然后使用 annotate
函数将 pvalue 添加到图中:
library(ggplot2)

# Two-sample t-test of score1 between the Y_N groups; keep only the p-value.
pval <- t.test(score1 ~ Y_N, data = df)$p.value

# Box plot per group with the raw points jittered on top and the p-value
# written between the two groups, at 90% of the largest score.
ggplot(
  df,
  aes(
    x = as.factor(Y_N), y = score1,
    fill = as.factor(Y_N), color = as.factor(Y_N)
  )
) +
  geom_boxplot(alpha = 0.3, color = "black", outlier.shape = NA) +
  geom_jitter(show.legend = FALSE) +
  annotate(
    geom = "text",
    label = paste("p.value: ", round(pval, 3)),
    x = 1.5,
    y = max(df$score1) * 0.9
  )
编辑:没有箱线图
如果想去掉箱线图,只保留单个数据点和表示平均值的横线,可以先在一个新数据集中计算每组的平均值(这里我使用 dplyr
包来完成):
library(dplyr)

# Mean of score1 for each Y_N group, one row per group.
Mean_df <- df %>%
  group_by(Y_N) %>%
  summarise(Mean = mean(score1))
# A tibble: 2 x 2
Y_N Mean
<int> <dbl>
1 0 2640.
2 1 8972.
然后,您可以使用 geom_jitter
绘制单个数据点,并调用新数据集 Mean_df
,使用 geom_errorbar
绘制平均值:
library(ggplot2)

# Raw points jittered per group, a red bar at each group mean (taken from
# Mean_df), and the p-value label between the two groups.
ggplot(df, aes(x = as.factor(Y_N), y = score1)) +
  geom_jitter(show.legend = FALSE, width = 0.2) +
  # ymin == ymax == Mean collapses the error bar into a horizontal line.
  geom_errorbar(
    inherit.aes = FALSE,
    data = Mean_df,
    aes(x = as.factor(Y_N), ymin = Mean, ymax = Mean),
    color = "red",
    width = 0.2
  ) +
  annotate(
    geom = "text",
    label = paste("p.value: ", round(pval, 3)),
    x = 1.5,
    y = max(df$score1) * 0.9
  )
可重现的例子
# Reproducible example data used by the snippets of this answer. It is
# assigned to `df` because those snippets refer to `df`; without the
# assignment `df` would resolve to the function stats::df.
df <- structure(list(Name = c("Alabama", "Alaska", "Arizona", "Arkansas",
"California", "Colorado", "Connecticut", "Delaware", "Florida",
"Georgia", "Hawaii", "Idaho"), Y_N = c(0L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 0L, 1L), FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L,
10L, 12L, 13L, 15L, 16L), score1 = c(2633L, 382L, 2695L, 2039L,
27813L, 8609L, 5390L, 858L, 14172L, 9847L, 720L, 845L), score2 = c(8L,
1L, 41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L)), row.names = c(NA,
-12L), class = c("data.table", "data.frame"))
您的一个问题与如何访问 t.test 统计信息有关。这是该问题的答案。假设您有这种类型的数据:
# Simulated data: 100 observations with a random 0/1 group (YN) and a random
# integer score between 500 and 1500. The seed makes the draw reproducible.
set.seed(12)
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
YN <- sample(0:1, 100, replace = TRUE)
score1 <- sample(500:1500, 100, replace = TRUE)
df <- data.frame(YN, score1)
并进一步假设您像这样运行并存储 t.test:
# Run t.test() on the scores of each YN group separately and keep the
# results, indexed by group label.
test <- with(df, tapply(score1, YN, t.test))
然后您可以像这样逐项访问检验统计信息,此处以因子水平 0
为例进行说明:
# `test` holds one t.test() result per YN group. Each result comes from
# t.test() applied to a single group's scores (a one-sample test), not from a
# comparison between the groups. Backticks are required because `0` is not a
# syntactic R name.
test$`0`$p.value # p-value for group 0
test$`0`$conf.int # confidence interval for group 0
test$`0`$estimate # estimate for group 0
test$`0`$statistic # test statistic for group 0
现在显然你不想一点一点地手动完成,而是以一种更自动化和系统化的方式。您可以通过以下方式实现这一点:
# Collapse the per-group t.test() results into a single numeric matrix with
# one row per group.
df1 <- do.call(
  rbind,
  lapply(test, function(res) {
    c(
      statistic = unname(res[["statistic"]]),
      # conf.int has two values, so c() names them ci1 and ci2
      ci = unname(res[["conf.int"]]),
      est = unname(res[["estimate"]]),
      pval = unname(res[["p.value"]])
    )
  })
)
输出是这样的:
statistic ci1 ci2 est pval
0 22.31155 837.3901 1003.263 920.3265 5.484012e-27
1 22.91558 870.5426 1037.810 954.1765 3.543693e-28
# Example data for the base-graphics answer: 12 states with a binary flag
# (Y_N), FIPS code and two scores.
dd <- structure(
  list(
    Name = c(
      "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
      "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho"
    ),
    Y_N = c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L),
    FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 15L, 16L),
    score1 = c(
      2633L, 382L, 2695L, 2039L, 27813L, 8609L,
      5390L, 858L, 14172L, 9847L, 720L, 845L
    ),
    score2 = c(8L, 1L, 41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L)
  ),
  row.names = c(NA, -12L),
  class = c("data.table", "data.frame")
)
## frame: draw the boxplot without outlines (border = NA) to set up the axes
boxplot(score1 ~ Y_N, dd, border = NA)

## central 95% range (2.5% and 97.5% sample quantiles) and medians per group
## NOTE: this is a quantile range of the data, not a confidence interval of
## the mean; for the mean with its 95% CI use mean(x) and t.test(x)$conf.int.
sp <- split(dd$score1, dd$Y_N)
# A for loop is used because the body is run only for its drawing side
# effects; sapply() would also echo a list of NULLs at top level.
for (ii in seq_along(sp)) {
  x <- sp[[ii]]
  arrows(
    ii, quantile(x, 0.025), ii, quantile(x, 0.975),
    code = 3, angle = 90, length = 0.1
  )
  segments(ii - 0.05, median(x), ii + 0.05, col = "red", lwd = 2)
}

## individual points: Y_N is coded 0/1, so Y_N + 1 gives x positions 1 and 2
## (and colours 1 and 2)
points(dd$Y_N + 1, dd$score1, col = dd$Y_N + 1)

## t-test: p-value right-aligned at the right edge of the plot region
lbl <- sprintf("p = %s", format.pval(t.test(score1 ~ Y_N, dd)$p.value, digits = 2))
mtext(lbl, at = par("usr")[2], adj = 1)
我有 df1:
Name Y_N FIPS score1 score2
1: Alabama 0 1 2633 8
2: Alaska 0 2 382 1
3: Arizona 1 4 2695 41
4: Arkansas 1 5 2039 10
5: California 1 6 27813 524
6: Colorado 0 8 8609 133
7: Connecticut 1 9 5390 111
8: Delaware 0 10 858 3
9: Florida 1 12 14172 215
10: Georgia 1 13 9847 308
11: Hawaii 0 15 720 0
12: Idaho 1 16 845 7
我想执行 T 检验以查看 score1
是否与 Y_N
不同。然后我想把这两组数据画在一起进行对比。我制作了一个如下所示的箱线图:
不过,我希望我的图表看起来像下面这样(置信区间条除外):
我可能会尝试:
# Add the p-value as a text label on the current plot.
# NOTE(review): `coef_lm` is not defined in this snippet; presumably it is the
# coefficient table of a fitted linear model -- confirm before running.
text(
  x = max(df1$Y_N) + 1,
  y = min(df1$score1) + 20000,
  # paste0() must be closed here; otherwise `pos` is swallowed as one of its
  # arguments and the text() call is never terminated.
  labels = paste0("\np-value = ", round(coef_lm[2, 4], 5)),
  pos = 4 # draw the label to the right of (x, y)
)
但我意识到 coef_lm[2,4]
是线性模型的检验结果,而不是 t 检验的结果。如何访问 t 检验的输出?
我不确定您为什么要在代码中添加那个额外的点。但是在您的原始数据上,您可能会使用 ggplot2
和 ggpubr
.
编辑:现在更接近你画的示意图了。
library(ggplot2)
library(ggpubr) # provides stat_compare_means()

# Jittered raw points per Y_N group, an error bar from mean_cl_normal, a red
# horizontal line at the group mean, and the t-test p-value as a label.
ggplot(df1, aes(x = as.factor(Y_N), y = score1)) +
  geom_jitter(position = position_jitter(0.1)) +
  # "mean_cl_normal" needs the Hmisc package to be installed.
  stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = 0.3) +
  # A zero-height error bar (ymin == ymax == mean) draws the mean line.
  # after_stat(y) replaces the deprecated `..y..` notation.
  stat_summary(
    fun = "mean", geom = "errorbar",
    aes(ymax = after_stat(y), ymin = after_stat(y)),
    col = "red", width = 0.5
  ) +
  stat_compare_means(method = "t.test") +
  xlab("Group") +
  ylab("Score 1")
原始数据
# Example data: 12 US states with a binary flag (Y_N), FIPS code and two
# scores. Name is a factor; row names are the labels "1:" to "12:".
df1 <- data.frame(
  Name = factor(
    seq_len(12L),
    labels = c(
      "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
      "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho"
    )
  ),
  Y_N = c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L),
  FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 15L, 16L),
  score1 = c(
    2633L, 382L, 2695L, 2039L, 27813L, 8609L,
    5390L, 858L, 14172L, 9847L, 720L, 845L
  ),
  score2 = c(8L, 1L, 41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L),
  row.names = paste0(seq_len(12L), ":")
)
或者,在不安装 ggpubr
的情况下,您可以在 ggplot2
之外计算 p 值,然后使用 annotate
函数将 pvalue 添加到图中:
library(ggplot2)

# Two-sample t-test of score1 between the Y_N groups; keep only the p-value.
pval <- t.test(score1 ~ Y_N, data = df)$p.value

# Box plot per group with the raw points jittered on top and the p-value
# written between the two groups, at 90% of the largest score.
ggplot(
  df,
  aes(
    x = as.factor(Y_N), y = score1,
    fill = as.factor(Y_N), color = as.factor(Y_N)
  )
) +
  geom_boxplot(alpha = 0.3, color = "black", outlier.shape = NA) +
  geom_jitter(show.legend = FALSE) +
  annotate(
    geom = "text",
    label = paste("p.value: ", round(pval, 3)),
    x = 1.5,
    y = max(df$score1) * 0.9
  )
编辑:没有箱线图
如果想去掉箱线图,只保留单个数据点和表示平均值的横线,可以先在一个新数据集中计算每组的平均值(这里我使用 dplyr
包来完成):
library(dplyr)

# Mean of score1 for each Y_N group, one row per group.
Mean_df <- df %>%
  group_by(Y_N) %>%
  summarise(Mean = mean(score1))
# A tibble: 2 x 2
Y_N Mean
<int> <dbl>
1 0 2640.
2 1 8972.
然后,您可以使用 geom_jitter
绘制单个数据点,并调用新数据集 Mean_df
,使用 geom_errorbar
绘制平均值:
library(ggplot2)

# Raw points jittered per group, a red bar at each group mean (taken from
# Mean_df), and the p-value label between the two groups.
ggplot(df, aes(x = as.factor(Y_N), y = score1)) +
  geom_jitter(show.legend = FALSE, width = 0.2) +
  # ymin == ymax == Mean collapses the error bar into a horizontal line.
  geom_errorbar(
    inherit.aes = FALSE,
    data = Mean_df,
    aes(x = as.factor(Y_N), ymin = Mean, ymax = Mean),
    color = "red",
    width = 0.2
  ) +
  annotate(
    geom = "text",
    label = paste("p.value: ", round(pval, 3)),
    x = 1.5,
    y = max(df$score1) * 0.9
  )
可重现的例子
# Reproducible example data used by the snippets of this answer. It is
# assigned to `df` because those snippets refer to `df`; without the
# assignment `df` would resolve to the function stats::df.
df <- structure(list(Name = c("Alabama", "Alaska", "Arizona", "Arkansas",
"California", "Colorado", "Connecticut", "Delaware", "Florida",
"Georgia", "Hawaii", "Idaho"), Y_N = c(0L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 0L, 1L), FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L,
10L, 12L, 13L, 15L, 16L), score1 = c(2633L, 382L, 2695L, 2039L,
27813L, 8609L, 5390L, 858L, 14172L, 9847L, 720L, 845L), score2 = c(8L,
1L, 41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L)), row.names = c(NA,
-12L), class = c("data.table", "data.frame"))
您的一个问题与如何访问 t.test 统计信息有关。这是该问题的答案。假设您有这种类型的数据:
# Simulated data: 100 observations with a random 0/1 group (YN) and a random
# integer score between 500 and 1500. The seed makes the draw reproducible.
set.seed(12)
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
YN <- sample(0:1, 100, replace = TRUE)
score1 <- sample(500:1500, 100, replace = TRUE)
df <- data.frame(YN, score1)
并进一步假设您像这样运行并存储 t.test:
# Run t.test() on the scores of each YN group separately and keep the
# results, indexed by group label.
test <- with(df, tapply(score1, YN, t.test))
然后您可以像这样逐项访问检验统计信息,此处以因子水平 0
为例:
# `test` holds one t.test() result per YN group. Each result comes from
# t.test() applied to a single group's scores (a one-sample test), not from a
# comparison between the groups. Backticks are required because `0` is not a
# syntactic R name.
test$`0`$p.value # p-value for group 0
test$`0`$conf.int # confidence interval for group 0
test$`0`$estimate # estimate for group 0
test$`0`$statistic # test statistic for group 0
现在显然你不想一点一点地手动完成,而是以一种更自动化和系统化的方式。您可以通过以下方式实现这一点:
# Collapse the per-group t.test() results into a single numeric matrix with
# one row per group.
df1 <- do.call(
  rbind,
  lapply(test, function(res) {
    c(
      statistic = unname(res[["statistic"]]),
      # conf.int has two values, so c() names them ci1 and ci2
      ci = unname(res[["conf.int"]]),
      est = unname(res[["estimate"]]),
      pval = unname(res[["p.value"]])
    )
  })
)
输出是这样的:
statistic ci1 ci2 est pval
0 22.31155 837.3901 1003.263 920.3265 5.484012e-27
1 22.91558 870.5426 1037.810 954.1765 3.543693e-28
# Example data for the base-graphics answer: 12 states with a binary flag
# (Y_N), FIPS code and two scores.
dd <- structure(
  list(
    Name = c(
      "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
      "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho"
    ),
    Y_N = c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L),
    FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 15L, 16L),
    score1 = c(
      2633L, 382L, 2695L, 2039L, 27813L, 8609L,
      5390L, 858L, 14172L, 9847L, 720L, 845L
    ),
    score2 = c(8L, 1L, 41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L)
  ),
  row.names = c(NA, -12L),
  class = c("data.table", "data.frame")
)
## frame: draw the boxplot without outlines (border = NA) to set up the axes
boxplot(score1 ~ Y_N, dd, border = NA)

## central 95% range (2.5% and 97.5% sample quantiles) and medians per group
## NOTE: this is a quantile range of the data, not a confidence interval of
## the mean; for the mean with its 95% CI use mean(x) and t.test(x)$conf.int.
sp <- split(dd$score1, dd$Y_N)
# A for loop is used because the body is run only for its drawing side
# effects; sapply() would also echo a list of NULLs at top level.
for (ii in seq_along(sp)) {
  x <- sp[[ii]]
  arrows(
    ii, quantile(x, 0.025), ii, quantile(x, 0.975),
    code = 3, angle = 90, length = 0.1
  )
  segments(ii - 0.05, median(x), ii + 0.05, col = "red", lwd = 2)
}

## individual points: Y_N is coded 0/1, so Y_N + 1 gives x positions 1 and 2
## (and colours 1 and 2)
points(dd$Y_N + 1, dd$score1, col = dd$Y_N + 1)

## t-test: p-value right-aligned at the right edge of the plot region
lbl <- sprintf("p = %s", format.pval(t.test(score1 ~ Y_N, dd)$p.value, digits = 2))
mtext(lbl, at = par("usr")[2], adj = 1)