在ggplot中计算中位数的置信区间
Calculation of confidence intervals of the median in ggplot
我有一个数据集,其中包含每个 ID 具有多个观察值的虚拟人口。该人群分为两组。我想绘制每组的观察结果与时间的关系图。我可以使用 stat_summary
成功计算平均值和平均值的置信区间。然而,在第二个情节中,我想对中位数做同样的事情。 stat_summary(fun.y=median)
确实有效,但我找不到计算中位数正确置信区间的解决方案。我必须单独计算它们吗?
这是ggplot
代码的相关部分:
my.labels <- c("1250 mg/m²\nwith dose\nadjustments",
"1250 mg/m²\nwithout dose\nadjustments")
ggplot(data, aes(x=TIME, y=P2X))+
stat_summary(geom="ribbon", fun.data=mean_cl_boot,
fun.args=list(conf.int=0.95),
aes(fill=factor(GROUP),color=NA), alpha = .55)+
stat_summary(geom="line", fun.y=mean, aes(linetype=factor(GROUP)))+
scale_fill_manual("dose",labels=my.labels,values=c("#999999", "lightblue"))+
scale_linetype_manual("dose",labels=my.labels,values = c("solid","dashed"))+
scale_color_manual("dose",values=c("black", "black"))+labs(x="Time [cycle]",y="Probability [%]")+
scale_x_continuous(breaks = c(0,1,2,3,4,5,6))+ theme(legend.title=element_blank())+
scale_y_continuous(labels = function(x) paste0(x*100))
提前致谢!
我使用以下函数轻松添加 non-parametric 中位数的估计值和置信区间 stat_summary
:
quantile_cl <- function(y, q=0.5, conf.level = 0.95, na.rm=TRUE) {
alpha <- 1 - conf.level
if (na.rm) y <- y[!is.na(y)]
n <- length(y)
l <- qbinom(alpha/2, size=n, prob = q)
u <- 1 + n - l
ys <- sort.int(c(-Inf, y, Inf), partial = c(1 + l, 1 + u))
data.frame(
y = quantile(y, probs = q, na.rm=na.rm, type = 8),
ymin = ys[1 + l],
ymax = ys[1 + u]
)
}
median_cl <- function(y, conf.level=0.95, na.rm=TRUE) quantile_cl(y, q=0.5, conf.level=conf.level, na.rm=na.rm)
使用示例:
library(ggplot2)
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
geom_violin(aes(fill = Species), trim = FALSE) +
stat_summary(fun.data = mean_cl_normal, geom="crossbar", aes(color = "normal")) +
stat_summary(fun.data = median_cl, aes(color = "nonparametric"), size = 1) +
scale_color_manual(values = c("red", "black"), "method") +
NULL
无论真实分布如何,这些 non-parametric 中位数的置信区间几乎总是精确的,但如果您知道分布,则可以构建更好的估计值和区间。例如,如果您知道您的数据来自高斯 mean_cl_normal
将给出中位数的最佳估计值(不是您的样本而是基础分布)!
# coverage probability of ci
mean(replicate(1000, {res <- quantile_cl(rnorm(100), q=0.5, conf.level=0.8); res$ymin<=0 & res$ymax>=0}))
#> [1] 0.814
mean(replicate(1000, {res <- mean_cl_normal(rnorm(100), conf.int = 0.8); res$ymin<=0 & res$ymax>=0}))
#> [1] 0.804
# mean width of ci
mean(replicate(1000, {res <- quantile_cl(rnorm(100), q=0.5, conf.level=0.8); res$ymax-res$ymin}))
#> [1] 0.3276851
mean(replicate(1000, {res <- mean_cl_normal(rnorm(100), conf.int = 0.8); res$ymax-res$ymin}))
#> [1] 0.2574329
如果有需求,我会尽量把这个功能弄成Hmisc
(其中实现了mean_cl_boot
(smean.cl.normal
)等)
我有一个数据集,其中包含每个 ID 具有多个观察值的虚拟人口。该人群分为两组。我想绘制每组的观察结果与时间的关系图。我可以使用 stat_summary
成功计算平均值和平均值的置信区间。然而,在第二个情节中,我想对中位数做同样的事情。 stat_summary(fun.y=median)
确实有效,但我找不到计算中位数正确置信区间的解决方案。我必须单独计算它们吗?
这是ggplot
代码的相关部分:
my.labels <- c("1250 mg/m²\nwith dose\nadjustments",
"1250 mg/m²\nwithout dose\nadjustments")
ggplot(data, aes(x=TIME, y=P2X))+
stat_summary(geom="ribbon", fun.data=mean_cl_boot,
fun.args=list(conf.int=0.95),
aes(fill=factor(GROUP),color=NA), alpha = .55)+
stat_summary(geom="line", fun.y=mean, aes(linetype=factor(GROUP)))+
scale_fill_manual("dose",labels=my.labels,values=c("#999999", "lightblue"))+
scale_linetype_manual("dose",labels=my.labels,values = c("solid","dashed"))+
scale_color_manual("dose",values=c("black", "black"))+labs(x="Time [cycle]",y="Probability [%]")+
scale_x_continuous(breaks = c(0,1,2,3,4,5,6))+ theme(legend.title=element_blank())+
scale_y_continuous(labels = function(x) paste0(x*100))
提前致谢!
我使用以下函数轻松添加 non-parametric 中位数的估计值和置信区间 stat_summary
:
quantile_cl <- function(y, q=0.5, conf.level = 0.95, na.rm=TRUE) {
alpha <- 1 - conf.level
if (na.rm) y <- y[!is.na(y)]
n <- length(y)
l <- qbinom(alpha/2, size=n, prob = q)
u <- 1 + n - l
ys <- sort.int(c(-Inf, y, Inf), partial = c(1 + l, 1 + u))
data.frame(
y = quantile(y, probs = q, na.rm=na.rm, type = 8),
ymin = ys[1 + l],
ymax = ys[1 + u]
)
}
median_cl <- function(y, conf.level=0.95, na.rm=TRUE) quantile_cl(y, q=0.5, conf.level=conf.level, na.rm=na.rm)
使用示例:
library(ggplot2)
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
geom_violin(aes(fill = Species), trim = FALSE) +
stat_summary(fun.data = mean_cl_normal, geom="crossbar", aes(color = "normal")) +
stat_summary(fun.data = median_cl, aes(color = "nonparametric"), size = 1) +
scale_color_manual(values = c("red", "black"), "method") +
NULL
无论真实分布如何,这些 non-parametric 中位数的置信区间几乎总是精确的,但如果您知道分布,则可以构建更好的估计值和区间。例如,如果您知道您的数据来自高斯 mean_cl_normal
将给出中位数的最佳估计值(不是您的样本而是基础分布)!
# coverage probability of ci
mean(replicate(1000, {res <- quantile_cl(rnorm(100), q=0.5, conf.level=0.8); res$ymin<=0 & res$ymax>=0}))
#> [1] 0.814
mean(replicate(1000, {res <- mean_cl_normal(rnorm(100), conf.int = 0.8); res$ymin<=0 & res$ymax>=0}))
#> [1] 0.804
# mean width of ci
mean(replicate(1000, {res <- quantile_cl(rnorm(100), q=0.5, conf.level=0.8); res$ymax-res$ymin}))
#> [1] 0.3276851
mean(replicate(1000, {res <- mean_cl_normal(rnorm(100), conf.int = 0.8); res$ymax-res$ymin}))
#> [1] 0.2574329
如果有需求,我会尽量把这个功能弄成Hmisc
(其中实现了mean_cl_boot
(smean.cl.normal
)等)