如何找到给定分布的区间概率?
How to find interval prbability for a given distribution?
假设我有一些数据并将它们拟合到 gamma
分布,如何找到 Pr(1 < x <= 1.5)
的区间概率,其中 x 是样本外数据点?
require(fitdistrplus)
a <- c(2.44121289,1.70292449,0.30550832,0.04332383,1.0553436,0.26912546,0.43590885,0.84514809,
0.36762336,0.94935435,1.30887437,1.08761895,0.66581035,0.83108270,1.7567334,1.00241339,
0.96263021,1.67488277,0.87400413,0.34639636,1.16804671,1.4182144,1.7378907,1.7462686,
1.7427784,0.8377457,0.1428738,0.71473956,0.8458882,0.2140742,0.9663167,0.7933085,
0.0475603,1.8657773,0.18307362,1.13519144)
fit <- fitdist(a, "gamma",lower = c(0, 0))
您可以只使用 pgamma
和 fit
中的估计参数。
b <- fit$estimate
# shape rate
#1.739679 1.815995
pgamma(1.5, b[1], b[2]) - pgamma(1, b[1], b[2])
# [1] 0.1896032
Thanks. But how about P(x > 2)
?
查看 lower.tail
参数:
pgamma(q, shape, rate = 1, scale = 1/rate, lower.tail = TRUE, log.p = FALSE)
默认情况下,pgamma(q)
计算 Pr(x <= q)
。设置 lower.tail = FALSE
得到 Pr(x > q)
。所以你可以这样做:
pgamma(2, b[1], b[2], lower.tail = FALSE)
# [1] 0.08935687
或者您也可以使用
1 - pgamma(2, b[1], b[2])
# [1] 0.08935687
这里有一个示例,它使用 MCMC 技术和贝叶斯推理模式来估计新观测值落在区间 (1:1.5) 中的后验概率。这是一个无条件估计,与通过将伽马分布与最大似然参数估计相结合获得的条件估计相反。
此代码需要在您的计算机上安装 JAGS(免费且易于安装)。
library(rjags)
a <- c(2.44121289,1.70292449,0.30550832,0.04332383,1.0553436,0.26912546,0.43590885,0.84514809,
0.36762336,0.94935435,1.30887437,1.08761895,0.66581035,0.83108270,1.7567334,1.00241339,
0.96263021,1.67488277,0.87400413,0.34639636,1.16804671,1.4182144,1.7378907,1.7462686,
1.7427784,0.8377457,0.1428738,0.71473956,0.8458882,0.2140742,0.9663167,0.7933085,
0.0475603,1.8657773,0.18307362,1.13519144)
# Specify the model in JAGS language using diffuse priors for shape and scale
sink("GammaModel.txt")
cat("model{
# Priors
shape ~ dgamma(.001,.001)
rate ~ dgamma(.001,.001)
# Model structure
for(i in 1:n){
a[i] ~ dgamma(shape, rate)
}
}
", fill=TRUE)
sink()
jags.data <- list(a=a, n=length(a))
# Give overdispersed initial values (not important for this simple model, but very important if running complicated models where you need to check convergence by monitoring multiple chains)
inits <- function(){list(shape=runif(1,0,10), rate=runif(1,0,10))}
# Specify which parameters to monitor
params <- c("shape", "rate")
# Set-up for MCMC run
nc <- 1 # number of chains
n.adapt <-1000 # number of adaptation steps
n.burn <- 1000 # number of burn-in steps
n.iter <- 500000 # number of posterior samples
thin <- 10 # thinning of posterior samples
# Running the model
gamma_mod <- jags.model('GammaModel.txt', data = jags.data, inits=inits, n.chains=nc, n.adapt=n.adapt)
update(gamma_mod, n.burn)
gamma_samples <- coda.samples(gamma_mod,params,n.iter=n.iter, thin=thin)
# Summarize the result
summary(gamma_samples)
# Compute improper (non-normalized) probability distribution for x
x <- rep(NA, 50000)
for(i in 1:50000){
x[i] <- rgamma(1, gamma_samples[[1]][i,1], rate = gamma_samples[[1]][i,2])
}
# Find which values of x fall in the desired range and normalize.
length(which(x>1 & x < 1.5))/length(x)
回答:
Pr(1 < x <= 1.5) = 0.194
非常接近条件估计,但不能保证通常是这种情况。
有人不喜欢我上面这种以MLE为条件的做法;现在让我们看看无条件的东西。如果我们采用直接积分,我们需要三重积分:一个用于 shape
,一个用于 rate
,最后一个用于 x
。这没有吸引力。我只会生成 Monte Carlo 估算值。
根据中心极限定理,MLE 服从正态分布。 fitdistrplus::fitdist
不给出标准错误,但我们可以使用 MASS::fitdistr
来执行精确推理。
fit <- fitdistr(a, "gamma", lower = c(0,0))
b <- fit$estimate
# shape rate
#1.739737 1.816134
V <- fit$vcov ## covariance
shape rate
shape 0.1423679 0.1486193
rate 0.1486193 0.2078086
现在我们想从参数分布中抽样,得到目标概率的样本。
set.seed(0)
## sample from bivariate normal with mean `b` and covariance `V`
## Cholesky method is used here
X <- matrix(rnorm(1000 * 2), 1000) ## 1000 `N(0, 1)` normal samples
R <- chol(V) ## upper triangular Cholesky factor of `V`
X <- X %*% R ## transform X under desired covariance
X <- X + b ## shift to desired mean
## you can use `cov(X)` to check it is very close to `V`
## now samples for `Pr(1 < x < 1.5)`
p <- pgamma(1.5, X[,1], X[,2]) - pgamma(1, X[,1], X[,2])
我们可以制作 p
的直方图(如果需要,也可以进行密度估计):
hist(p, prob = TRUE)
现在,我们通常需要预测变量的样本均值:
mean(p)
# [1] 0.1906975
假设我有一些数据并将它们拟合到 gamma
分布,如何找到 Pr(1 < x <= 1.5)
的区间概率,其中 x 是样本外数据点?
require(fitdistrplus)
a <- c(2.44121289,1.70292449,0.30550832,0.04332383,1.0553436,0.26912546,0.43590885,0.84514809,
0.36762336,0.94935435,1.30887437,1.08761895,0.66581035,0.83108270,1.7567334,1.00241339,
0.96263021,1.67488277,0.87400413,0.34639636,1.16804671,1.4182144,1.7378907,1.7462686,
1.7427784,0.8377457,0.1428738,0.71473956,0.8458882,0.2140742,0.9663167,0.7933085,
0.0475603,1.8657773,0.18307362,1.13519144)
fit <- fitdist(a, "gamma",lower = c(0, 0))
您可以只使用 pgamma
和 fit
中的估计参数。
b <- fit$estimate
# shape rate
#1.739679 1.815995
pgamma(1.5, b[1], b[2]) - pgamma(1, b[1], b[2])
# [1] 0.1896032
Thanks. But how about
P(x > 2)
?
查看 lower.tail
参数:
pgamma(q, shape, rate = 1, scale = 1/rate, lower.tail = TRUE, log.p = FALSE)
默认情况下,pgamma(q)
计算 Pr(x <= q)
。设置 lower.tail = FALSE
得到 Pr(x > q)
。所以你可以这样做:
pgamma(2, b[1], b[2], lower.tail = FALSE)
# [1] 0.08935687
或者您也可以使用
1 - pgamma(2, b[1], b[2])
# [1] 0.08935687
这里有一个示例,它使用 MCMC 技术和贝叶斯推理模式来估计新观测值落在区间 (1:1.5) 中的后验概率。这是一个无条件估计,与通过将伽马分布与最大似然参数估计相结合获得的条件估计相反。
此代码需要在您的计算机上安装 JAGS(免费且易于安装)。
library(rjags)
a <- c(2.44121289,1.70292449,0.30550832,0.04332383,1.0553436,0.26912546,0.43590885,0.84514809,
0.36762336,0.94935435,1.30887437,1.08761895,0.66581035,0.83108270,1.7567334,1.00241339,
0.96263021,1.67488277,0.87400413,0.34639636,1.16804671,1.4182144,1.7378907,1.7462686,
1.7427784,0.8377457,0.1428738,0.71473956,0.8458882,0.2140742,0.9663167,0.7933085,
0.0475603,1.8657773,0.18307362,1.13519144)
# Specify the model in JAGS language using diffuse priors for shape and scale
sink("GammaModel.txt")
cat("model{
# Priors
shape ~ dgamma(.001,.001)
rate ~ dgamma(.001,.001)
# Model structure
for(i in 1:n){
a[i] ~ dgamma(shape, rate)
}
}
", fill=TRUE)
sink()
jags.data <- list(a=a, n=length(a))
# Give overdispersed initial values (not important for this simple model, but very important if running complicated models where you need to check convergence by monitoring multiple chains)
inits <- function(){list(shape=runif(1,0,10), rate=runif(1,0,10))}
# Specify which parameters to monitor
params <- c("shape", "rate")
# Set-up for MCMC run
nc <- 1 # number of chains
n.adapt <-1000 # number of adaptation steps
n.burn <- 1000 # number of burn-in steps
n.iter <- 500000 # number of posterior samples
thin <- 10 # thinning of posterior samples
# Running the model
gamma_mod <- jags.model('GammaModel.txt', data = jags.data, inits=inits, n.chains=nc, n.adapt=n.adapt)
update(gamma_mod, n.burn)
gamma_samples <- coda.samples(gamma_mod,params,n.iter=n.iter, thin=thin)
# Summarize the result
summary(gamma_samples)
# Compute improper (non-normalized) probability distribution for x
x <- rep(NA, 50000)
for(i in 1:50000){
x[i] <- rgamma(1, gamma_samples[[1]][i,1], rate = gamma_samples[[1]][i,2])
}
# Find which values of x fall in the desired range and normalize.
length(which(x>1 & x < 1.5))/length(x)
回答:
Pr(1 < x <= 1.5) = 0.194
非常接近条件估计,但不能保证通常是这种情况。
有人不喜欢我上面这种以MLE为条件的做法;现在让我们看看无条件的东西。如果我们采用直接积分,我们需要三重积分:一个用于 shape
,一个用于 rate
,最后一个用于 x
。这没有吸引力。我只会生成 Monte Carlo 估算值。
根据中心极限定理,MLE 服从正态分布。 fitdistrplus::fitdist
不给出标准错误,但我们可以使用 MASS::fitdistr
来执行精确推理。
fit <- fitdistr(a, "gamma", lower = c(0,0))
b <- fit$estimate
# shape rate
#1.739737 1.816134
V <- fit$vcov ## covariance
shape rate
shape 0.1423679 0.1486193
rate 0.1486193 0.2078086
现在我们想从参数分布中抽样,得到目标概率的样本。
set.seed(0)
## sample from bivariate normal with mean `b` and covariance `V`
## Cholesky method is used here
X <- matrix(rnorm(1000 * 2), 1000) ## 1000 `N(0, 1)` normal samples
R <- chol(V) ## upper triangular Cholesky factor of `V`
X <- X %*% R ## transform X under desired covariance
X <- X + b ## shift to desired mean
## you can use `cov(X)` to check it is very close to `V`
## now samples for `Pr(1 < x < 1.5)`
p <- pgamma(1.5, X[,1], X[,2]) - pgamma(1, X[,1], X[,2])
我们可以制作 p
的直方图(如果需要,也可以进行密度估计):
hist(p, prob = TRUE)
现在,我们通常需要预测变量的样本均值:
mean(p)
# [1] 0.1906975