使用 optim,ML 将 Gamma 分布拟合到 R 中的数据
Fitting Gamma distribution to data in R using optim, ML
我对 R 有点陌生。我有一个数据集,其中还包括家庭收入数据,我必须使用最大似然估计对这些数据进行 Gamma 分布拟合。它特别告诉我们需要使用包 optim,而不是 fitdistr。所以这是我的代码:
t1 <- sum(log(newdata$faminc))
t2 <- sum(newdata$faminc)
obs <- nrow(newdata)
lh.gamma <- function(par) {
-((par[1]-1)*t1 - par[2]*t2 - obs*par[1]*log(par[2]) - obs*lgamma(par[1]))
}
#initial guess for a = mean^2(x)/var(x) and b = mean(x) / var(x)
a1 <- (mean(newdata$faminc))^2/var(newdata$faminc)
b1 <- mean(newdata$faminc)/var(newdata$faminc)
init <- c(a1,b1)
q <- optim(init, lh.gamma, method = "BFGS")
q
还尝试在初始向量中仅填充值,并包括这段代码;
dlh.gamma <- function(par){
cbind(obs*digamma(par[1])+obs*log(par[2])-t2,
obs*par[1]/par[2]-1/par[2]^2*t1)
}
然后优化看起来像:
q <- optim(init, lh.gamma, dhl.gamma, method="BFGS")
None 其中 'works'。首先,当我在学校计算机上尝试代码时,它为我提供了非常大的形状和速率参数数字,这是不可能的。现在,在家里尝试,我明白了:
> q <- optim(init, lh.gamma, method = "BFGS")
Error in optim(init, lh.gamma, method = "BFGS") :
non-finite finite-difference value [2]
In addition: There were 50 or more warnings (use warnings() to see the first 50)
> q
function (save = "default", status = 0, runLast = TRUE)
.Internal(quit(save, status, runLast))
<bytecode: 0x000000000eaac960>
<environment: namespace:base>
q 甚至不是 'created'。除了当我在上面包含 dlh.gamma 部分时,但随后我又得到了巨大的数字并且没有收敛。
有人知道 wrong/what 做什么吗?
编辑:
> dput(sample(newdata$faminc, 500))
c(42.5, 87.5, 22.5, 17.5, 12.5, 30, 30, 17.5, 42.5, 62.5, 62.5,
30, 30, 150, 22.5, 30, 42.5, 30, 17.5, 8.75, 42.5, 42.5, 42.5,
62.5, 42.5, 30, 17.5, 87.5, 62.5, 150, 42.5, 150, 42.5, 42.5,
42.5, 6.25, 62.5, 87.5, 6.25, 87.5, 30, 150, 22.5, 62.5, 42.5,
150, 17.5, 42.5, 42.5, 42.5, 62.5, 22.5, 42.5, 42.5, 30, 62.5,
30, 62.5, 87.5, 87.5, 42.5, 22.5, 62.5, 22.5, 8.75, 30, 30, 17.5,
87.5, 8.75, 62.5, 30, 17.5, 22.5, 62.5, 42.5, 30, 17.5, 62.5,
8.75, 62.5, 42.5, 150, 30, 62.5, 87.5, 17.5, 62.5, 30, 62.5,
87.5, 42.5, 62.5, 30, 62.5, 42.5, 87.5, 150, 12.5, 42.5, 62.5,
42.5, 62.5, 62.5, 150, 30, 87.5, 12.5, 17.5, 42.5, 62.5, 30,
6.25, 62.5, 42.5, 12.5, 62.5, 8.75, 17.5, 42.5, 62.5, 87.5, 8.75,
62.5, 30, 62.5, 87.5, 42.5, 62.5, 62.5, 12.5, 150, 42.5, 62.5,
12.5, 62.5, 42.5, 62.5, 62.5, 87.5, 42.5, 62.5, 30, 42.5, 150,
42.5, 30, 62.5, 62.5, 87.5, 42.5, 30, 62.5, 62.5, 42.5, 42.5,
30, 62.5, 42.5, 42.5, 62.5, 62.5, 150, 42.5, 30, 42.5, 62.5,
17.5, 62.5, 17.5, 150, 8.75, 62.5, 30, 62.5, 42.5, 42.5, 22.5,
150, 62.5, 42.5, 62.5, 62.5, 22.5, 30, 62.5, 30, 150, 42.5, 42.5,
42.5, 62.5, 30, 12.5, 30, 150, 12.5, 8.75, 22.5, 30, 22.5, 30,
42.5, 42.5, 42.5, 30, 12.5, 62.5, 42.5, 30, 22.5, 42.5, 87.5,
22.5, 12.5, 42.5, 62.5, 62.5, 62.5, 30, 42.5, 30, 62.5, 30, 62.5,
12.5, 22.5, 42.5, 22.5, 87.5, 30, 22.5, 17.5, 42.5, 62.5, 17.5,
250, 150, 42.5, 30, 42.5, 30, 62.5, 17.5, 87.5, 22.5, 150, 62.5,
42.5, 6.25, 87.5, 62.5, 42.5, 30, 42.5, 62.5, 42.5, 87.5, 62.5,
150, 42.5, 30, 6.25, 22.5, 30, 42.5, 42.5, 62.5, 250, 8.75, 150,
42.5, 30, 42.5, 30, 42.5, 42.5, 30, 30, 150, 22.5, 62.5, 30,
8.75, 150, 62.5, 87.5, 150, 42.5, 30, 42.5, 42.5, 42.5, 30, 8.75,
42.5, 42.5, 30, 22.5, 62.5, 17.5, 62.5, 62.5, 42.5, 8.75, 42.5,
12.5, 12.5, 150, 42.5, 42.5, 17.5, 42.5, 62.5, 62.5, 42.5, 42.5,
30, 42.5, 62.5, 30, 62.5, 42.5, 42.5, 42.5, 22.5, 62.5, 62.5,
62.5, 22.5, 150, 62.5, 42.5, 62.5, 42.5, 30, 30, 62.5, 22.5,
62.5, 87.5, 62.5, 42.5, 42.5, 22.5, 62.5, 62.5, 30, 42.5, 42.5,
8.75, 87.5, 42.5, 42.5, 87.5, 30, 62.5, 17.5, 62.5, 42.5, 17.5,
22.5, 62.5, 8.75, 62.5, 22.5, 22.5, 22.5, 42.5, 17.5, 22.5, 62.5,
42.5, 42.5, 42.5, 42.5, 42.5, 30, 30, 8.75, 30, 42.5, 62.5, 22.5,
6.25, 30, 42.5, 62.5, 17.5, 62.5, 42.5, 8.75, 22.5, 30, 17.5,
22.5, 62.5, 42.5, 150, 87.5, 22.5, 12.5, 62.5, 62.5, 62.5, 30,
42.5, 22.5, 62.5, 87.5, 30, 42.5, 62.5, 22.5, 87.5, 30, 30, 22.5,
87.5, 87.5, 250, 30, 62.5, 250, 62.5, 42.5, 42.5, 62.5, 62.5,
42.5, 6.25, 62.5, 62.5, 62.5, 42.5, 42.5, 150, 62.5, 62.5, 30,
150, 22.5, 87.5, 30, 150, 17.5, 8.75, 62.5, 42.5, 62.5, 150,
42.5, 22.5, 42.5, 42.5, 17.5, 62.5, 17.5, 62.5, 42.5, 150, 250,
22.5, 42.5, 30, 62.5, 62.5, 42.5, 42.5, 30, 150, 150, 42.5, 17.5,
17.5, 42.5, 8.75, 62.5, 42.5, 42.5, 22.5, 150, 62.5, 30, 250,
62.5, 87.5, 62.5, 8.75, 62.5, 30, 30, 8.75, 17.5, 17.5, 150,
22.5, 62.5, 62.5, 42.5)
faminc变量以1000s为单位
编辑 2:
好的,代码很好,但现在我尝试使用以下方法在直方图上拟合分布:
x <- rgamma(500,shape=q$par[1],scale=q$par[2])
hist(newdata$faminc, prob = TRUE)
curve(dgamma(x, shape=q$par[1], scale=q$par[2]), add=TRUE, col='blue')
它只会在 x 轴上产生一条蓝色的平线。
你有一些我没能解决的事情,但这里有一个估计的演示。
让我们从生成一些数据开始(这样我们就知道优化是否有效)。我只是在下面更改了您的优化功能,并使用了 Nelder-Mead 而不是拟牛顿。
set.seed(23)
a <- 2 # shape
b <- 3 # rate
require(data.table)
newdata <- data.table(faminc = rgamma(10000, a, b))
t1 <- sum(log(newdata$faminc))
t2 <- sum(newdata$faminc)
obs <- nrow(newdata)
llf <- function(x){
a <- x[1]
b <- x[2]
# log-likelihood function
return( - ((a - 1) * t1 - b * t2 - obs * a * log(1/b) - obs * log(gamma(a))))
}
# initial guess for a = mean^2(x)/var(x) and b = mean(x) / var(x)
a1 <- (mean(newdata$faminc))^2/var(newdata$faminc)
b1 <- mean(newdata$faminc)/var(newdata$faminc)
q <- optim(c(a1, b1), llf)
q$par
[1] 2.024353 3.019376
我会说我们非常接近。
你的数据:
(est <- q$par)
[1] 2.21333613 0.04243384
theoretical <- data.table(true = rgamma(10000, est[1], est[2]))
library(ggplot2)
ggplot(newdata, aes(x = faminc)) + geom_density() + geom_density(data = theoretical, aes(x = true, colour = "red")) + theme(legend.position = "none")
不是很好,但对于 500 obs 来说是合理的。
对 OP 编辑 2 的回应:
您应该更仔细地查看您正在使用的函数,curve
接受函数参数,而不是向量值:
gamma_density = function(x, a, b) ((b^a)/gamma(a)) * (x^(a - 1)) * exp(-b * x)
hist(newdata$faminc, prob = TRUE, ylim = c(0, 0.015))
curve(gamma_density(x, a = q$par[1], b = q$par[2]), add=TRUE, col='blue')
我对 R 有点陌生。我有一个数据集,其中还包括家庭收入数据,我必须使用最大似然估计对这些数据进行 Gamma 分布拟合。它特别告诉我们需要使用包 optim,而不是 fitdistr。所以这是我的代码:
t1 <- sum(log(newdata$faminc))
t2 <- sum(newdata$faminc)
obs <- nrow(newdata)
lh.gamma <- function(par) {
-((par[1]-1)*t1 - par[2]*t2 - obs*par[1]*log(par[2]) - obs*lgamma(par[1]))
}
#initial guess for a = mean^2(x)/var(x) and b = mean(x) / var(x)
a1 <- (mean(newdata$faminc))^2/var(newdata$faminc)
b1 <- mean(newdata$faminc)/var(newdata$faminc)
init <- c(a1,b1)
q <- optim(init, lh.gamma, method = "BFGS")
q
还尝试在初始向量中仅填充值,并包括这段代码;
dlh.gamma <- function(par){
cbind(obs*digamma(par[1])+obs*log(par[2])-t2,
obs*par[1]/par[2]-1/par[2]^2*t1)
}
然后优化看起来像:
q <- optim(init, lh.gamma, dhl.gamma, method="BFGS")
None 其中 'works'。首先,当我在学校计算机上尝试代码时,它为我提供了非常大的形状和速率参数数字,这是不可能的。现在,在家里尝试,我明白了:
> q <- optim(init, lh.gamma, method = "BFGS")
Error in optim(init, lh.gamma, method = "BFGS") :
non-finite finite-difference value [2]
In addition: There were 50 or more warnings (use warnings() to see the first 50)
> q
function (save = "default", status = 0, runLast = TRUE)
.Internal(quit(save, status, runLast))
<bytecode: 0x000000000eaac960>
<environment: namespace:base>
q 甚至不是 'created'。除了当我在上面包含 dlh.gamma 部分时,但随后我又得到了巨大的数字并且没有收敛。
有人知道 wrong/what 做什么吗?
编辑:
> dput(sample(newdata$faminc, 500))
c(42.5, 87.5, 22.5, 17.5, 12.5, 30, 30, 17.5, 42.5, 62.5, 62.5,
30, 30, 150, 22.5, 30, 42.5, 30, 17.5, 8.75, 42.5, 42.5, 42.5,
62.5, 42.5, 30, 17.5, 87.5, 62.5, 150, 42.5, 150, 42.5, 42.5,
42.5, 6.25, 62.5, 87.5, 6.25, 87.5, 30, 150, 22.5, 62.5, 42.5,
150, 17.5, 42.5, 42.5, 42.5, 62.5, 22.5, 42.5, 42.5, 30, 62.5,
30, 62.5, 87.5, 87.5, 42.5, 22.5, 62.5, 22.5, 8.75, 30, 30, 17.5,
87.5, 8.75, 62.5, 30, 17.5, 22.5, 62.5, 42.5, 30, 17.5, 62.5,
8.75, 62.5, 42.5, 150, 30, 62.5, 87.5, 17.5, 62.5, 30, 62.5,
87.5, 42.5, 62.5, 30, 62.5, 42.5, 87.5, 150, 12.5, 42.5, 62.5,
42.5, 62.5, 62.5, 150, 30, 87.5, 12.5, 17.5, 42.5, 62.5, 30,
6.25, 62.5, 42.5, 12.5, 62.5, 8.75, 17.5, 42.5, 62.5, 87.5, 8.75,
62.5, 30, 62.5, 87.5, 42.5, 62.5, 62.5, 12.5, 150, 42.5, 62.5,
12.5, 62.5, 42.5, 62.5, 62.5, 87.5, 42.5, 62.5, 30, 42.5, 150,
42.5, 30, 62.5, 62.5, 87.5, 42.5, 30, 62.5, 62.5, 42.5, 42.5,
30, 62.5, 42.5, 42.5, 62.5, 62.5, 150, 42.5, 30, 42.5, 62.5,
17.5, 62.5, 17.5, 150, 8.75, 62.5, 30, 62.5, 42.5, 42.5, 22.5,
150, 62.5, 42.5, 62.5, 62.5, 22.5, 30, 62.5, 30, 150, 42.5, 42.5,
42.5, 62.5, 30, 12.5, 30, 150, 12.5, 8.75, 22.5, 30, 22.5, 30,
42.5, 42.5, 42.5, 30, 12.5, 62.5, 42.5, 30, 22.5, 42.5, 87.5,
22.5, 12.5, 42.5, 62.5, 62.5, 62.5, 30, 42.5, 30, 62.5, 30, 62.5,
12.5, 22.5, 42.5, 22.5, 87.5, 30, 22.5, 17.5, 42.5, 62.5, 17.5,
250, 150, 42.5, 30, 42.5, 30, 62.5, 17.5, 87.5, 22.5, 150, 62.5,
42.5, 6.25, 87.5, 62.5, 42.5, 30, 42.5, 62.5, 42.5, 87.5, 62.5,
150, 42.5, 30, 6.25, 22.5, 30, 42.5, 42.5, 62.5, 250, 8.75, 150,
42.5, 30, 42.5, 30, 42.5, 42.5, 30, 30, 150, 22.5, 62.5, 30,
8.75, 150, 62.5, 87.5, 150, 42.5, 30, 42.5, 42.5, 42.5, 30, 8.75,
42.5, 42.5, 30, 22.5, 62.5, 17.5, 62.5, 62.5, 42.5, 8.75, 42.5,
12.5, 12.5, 150, 42.5, 42.5, 17.5, 42.5, 62.5, 62.5, 42.5, 42.5,
30, 42.5, 62.5, 30, 62.5, 42.5, 42.5, 42.5, 22.5, 62.5, 62.5,
62.5, 22.5, 150, 62.5, 42.5, 62.5, 42.5, 30, 30, 62.5, 22.5,
62.5, 87.5, 62.5, 42.5, 42.5, 22.5, 62.5, 62.5, 30, 42.5, 42.5,
8.75, 87.5, 42.5, 42.5, 87.5, 30, 62.5, 17.5, 62.5, 42.5, 17.5,
22.5, 62.5, 8.75, 62.5, 22.5, 22.5, 22.5, 42.5, 17.5, 22.5, 62.5,
42.5, 42.5, 42.5, 42.5, 42.5, 30, 30, 8.75, 30, 42.5, 62.5, 22.5,
6.25, 30, 42.5, 62.5, 17.5, 62.5, 42.5, 8.75, 22.5, 30, 17.5,
22.5, 62.5, 42.5, 150, 87.5, 22.5, 12.5, 62.5, 62.5, 62.5, 30,
42.5, 22.5, 62.5, 87.5, 30, 42.5, 62.5, 22.5, 87.5, 30, 30, 22.5,
87.5, 87.5, 250, 30, 62.5, 250, 62.5, 42.5, 42.5, 62.5, 62.5,
42.5, 6.25, 62.5, 62.5, 62.5, 42.5, 42.5, 150, 62.5, 62.5, 30,
150, 22.5, 87.5, 30, 150, 17.5, 8.75, 62.5, 42.5, 62.5, 150,
42.5, 22.5, 42.5, 42.5, 17.5, 62.5, 17.5, 62.5, 42.5, 150, 250,
22.5, 42.5, 30, 62.5, 62.5, 42.5, 42.5, 30, 150, 150, 42.5, 17.5,
17.5, 42.5, 8.75, 62.5, 42.5, 42.5, 22.5, 150, 62.5, 30, 250,
62.5, 87.5, 62.5, 8.75, 62.5, 30, 30, 8.75, 17.5, 17.5, 150,
22.5, 62.5, 62.5, 42.5)
faminc变量以1000s为单位
编辑 2:
好的,代码很好,但现在我尝试使用以下方法在直方图上拟合分布:
x <- rgamma(500,shape=q$par[1],scale=q$par[2])
hist(newdata$faminc, prob = TRUE)
curve(dgamma(x, shape=q$par[1], scale=q$par[2]), add=TRUE, col='blue')
它只会在 x 轴上产生一条蓝色的平线。
你有一些我没能解决的事情,但这里有一个估计的演示。
让我们从生成一些数据开始(这样我们就知道优化是否有效)。我只是在下面更改了您的优化功能,并使用了 Nelder-Mead 而不是拟牛顿。
set.seed(23)
a <- 2 # shape
b <- 3 # rate
require(data.table)
newdata <- data.table(faminc = rgamma(10000, a, b))
t1 <- sum(log(newdata$faminc))
t2 <- sum(newdata$faminc)
obs <- nrow(newdata)
llf <- function(x){
a <- x[1]
b <- x[2]
# log-likelihood function
return( - ((a - 1) * t1 - b * t2 - obs * a * log(1/b) - obs * log(gamma(a))))
}
# initial guess for a = mean^2(x)/var(x) and b = mean(x) / var(x)
a1 <- (mean(newdata$faminc))^2/var(newdata$faminc)
b1 <- mean(newdata$faminc)/var(newdata$faminc)
q <- optim(c(a1, b1), llf)
q$par
[1] 2.024353 3.019376
我会说我们非常接近。
你的数据:
(est <- q$par)
[1] 2.21333613 0.04243384
theoretical <- data.table(true = rgamma(10000, est[1], est[2]))
library(ggplot2)
ggplot(newdata, aes(x = faminc)) + geom_density() + geom_density(data = theoretical, aes(x = true, colour = "red")) + theme(legend.position = "none")
不是很好,但对于 500 obs 来说是合理的。
对 OP 编辑 2 的回应:
您应该更仔细地查看您正在使用的函数,curve
接受函数参数,而不是向量值:
gamma_density = function(x, a, b) ((b^a)/gamma(a)) * (x^(a - 1)) * exp(-b * x)
hist(newdata$faminc, prob = TRUE, ylim = c(0, 0.015))
curve(gamma_density(x, a = q$par[1], b = q$par[2]), add=TRUE, col='blue')