最大化包含 pbivnorm 的似然函数
Maximizing a likelihood function that contains pbivnorm
我有一个包含二元正态 CDF 的似然函数。我不断得到接近 1 的相关值,即使真实值为零。
R 包 sampleSelection 最大化包含二元正态 CDF 的似然函数(如 Van de Ven 和 Van Praag (1981))。我试着查看包的源代码,但找不到他们如何写可能性。作为参考,Van de Ven 和 Van Praag 的论文:
The Demand for Deductibles in Private Health Insurance: A Probit Model with Sample Selection.
似然函数为式(19),其中H表示标准正态CDF,H_2表示双变量正态CDF。
我的问题:
谁能告诉我sampleSelection包中似然函数是怎么写的?或
有人能告诉我为什么我在下面的代码中得到的相关性值接近 1 吗?
这是让我彻夜难眠的代码:
########################################################
#
# Trying to code Van de Ven and Van Praag (1981)
#
#
########################################################
library(MASS)
library(pbivnorm)
library(mnormt)
library(maxLik)
library(sampleSelection)
set.seed(1)
# Sample size
full_sample <- 1000
# Parameters
rho <- .1
beta0 <- 0
beta1 <- 1
gamma0 <- .2
gamma1 <- .5
gamma2 <- .5
varcovar <- matrix(c(1,rho,rho,1), nrow = 2, ncol = 2)
# Vectors for storing values
y <- rep(0,full_sample)
s <- rep(0,full_sample)
outcome <- rep(0,full_sample)
select <- rep(0,full_sample)
#######################
# Simulate data
#######################
x <- rnorm(full_sample)
z <- rnorm(full_sample)
errors <- mvrnorm(full_sample, rep(0,2), varcovar)
# note: 1st element for selection eq; 2nd outcome
s <- gamma0 + gamma1*x + gamma2*z + errors[,1]
y <- beta0 + beta1*x + errors[,2]
for(i in 1:full_sample){
if(s[i]>=0){
select[i] <- 1
if(y[i]>=0){
outcome[i] <- 1
}else{
outcome[i] <- 0
}
}else{
outcome[i] <- NA
select[i] <- 0
}
}
#######################################
# Writing the log likelihood
##
# Note: vega1= beta0,
# vega2= beta1,
# vega3= gamma0,
# vega4= gamma1,
# vega5= gamma2,
# vega6= rho
#######################################
first.lf <- function(vega) {
# Transforming this parameter becuase
# correlation is bounded between -1 aad 1
corr <- tanh(vega[6])
# Set up vectors for writing log likelihood
y0 <- 1-outcome
for(i in 1:full_sample) {
if(is.na(y0[i])){ y0[i]<- 0}
if(is.na(outcome[i])){ outcome[i]<- 0}
}
yt0 <- t(y0)
yt1 <- t(outcome)
missing <- 1 - select
ytmiss <- t(missing)
# Terms in the selection and outcome equations
A <- vega[3]+vega[4]*x+vega[5]*z
B <- vega[1]+vega[2]*x
term1 <- pbivnorm(A,B,corr)
term0 <- pbivnorm(A,-B,corr)
term_miss <- pnorm(-A)
log_term1 <- log(term1)
log_term0 <- log(term0)
log_term_miss <- log(term_miss)
# The log likelihood
logl <- sum( yt1%*%log_term1 + yt0%*%log_term0 + ytmiss%*%log_term_miss)
return(logl)
}
startv <- c(beta0,beta1,gamma0,gamma1,gamma2,rho)
# Maxmimizing my likelihood gives
maxLik(logLik = first.lf, start = startv, method="BFGS")
# tanh(7.28604) = 0.9999991, far from the true value of .1
# Using sampleSelection package for comparison
outcomeF<-factor(outcome)
selectEq <- select ~ x + z
outcomeEq <- outcomeF ~ x
selection( selectEq, outcomeEq)
# Notice the value of -0.2162 for rho compared to my 0.9999991
正好论文中方程(19)有错别字。从 i = N_1 + 1 到 N 的项应该有 -rho 而不是 rho。因此,使用
term0 <- pbivnorm(A,-B,-corr)
给予
maxLik(logLik = first.lf, start = startv, method="BFGS")
# Maximum Likelihood estimation
# BFGS maximization, 40 iterations
# Return code 0: successful convergence
# Log-Likelihood: -832.5119 (6 free parameter(s))
# Estimate(s): 0.3723783 0.9307454 0.1349979 0.4693686 0.4572421 -0.219618
根据需要。
我有一个包含二元正态 CDF 的似然函数。我不断得到接近 1 的相关值,即使真实值为零。
R 包 sampleSelection 最大化包含二元正态 CDF 的似然函数(如 Van de Ven 和 Van Praag (1981))。我试着查看包的源代码,但找不到他们如何写可能性。作为参考,Van de Ven 和 Van Praag 的论文:
The Demand for Deductibles in Private Health Insurance: A Probit Model with Sample Selection.
似然函数为式(19),其中H表示标准正态CDF,H_2表示双变量正态CDF。
我的问题:
谁能告诉我sampleSelection包中似然函数是怎么写的?或
有人能告诉我为什么我在下面的代码中得到的相关性值接近 1 吗?
这是让我彻夜难眠的代码:
########################################################
#
# Trying to code Van de Ven and Van Praag (1981)
#
#
########################################################
library(MASS)
library(pbivnorm)
library(mnormt)
library(maxLik)
library(sampleSelection)
set.seed(1)
# Sample size
full_sample <- 1000
# Parameters
rho <- .1
beta0 <- 0
beta1 <- 1
gamma0 <- .2
gamma1 <- .5
gamma2 <- .5
varcovar <- matrix(c(1,rho,rho,1), nrow = 2, ncol = 2)
# Vectors for storing values
y <- rep(0,full_sample)
s <- rep(0,full_sample)
outcome <- rep(0,full_sample)
select <- rep(0,full_sample)
#######################
# Simulate data
#######################
x <- rnorm(full_sample)
z <- rnorm(full_sample)
errors <- mvrnorm(full_sample, rep(0,2), varcovar)
# note: 1st element for selection eq; 2nd outcome
s <- gamma0 + gamma1*x + gamma2*z + errors[,1]
y <- beta0 + beta1*x + errors[,2]
for(i in 1:full_sample){
if(s[i]>=0){
select[i] <- 1
if(y[i]>=0){
outcome[i] <- 1
}else{
outcome[i] <- 0
}
}else{
outcome[i] <- NA
select[i] <- 0
}
}
#######################################
# Writing the log likelihood
##
# Note: vega1= beta0,
# vega2= beta1,
# vega3= gamma0,
# vega4= gamma1,
# vega5= gamma2,
# vega6= rho
#######################################
first.lf <- function(vega) {
# Transforming this parameter becuase
# correlation is bounded between -1 aad 1
corr <- tanh(vega[6])
# Set up vectors for writing log likelihood
y0 <- 1-outcome
for(i in 1:full_sample) {
if(is.na(y0[i])){ y0[i]<- 0}
if(is.na(outcome[i])){ outcome[i]<- 0}
}
yt0 <- t(y0)
yt1 <- t(outcome)
missing <- 1 - select
ytmiss <- t(missing)
# Terms in the selection and outcome equations
A <- vega[3]+vega[4]*x+vega[5]*z
B <- vega[1]+vega[2]*x
term1 <- pbivnorm(A,B,corr)
term0 <- pbivnorm(A,-B,corr)
term_miss <- pnorm(-A)
log_term1 <- log(term1)
log_term0 <- log(term0)
log_term_miss <- log(term_miss)
# The log likelihood
logl <- sum( yt1%*%log_term1 + yt0%*%log_term0 + ytmiss%*%log_term_miss)
return(logl)
}
startv <- c(beta0,beta1,gamma0,gamma1,gamma2,rho)
# Maxmimizing my likelihood gives
maxLik(logLik = first.lf, start = startv, method="BFGS")
# tanh(7.28604) = 0.9999991, far from the true value of .1
# Using sampleSelection package for comparison
outcomeF<-factor(outcome)
selectEq <- select ~ x + z
outcomeEq <- outcomeF ~ x
selection( selectEq, outcomeEq)
# Notice the value of -0.2162 for rho compared to my 0.9999991
正好论文中方程(19)有错别字。从 i = N_1 + 1 到 N 的项应该有 -rho 而不是 rho。因此,使用
term0 <- pbivnorm(A,-B,-corr)
给予
maxLik(logLik = first.lf, start = startv, method="BFGS")
# Maximum Likelihood estimation
# BFGS maximization, 40 iterations
# Return code 0: successful convergence
# Log-Likelihood: -832.5119 (6 free parameter(s))
# Estimate(s): 0.3723783 0.9307454 0.1349979 0.4693686 0.4572421 -0.219618
根据需要。