Blackbox likelihood example
I am trying to understand how to use a black-box likelihood function in pymc. Basically, this is what is explained here. I have tried to implement this myself with a very simple Python model (a double logistic function), and without gradients. In addition to the code from the link I mentioned, which sets up the theano wrapper around the log-likelihood function, I have the following code:
import numpy as np
import pymc3 as pm
import theano.tensor as tt

# Define the model
def my_model(p, t, m_m, m_M):
    """A simple double logistic model."""
    return m_m + (m_M - m_m) * (
        1.0 / (1 + np.exp(-p[0] * (t - p[1] * 365)))
        + 1.0 / (1 + np.exp(p[2] * (t - p[3] * 365)))
        - 1
    )

# The log-likelihood function
def log_lklhood(theta, t, data, sigma):
    """Normal log-likelihood."""
    y_pred = my_model(theta, t, data.max(), data.min())
    retval = -(0.5 / sigma ** 2) * np.sum((data - y_pred) ** 2)
    return retval

## Experimental data
data = np.array([0.104, 0.133, 0.131, 0.329, 0.669, 0.822, 0.847, 0.807, 0.638, 0.486, 0.162, 0.125])
t = np.array([1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336])
sigma = 0.02  # Uncertainty in the observations

# Straight outta internet... (LogLike is the theano Op wrapper from the linked example)
logl = LogLike(log_lklhood, data, t, sigma)

ndraws = 500  # number of draws from the distribution
nburn = 100   # number of "burn-in points" (which we'll discard)

# use PyMC3 to sample from the log-likelihood
with pm.Model():
    theta1 = pm.Normal("theta_1", mu=0, sigma=1, testval=0.1)
    theta3 = pm.Normal("theta_3", mu=0, sigma=1, testval=0.1)
    theta2 = pm.Uniform("theta_2", lower=0.01, upper=0.5, testval=120.0 / 365)
    theta4 = pm.Uniform("theta_4", lower=0.51, upper=0.99, testval=250.0 / 365)

    # convert theta1...theta4 to a tensor vector
    theta = tt.as_tensor_variable([theta1, theta2, theta3, theta4])

    # use a DensityDist (use a lambda function to "call" the Op)
    pm.DensityDist("likelihood", lambda v: logl(v), observed={"v": theta})

    # Use simple Metropolis
    step = pm.Metropolis()
    trace = pm.sample(ndraws, tune=nburn, step=step, discard_tuned_samples=True)
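For completeness, LogLike above is the theano Op that the linked example wraps around the plain-Python log-likelihood; it looks roughly like this (a paraphrase from memory of that tutorial, not the exact code):

class LogLike(tt.Op):
    itypes = [tt.dvector]  # expects a vector of parameter values
    otypes = [tt.dscalar]  # returns a single scalar (the log-likelihood)

    def __init__(self, loglike, data, x, sigma):
        self.likelihood = loglike
        self.data = data
        self.x = x
        self.sigma = sigma

    def perform(self, node, inputs, outputs):
        (theta,) = inputs  # the parameter vector
        logl = self.likelihood(theta, self.x, self.data, self.sigma)
        outputs[0][0] = np.array(logl)  # output the log-likelihood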
This runs through the sampling, but then fails with a rather cryptic error:
Multiprocess sampling (2 chains in 2 jobs)
CompoundStep
>Metropolis: [theta_4]
>Metropolis: [theta_2]
>Metropolis: [theta_3]
>Metropolis: [theta_1]
100.00% [1010/1010 00:01<00:00 Sampling 2 chains, 0 divergences]
Sampling 2 chains for 5 tune and 500 draw iterations (10 + 1_000 draws total) took 2 seconds.
---------------------------------------------------------------------------
MissingInputError Traceback (most recent call last)
<ipython-input-14-3e5ab1ff0971> in <module>
17 pm.DensityDist('likelihood', lambda v: logl(v), observed={'v': theta})
18 step = pm.Metropolis()
---> 19 trace = pm.sample(ndraws, tune=nburn, step=step, discard_tuned_samples=True)
[...]
MissingInputError: Input 0 of the graph (indices start from 0), used to compute sigmoid(theta_4_interval__), was not provided and not given a value. Use the Theano flag exception_verbosity='high', for more information on this error.
I am new to pymc, so I really don't know what this error means or what I'm doing wrong. There are a number of SO questions suggesting that the root of the problem is calling a Python function on theano arrays, but I can't see where that is happening.
So it turns out there is a problem with the black-box likelihood example: do not use pm.DensityDist, use pm.Potential instead (see this arviz issue).
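The key change in the model block is this swap (a minimal sketch; loglike stands for the theano Op wrapping the black-box log-likelihood):

    # Fails with MissingInputError: the parameter tensor is passed as "observed"
    # pm.DensityDist("likelihood", lambda v: loglike(v), observed={"v": theta})

    # Works: Potential simply adds the black-box log-likelihood term to the model logp
    pm.Potential("like", loglike(theta))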
With that change the example now works, and even uses scipy.optimize.approx_fprime to approximate the gradient of the log-likelihood:
import numpy as np
import pymc3 as pm
import theano.tensor as tt
import arviz as az
import scipy.optimize  # Can't be bothered calculating derivatives by hand

## Experimental data
data = np.array([0.104, 0.133, 0.131, 0.329, 0.669, 0.822, 0.847, 0.807, 0.638, 0.486, 0.162, 0.125])
t = np.array([1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336])

## Log-likelihood and gradient of the log-likelihood
def log_likelihood(theta, data, t):
    # theta may arrive wrapped in a list (theano passes `inputs` as a list)
    if type(theta) == list:
        theta = theta[0]
    (theta1, theta2, theta3, theta4, sigma) = theta
    m_m = data.min()
    m_M = data.max()
    y_pred = m_m + (m_M - m_m) * (
        1.0 / (1 + np.exp(-theta1 * (t - theta2)))
        + 1.0 / (1 + np.exp(theta3 * (t - theta4)))
        - 1
    )
    logp = -len(data) * np.log(np.sqrt(2.0 * np.pi) * sigma)
    logp += -np.sum((data - y_pred) ** 2.0) / (2.0 * sigma ** 2.0)
    return logp

def der_log_likelihood(theta, data, t):
    def lnlike(values):
        return log_likelihood(values, data, t)
    # Finite-difference approximation of the gradient
    eps = np.sqrt(np.finfo(float).eps)
    grads = scipy.optimize.approx_fprime(theta[0], lnlike, eps * np.ones(len(theta)))
    return grads

## Wrapper classes to theano-ize the log-likelihood and its gradient
class Loglike(tt.Op):
    itypes = [tt.dvector]
    otypes = [tt.dscalar]

    def __init__(self, data, t):
        self.data = data
        self.t = t
        self.loglike_grad = LoglikeGrad(self.data, self.t)

    def perform(self, node, inputs, outputs):
        logp = log_likelihood(inputs, self.data, self.t)
        outputs[0][0] = np.array(logp)

    def grad(self, inputs, grad_outputs):
        (theta,) = inputs
        grads = self.loglike_grad(theta)
        return [grad_outputs[0] * grads]

class LoglikeGrad(tt.Op):
    itypes = [tt.dvector]
    otypes = [tt.dvector]

    def __init__(self, data, t):
        self.der_likelihood = der_log_likelihood
        self.data = data
        self.t = t

    def perform(self, node, inputs, outputs):
        (theta,) = inputs
        grads = self.der_likelihood(inputs, self.data, self.t)
        outputs[0][0] = grads

## Sample!
loglike = Loglike(data, t)

with pm.Model() as model:
    theta1 = pm.Normal("theta_1", mu=0, sigma=1, testval=0.1)
    theta3 = pm.Normal("theta_3", mu=0, sigma=1, testval=0.1)
    theta2 = pm.DiscreteUniform("theta_2", lower=1, upper=180, testval=120)
    theta4 = pm.DiscreteUniform("theta_4", lower=181, upper=365, testval=250)
    sigma = pm.HalfNormal("sigma", sigma=0.6, testval=0.01)

    # convert parameters to a tensor vector
    theta = tt.as_tensor_variable([theta1, theta2, theta3, theta4, sigma])

    # Do not use a DensityDist!
    pm.Potential("like", loglike(theta))

with model:
    trace = pm.sample(step=pm.NUTS())
    print(pm.summary(trace).to_string())
Fortunately, running the above produces:
Multiprocess sampling (2 chains in 2 jobs)
CompoundStep
>NUTS: [sigma, theta_3, theta_1]
>CompoundStep
>>Metropolis: [theta_4]
>>Metropolis: [theta_2]
100.00% [4000/4000 00:16<00:00 Sampling 2 chains, 0 divergences]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 17 seconds.
The number of effective samples is smaller than 25% for some parameters.
             mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_mean  ess_sd  ess_bulk  ess_tail  r_hat
theta_1     0.073  0.019   0.049    0.095      0.001    0.001     258.0   230.0     641.0     305.0   1.01
theta_3     0.052  0.010   0.037    0.070      0.000    0.000     404.0   361.0     681.0     379.0   1.01
theta_2   104.616  3.004 100.000  110.000      0.178    0.126     285.0   283.0     307.0     312.0   1.01
theta_4   270.178  3.139 264.000  275.000      0.147    0.104     455.0   455.0     454.0     606.0   1.01
sigma       0.040  0.013   0.020    0.063      0.001    0.001     324.0   298.0     410.0     422.0   1.01
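As a quick sanity check (not part of the original answer), the posterior means can be pushed back through the double logistic model and compared with the data; this sketch assumes the trace and variable names from the listing above:

import matplotlib.pyplot as plt

# Posterior means of the parameters (a PyMC3 MultiTrace supports indexing by name)
theta1_m, theta2_m, theta3_m, theta4_m = [
    trace[name].mean() for name in ["theta_1", "theta_2", "theta_3", "theta_4"]
]

m_m, m_M = data.min(), data.max()
t_fine = np.linspace(1, 365, 365)
y_fit = m_m + (m_M - m_m) * (
    1.0 / (1 + np.exp(-theta1_m * (t_fine - theta2_m)))
    + 1.0 / (1 + np.exp(theta3_m * (t_fine - theta4_m)))
    - 1
)

plt.plot(t, data, "o", label="observations")
plt.plot(t_fine, y_fit, "-", label="posterior-mean fit")
plt.legend()
plt.show()

# Trace diagnostics via arviz
az.plot_trace(trace)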