黑盒可能性示例

Question

我正在尝试了解如何在 pymc 中使用黑盒似然函数。基本上，这就是解释here。我已经尝试用一个非常简单的 Python 模型（双逻辑函数）自己实现这个，并且没有梯度。除了我提到的 link 中的代码，它围绕对数似然函数设置了 theano 包装器，我还有以下代码

# Define the model
def my_model(p, t, m_m, m_M):
    """An simple double logistic model"""
    return m_m + (m_M - m_m) * (
        1.0 / (1 + np.exp(-p[0] * (t - p[1] * 365)))
        + 1.0 / (1 + np.exp(p[2] * (t - p[3] * 365)))
        - 1
    )

# The loglikelihood function
def log_lklhood(theta, t, data, sigma):
    """Normal log-likelihoood"""
    y_pred = my_model(theta, t, data.max(), data.min())
    retval = -(0.5 / sigma ** 2) * np.sum((data - y_pred) ** 2)
    return retval

## Experimental data

data = np.array([0.104, 0.133, 0.131, 0.329, 0.669, 0.822, 0.847, 0.807, 0.638, 0.486, 0.162, 0.125])
t = np.array([1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336])
sigma = 0.02 # Uncertainty in the observations
# Straight outta internet...
logl = LogLike(log_lklhood, data, t, sigma)
ndraws = 500  # number of draws from the distribution
nburn = 100  # number of "burn-in points" (which we'll discard)

# use PyMC3 to sampler from log-likelihood
with pm.Model():
    theta1 = pm.Normal("theta_1", mu=0, sigma=1, testval=0.1)
    theta3 = pm.Normal("theta_3", mu=0, sigma=1, testval=0.1)
    theta2 = pm.Uniform("theta_2", lower=0.01, upper=0.5, testval=120.0 / 365)
    theta4 = pm.Uniform("theta_4", lower=0.51, upper=0.99, testval=250.0 / 365)

    # convert theta1...theta4 to a tensor vector
    theta = tt.as_tensor_variable([theta1, theta2, theta3, theta4])
    # use a DensityDist (use a lamdba function to "call" the Op)
    pm.DensityDist("likelihood", lambda v: logl(v), observed={"v": theta})
    # Use simple metropolis
    step = pm.Metropolis()
    trace = pm.sample(ndraws, tune=nburn, step=step, discard_tuned_samples=True)

这遍历了采样，但随后失败并出现了相当神秘的错误：

Multiprocess sampling (2 chains in 2 jobs)
CompoundStep
>Metropolis: [theta_4]
>Metropolis: [theta_2]
>Metropolis: [theta_3]
>Metropolis: [theta_1]

100.00% [1010/1010 00:01<00:00 Sampling 2 chains, 0 divergences]

Sampling 2 chains for 5 tune and 500 draw iterations (10 + 1_000 draws total) took 2 seconds.

---------------------------------------------------------------------------
MissingInputError                         Traceback (most recent call last)
<ipython-input-14-3e5ab1ff0971> in <module>
     17     pm.DensityDist('likelihood', lambda v: logl(v), observed={'v': theta})
     18     step = pm.Metropolis()
---> 19     trace = pm.sample(ndraws, tune=nburn, step=step, discard_tuned_samples=True)

[...]


MissingInputError: Input 0 of the graph (indices start from 0), used to compute sigmoid(theta_4_interval__), was not provided and not given a value. Use the Theano flag exception_verbosity='high', for more information on this error.

我是 pymc 的新手，所以真的不知道这个错误是什么意思，或者我做错了什么。有很多 SO 问题（0 这表明我正在调用一个 python 函数，使用 theano 数组作为问题的基础。但是，我看不出这是在哪里发生的.

Answer 1

所以事实证明黑盒可能性示例存在问题：不要使用 pm.DensityDist，而是 pm.Potential (see this arviz issue)。该示例现在可以正常工作，甚至使用 scipy.optimize.approx_fprime 来近似 log-likelihood:

的梯度

import numpy as np
import pymc3 as pm
import theano.tensor as tt
import arviz as az
import scipy.optimize # Can't be bothered calculating derivatives by hand

## Experimental data

data = np.array([0.104, 0.133, 0.131, 0.329, 0.669, 0.822, 0.847, 0.807, 0.638, 0.486, 0.162, 0.125])
t = np.array([1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336])

## LogLikelihood and gradient of the LogLikelihood functions
def log_likelihood(theta, data, t):
    if type(theta) == list:
        theta = theta[0]
    (
        theta1,
        theta2,
        theta3,
        theta4,
        sigma,
    ) = theta

    m_m = data.min()
    m_M = data.max()
    y_pred = m_m + (m_M - m_m) * (
        1.0 / (1 + np.exp(-theta1 * (t - theta2)))
        + 1.0 / (1 + np.exp(theta3 * (t - theta4)))
        - 1
    )

    logp = -len(data) * np.log(np.sqrt(2.0 * np.pi) * sigma)
    logp += -np.sum((data - y_pred) ** 2.0) / (2.0 * sigma ** 2.0)
    return logp


def der_log_likelihood(theta, data, t):
    def lnlike(values):
        return log_likelihood(values, data, t)

    eps = np.sqrt(np.finfo(float).eps)
    grads = scipy.optimize.approx_fprime(theta[0], lnlike, eps * np.ones(len(theta)))
    return grads

## Wrapper classes to theano-ize LogLklhood and gradient...
class Loglike(tt.Op):
    itypes = [tt.dvector]
    otypes = [tt.dscalar]

    def __init__(self, data, t):
        self.data = data
        self.t = t
        self.loglike_grad = LoglikeGrad(self.data, self.t)

    def perform(self, node, inputs, outputs):
        logp = log_likelihood(inputs, self.data, self.t)
        outputs[0][0] = np.array(logp)

    def grad(self, inputs, grad_outputs):
        (theta,) = inputs
        grads = self.loglike_grad(theta)
        return [grad_outputs[0] * grads]


class LoglikeGrad(tt.Op):
    itypes = [tt.dvector]
    otypes = [tt.dvector]

    def __init__(self, data, t):
        self.der_likelihood = der_log_likelihood
        self.data = data
        self.t = t

    def perform(self, node, inputs, outputs):
        (theta,) = inputs
        grads = self.der_likelihood(inputs, self.data, self.t)
        outputs[0][0] = grads

## Sample!
loglike = Loglike(data, t)
with pm.Model() as model:
    theta1 = pm.Normal("theta_1", mu=0, sigma=1, testval=0.1)
    theta3 = pm.Normal("theta_3", mu=0, sigma=1, testval=0.1)
    theta2 = pm.DiscreteUniform("theta_2", lower=1, upper=180, testval=120)
    theta4 = pm.DiscreteUniform("theta_4", lower=181, upper=365, testval=250)
    sigma = pm.HalfNormal("sigma", sigma=0.6, testval=0.01)
    # convert parameters to a tensor vector
    theta = tt.as_tensor_variable([theta1, theta2, theta3, theta4, sigma])
    # Do not use a DensityDist!
    pm.Potential("like", loglike(theta))

with model:
    trace = pm.sample(step=pm.NUTS())
    print(pm.summary(trace).to_string())

幸运的是，上面的代码导致了

Multiprocess sampling (2 chains in 2 jobs)
CompoundStep
>NUTS: [sigma, theta_3, theta_1]
>CompoundStep
>>Metropolis: [theta_4]
>>Metropolis: [theta_2]

100.00% [4000/4000 00:16<00:00 Sampling 2 chains, 0 divergences]

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 17 seconds.
The number of effective samples is smaller than 25% for some parameters.

            mean     sd   hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_mean  ess_sd  ess_bulk  ess_tail  r_hat
theta_1    0.073  0.019    0.049    0.095      0.001    0.001     258.0   230.0     641.0     305.0   1.01
theta_3    0.052  0.010    0.037    0.070      0.000    0.000     404.0   361.0     681.0     379.0   1.01
theta_2  104.616  3.004  100.000  110.000      0.178    0.126     285.0   283.0     307.0     312.0   1.01
theta_4  270.178  3.139  264.000  275.000      0.147    0.104     455.0   455.0     454.0     606.0   1.01
sigma      0.040  0.013    0.020    0.063      0.001    0.001     324.0   298.0     410.0     422.0   1.01

黑盒可能性示例

Blackbox likelihood example

python

bayesian

pymc