在 PyMC3 中使用 BetaBinomial
Using BetaBinomial in PyMC3
我有一个 table 的二元结果计数,我想拟合一个 beta 二项分布来估计 $\alpha$ 和 $\beta$ 参数,但是当我尝试时出现错误fit/sample 我在其他情况下的模型分布方式:
import pymc3 as pm
import pandas as pd
df = pd.read_csv('~/data.csv', low_memory=False)
df = df[df.Clicks >= 0]
C0=df.C.values
I0=df.N.values
N0 = C0 + I0
with pm.Model() as model:
C=pm.constant(C0)
I=pm.constant(I0)
C1=pm.constant(C0 + 1)
I1=pm.constant(I0 + 1)
N=pm.constant(N0)
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
obs = pm.BetaBinomial('obs', alpha, beta, N, observed=C0)
with model:
advi_fit = pm.variational.advi(n=int(1e4))
trace1 = pm.variational.sample_vp(advi_fit, draws=int(1e4))
pm.traceplot(trace1[::10])
with model:
step = pm.NUTS()
#step = pm.Metropolis() # <== same problem
trace2 = pm.sample(int(1e3), step)
pm.traceplot(trace2[::10])
在这两种情况下,采样都失败了:
MissingInputError: ("An input of the graph, used to compute Elemwise{neg,no_inplace}(P_logodds_), was not provided and not given a value.Use the Theano flag exception_verbosity='high',for more information on this error.", P_logodds
在 advi
的情况下,完整的堆栈跟踪是:
MissingInputError Traceback (most recent call last)
<ipython-input-46-8947c7c798e5> in <module>()
----> 1 import codecs, os;__pyfile = codecs.open('''/tmp/py7996Jip''', encoding='''utf-8''');__code = __pyfile.read().encode('''utf-8''');__pyfile.close();os.remove('''/tmp/py7996Jip''');exec(compile(__code, '''/home/dmahler/Scripts/adops-bayes2.py''', 'exec'));
/home/dmahler/Scripts/adops-bayes2.py in <module>()
59 advi_fit = pm.variational.advi(n=int(J*6.4e4), learning_rate=1e-3/J, epsilon=1e-8, accurate_elbo=False)
60 #advi_fit = pm.variational.advi_minibatch(minibatch_RVs=[alpha, beta, p], minibatch_tensors=[C,I,N])
---> 61 trace = pm.variational.sample_vp(advi_fit, draws=int(2e4))
62
63 pm.traceplot(trace[::10])
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/variational/advi.pyc in sample_vp(vparams, draws, model, local_RVs, random_seed, hide_transformed)
317
318 varnames = [str(var) for var in model.unobserved_RVs]
--> 319 trace = NDArray(model=model, vars=vars_sampled)
320 trace.setup(draws=draws, chain=0)
321
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/backends/ndarray.pyc in __init__(self, name, model, vars)
21 """
22 def __init__(self, name=None, model=None, vars=None):
---> 23 super(NDArray, self).__init__(name, model, vars)
24 self.draw_idx = 0
25 self.draws = None
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/backends/base.pyc in __init__(self, name, model, vars)
34 self.vars = vars
35 self.varnames = [var.name for var in vars]
---> 36 self.fn = model.fastfn(vars)
37
38
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/model.pyc in fastfn(self, outs, mode, *args, **kwargs)
374 Compiled Theano function as point function.
375 """
--> 376 f = self.makefn(outs, mode, *args, **kwargs)
377 return FastPointFunc(f)
378
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/memoize.pyc in memoizer(*args, **kwargs)
12
13 if key not in cache:
---> 14 cache[key] = obj(*args, **kwargs)
15
16 return cache[key]
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/model.pyc in makefn(self, outs, mode, *args, **kwargs)
344 on_unused_input='ignore',
345 accept_inplace=True,
--> 346 mode=mode, *args, **kwargs)
347
348 def fn(self, outs, mode=None, *args, **kwargs):
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function.pyc in function(inputs, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input)
318 on_unused_input=on_unused_input,
319 profile=profile,
--> 320 output_keys=output_keys)
321 # We need to add the flag check_aliased inputs if we have any mutable or
322 # borrowed used defined inputs
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/pfunc.pyc in pfunc(params, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input, output_keys)
477 accept_inplace=accept_inplace, name=name,
478 profile=profile, on_unused_input=on_unused_input,
--> 479 output_keys=output_keys)
480
481
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in orig_function(inputs, outputs, mode, accept_inplace, name, profile, on_unused_input, output_keys)
1774 profile=profile,
1775 on_unused_input=on_unused_input,
-> 1776 output_keys=output_keys).create(
1777 defaults)
1778
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in __init__(self, inputs, outputs, mode, accept_inplace, function_builder, profile, on_unused_input, fgraph, output_keys)
1426 # OUTPUT VARIABLES)
1427 fgraph, additional_outputs = std_fgraph(inputs, outputs,
-> 1428 accept_inplace)
1429 fgraph.profile = profile
1430 else:
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in std_fgraph(input_specs, output_specs, accept_inplace)
175
176 fgraph = gof.fg.FunctionGraph(orig_inputs, orig_outputs,
--> 177 update_mapping=update_mapping)
178
179 for node in fgraph.apply_nodes:
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __init__(self, inputs, outputs, features, clone, update_mapping)
169
170 for output in outputs:
--> 171 self.__import_r__(output, reason="init")
172 for i, output in enumerate(outputs):
173 output.clients.append(('output', i))
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __import_r__(self, variable, reason)
358 # Imports the owners of the variables
359 if variable.owner and variable.owner not in self.apply_nodes:
--> 360 self.__import__(variable.owner, reason=reason)
361 if (variable.owner is None and
362 not isinstance(variable, graph.Constant) and
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __import__(self, apply_node, check, reason)
472 "for more information on this error."
473 % str(node)),
--> 474 r)
475
476 for node in new_nodes:
MissingInputError: ("An input of the graph, used to compute Elemwise{neg,no_inplace}(P_logodds_), was not provided and not given a value.Use the Theano flag exception_verbosity='high',for more information on this error.", P_logodds_)
> /home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.py(474)__import__()
472 "for more information on this error."
473 % str(node)),
--> 474 r)
475
476 for node in new_nodes:
在我意识到 pymc3.BetaBinomial
之前,我正在尝试使用单独的 Beta
和 Binomial
实例来获得相同的结果:
with pm.Model() as model:
C=pm.constant(C0)
I=pm.constant(I0)
C1=pm.constant(C0 + 1)
I1=pm.constant(I0 + 1)
N=pm.constant(N0)
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
p = pm.Beta('P', alpha, beta, shape=K)
b = pm.Binomial('B', N, p, observed=C0)
这已成功完成,但不同的方法会产生截然不同的结果。我认为这可能部分是由于先验和观察之间的额外间接级别使搜索 space 更大。当我遇到 BetaBinomial
时,我认为它会使搜索更容易,同时也是 正确的事情 ^TM。否则我相信 to 模型在逻辑上应该是等价的。不幸的是,我无法弄清楚如何使 batebinomial
工作,并且我无法在互联网上找到任何使用 BetaBinomial
的示例。
- 如何使
BetaBinomial
modwel 工作?
- 这些模型在逻辑上真的是等价的吗?
- 有没有人能更好地猜测初始分层版本的数值问题的原因?
- 我该如何修复它们?
你的模型应该运行,你可以这样写
with pm.Model() as model:
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
obs = pm.BetaBinomial('obs', alpha, beta, N0, observed=C0)
也就是说,(C, I C1, I1) 已在您的模型中定义但未使用。无论如何,这不是问题。您得到的错误是因为 PyMC3 需要一个变量 P
(就像在您拥有的第二个模型中一样)但未定义该变量。可能你正在使用 Jupyter 笔记本并且你 delete/comment 一个 theano 变量。再次尝试 运行ning Notebook。
理论上使用 Beta 和 Binomial 或 BetaBinomial 应该得到相同的结果。从实用的角度来看。从 BetaBinomial 中抽样应该比从 Beta 和二项式中抽样更快,因为部分工作已经通过分析完成!
假设正确采样,两个模型应该提供相同的结果。要检查两个结果是否相同,请尝试增加样本数量(并避免 thinning)。还要比较模型之间和模型内部的结果(差异应该大致相同)。如果您不需要估计 P
变量(beta 分布),则使用 BetaBinomial。
我有一个 table 的二元结果计数,我想拟合一个 beta 二项分布来估计 $\alpha$ 和 $\beta$ 参数,但是当我尝试时出现错误fit/sample 我在其他情况下的模型分布方式:
import pymc3 as pm
import pandas as pd
df = pd.read_csv('~/data.csv', low_memory=False)
df = df[df.Clicks >= 0]
C0=df.C.values
I0=df.N.values
N0 = C0 + I0
with pm.Model() as model:
C=pm.constant(C0)
I=pm.constant(I0)
C1=pm.constant(C0 + 1)
I1=pm.constant(I0 + 1)
N=pm.constant(N0)
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
obs = pm.BetaBinomial('obs', alpha, beta, N, observed=C0)
with model:
advi_fit = pm.variational.advi(n=int(1e4))
trace1 = pm.variational.sample_vp(advi_fit, draws=int(1e4))
pm.traceplot(trace1[::10])
with model:
step = pm.NUTS()
#step = pm.Metropolis() # <== same problem
trace2 = pm.sample(int(1e3), step)
pm.traceplot(trace2[::10])
在这两种情况下,采样都失败了:
MissingInputError: ("An input of the graph, used to compute Elemwise{neg,no_inplace}(P_logodds_), was not provided and not given a value.Use the Theano flag exception_verbosity='high',for more information on this error.", P_logodds
在 advi
的情况下,完整的堆栈跟踪是:
MissingInputError Traceback (most recent call last)
<ipython-input-46-8947c7c798e5> in <module>()
----> 1 import codecs, os;__pyfile = codecs.open('''/tmp/py7996Jip''', encoding='''utf-8''');__code = __pyfile.read().encode('''utf-8''');__pyfile.close();os.remove('''/tmp/py7996Jip''');exec(compile(__code, '''/home/dmahler/Scripts/adops-bayes2.py''', 'exec'));
/home/dmahler/Scripts/adops-bayes2.py in <module>()
59 advi_fit = pm.variational.advi(n=int(J*6.4e4), learning_rate=1e-3/J, epsilon=1e-8, accurate_elbo=False)
60 #advi_fit = pm.variational.advi_minibatch(minibatch_RVs=[alpha, beta, p], minibatch_tensors=[C,I,N])
---> 61 trace = pm.variational.sample_vp(advi_fit, draws=int(2e4))
62
63 pm.traceplot(trace[::10])
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/variational/advi.pyc in sample_vp(vparams, draws, model, local_RVs, random_seed, hide_transformed)
317
318 varnames = [str(var) for var in model.unobserved_RVs]
--> 319 trace = NDArray(model=model, vars=vars_sampled)
320 trace.setup(draws=draws, chain=0)
321
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/backends/ndarray.pyc in __init__(self, name, model, vars)
21 """
22 def __init__(self, name=None, model=None, vars=None):
---> 23 super(NDArray, self).__init__(name, model, vars)
24 self.draw_idx = 0
25 self.draws = None
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/backends/base.pyc in __init__(self, name, model, vars)
34 self.vars = vars
35 self.varnames = [var.name for var in vars]
---> 36 self.fn = model.fastfn(vars)
37
38
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/model.pyc in fastfn(self, outs, mode, *args, **kwargs)
374 Compiled Theano function as point function.
375 """
--> 376 f = self.makefn(outs, mode, *args, **kwargs)
377 return FastPointFunc(f)
378
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/memoize.pyc in memoizer(*args, **kwargs)
12
13 if key not in cache:
---> 14 cache[key] = obj(*args, **kwargs)
15
16 return cache[key]
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/model.pyc in makefn(self, outs, mode, *args, **kwargs)
344 on_unused_input='ignore',
345 accept_inplace=True,
--> 346 mode=mode, *args, **kwargs)
347
348 def fn(self, outs, mode=None, *args, **kwargs):
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function.pyc in function(inputs, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input)
318 on_unused_input=on_unused_input,
319 profile=profile,
--> 320 output_keys=output_keys)
321 # We need to add the flag check_aliased inputs if we have any mutable or
322 # borrowed used defined inputs
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/pfunc.pyc in pfunc(params, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input, output_keys)
477 accept_inplace=accept_inplace, name=name,
478 profile=profile, on_unused_input=on_unused_input,
--> 479 output_keys=output_keys)
480
481
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in orig_function(inputs, outputs, mode, accept_inplace, name, profile, on_unused_input, output_keys)
1774 profile=profile,
1775 on_unused_input=on_unused_input,
-> 1776 output_keys=output_keys).create(
1777 defaults)
1778
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in __init__(self, inputs, outputs, mode, accept_inplace, function_builder, profile, on_unused_input, fgraph, output_keys)
1426 # OUTPUT VARIABLES)
1427 fgraph, additional_outputs = std_fgraph(inputs, outputs,
-> 1428 accept_inplace)
1429 fgraph.profile = profile
1430 else:
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in std_fgraph(input_specs, output_specs, accept_inplace)
175
176 fgraph = gof.fg.FunctionGraph(orig_inputs, orig_outputs,
--> 177 update_mapping=update_mapping)
178
179 for node in fgraph.apply_nodes:
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __init__(self, inputs, outputs, features, clone, update_mapping)
169
170 for output in outputs:
--> 171 self.__import_r__(output, reason="init")
172 for i, output in enumerate(outputs):
173 output.clients.append(('output', i))
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __import_r__(self, variable, reason)
358 # Imports the owners of the variables
359 if variable.owner and variable.owner not in self.apply_nodes:
--> 360 self.__import__(variable.owner, reason=reason)
361 if (variable.owner is None and
362 not isinstance(variable, graph.Constant) and
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __import__(self, apply_node, check, reason)
472 "for more information on this error."
473 % str(node)),
--> 474 r)
475
476 for node in new_nodes:
MissingInputError: ("An input of the graph, used to compute Elemwise{neg,no_inplace}(P_logodds_), was not provided and not given a value.Use the Theano flag exception_verbosity='high',for more information on this error.", P_logodds_)
> /home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.py(474)__import__()
472 "for more information on this error."
473 % str(node)),
--> 474 r)
475
476 for node in new_nodes:
在我意识到 pymc3.BetaBinomial
之前,我正在尝试使用单独的 Beta
和 Binomial
实例来获得相同的结果:
with pm.Model() as model:
C=pm.constant(C0)
I=pm.constant(I0)
C1=pm.constant(C0 + 1)
I1=pm.constant(I0 + 1)
N=pm.constant(N0)
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
p = pm.Beta('P', alpha, beta, shape=K)
b = pm.Binomial('B', N, p, observed=C0)
这已成功完成,但不同的方法会产生截然不同的结果。我认为这可能部分是由于先验和观察之间的额外间接级别使搜索 space 更大。当我遇到 BetaBinomial
时,我认为它会使搜索更容易,同时也是 正确的事情 ^TM。否则我相信 to 模型在逻辑上应该是等价的。不幸的是,我无法弄清楚如何使 batebinomial
工作,并且我无法在互联网上找到任何使用 BetaBinomial
的示例。
- 如何使
BetaBinomial
modwel 工作? - 这些模型在逻辑上真的是等价的吗?
- 有没有人能更好地猜测初始分层版本的数值问题的原因?
- 我该如何修复它们?
你的模型应该运行,你可以这样写
with pm.Model() as model:
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
obs = pm.BetaBinomial('obs', alpha, beta, N0, observed=C0)
也就是说,(C, I C1, I1) 已在您的模型中定义但未使用。无论如何,这不是问题。您得到的错误是因为 PyMC3 需要一个变量 P
(就像在您拥有的第二个模型中一样)但未定义该变量。可能你正在使用 Jupyter 笔记本并且你 delete/comment 一个 theano 变量。再次尝试 运行ning Notebook。
理论上使用 Beta 和 Binomial 或 BetaBinomial 应该得到相同的结果。从实用的角度来看。从 BetaBinomial 中抽样应该比从 Beta 和二项式中抽样更快,因为部分工作已经通过分析完成!
假设正确采样,两个模型应该提供相同的结果。要检查两个结果是否相同,请尝试增加样本数量(并避免 thinning)。还要比较模型之间和模型内部的结果(差异应该大致相同)。如果您不需要估计 P
变量(beta 分布),则使用 BetaBinomial。