将 PyMC2 代码移植到 PyMC3 - 用于体育分析的层次模型
Porting PyMC2 code to PyMC3 - hierarchical model for sports analytics
我试过下面的代码,但运行时遇到了问题。
我认为 .values 是问题所在,但我如何将其编码为 Theano 对象?
以下是我的数据来源
home_team,away_team,home_score,away_score
Wales,Italy,23,15
France,England,26,24
Ireland,Scotland,28,6
Ireland,Wales,26,3
Scotland,England,0,20
France,Italy,30,10
Wales,France,27,6
Italy,Scotland,20,21
England,Ireland,13,10
Ireland,Italy,46,7
Scotland,France,17,19
England,Wales,29,18
Italy,England,11,52
Wales,Scotland,51,3
France,Ireland,20,22
这是有效的 PyMC2 代码:
# Working PyMC2 version of the model.
# Assumes `pd`, `np`, `pymc` and `DATA_DIR` are already defined.
data_file = DATA_DIR + 'results_2014.csv'
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.

# Give every team an integer index `i`, taken from the unique home teams.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index

# Attach the home/away team index to every match row.  `axis=1` is passed by
# keyword because pandas 2.0 removed the positional form `drop('team', 1)`.
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)

# Starting values for the team effects: log of the mean away score per away
# team (attack) and minus the log of the mean away score per home team
# (defence).
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

# hyperpriors
home = pymc.Normal('home', 0, .0001, value=0)
tau_att = pymc.Gamma('tau_att', .1, .1, value=10)
tau_def = pymc.Gamma('tau_def', .1, .1, value=10)
intercept = pymc.Normal('intercept', 0, .0001, value=0)

# team-specific parameters
atts_star = pymc.Normal("atts_star",
                        mu=0,
                        tau=tau_att,
                        size=num_teams,
                        value=att_starting_points.values)
defs_star = pymc.Normal("defs_star",
                        mu=0,
                        tau=tau_def,
                        size=num_teams,
                        value=def_starting_points.values)


# trick to code the sum to zero constraint
@pymc.deterministic
def atts(atts_star=atts_star):
    """Return the attack effects centred on their mean."""
    return atts_star - np.mean(atts_star)


@pymc.deterministic
def defs(defs_star=defs_star):
    """Return the defence effects centred on their mean."""
    return defs_star - np.mean(defs_star)


@pymc.deterministic
def home_theta(home_team=home_team,
               away_team=away_team,
               home=home,
               atts=atts,
               defs=defs,
               intercept=intercept):
    """Return the expected home score of every match."""
    return np.exp(intercept +
                  home +
                  atts[home_team] +
                  defs[away_team])


@pymc.deterministic
def away_theta(home_team=home_team,
               away_team=away_team,
               home=home,
               atts=atts,
               defs=defs,
               intercept=intercept):
    """Return the expected away score of every match.

    `home` is accepted as a parent but is not used in the expression.
    """
    return np.exp(intercept +
                  atts[away_team] +
                  defs[home_team])


# likelihood of the observed scores
home_points = pymc.Poisson('home_points',
                           mu=home_theta,
                           value=observed_home_goals,
                           observed=True)
away_points = pymc.Poisson('away_points',
                           mu=away_theta,
                           value=observed_away_goals,
                           observed=True)

mcmc = pymc.MCMC([home, intercept, tau_att, tau_def,
                  home_theta, away_theta,
                  atts_star, defs_star, atts, defs,
                  home_points, away_points])
# Fit the MAP estimate first, then sample.
map_ = pymc.MAP(mcmc)
map_.fit()
# presumably (iterations, burn-in, thinning) -- confirm against the PyMC2 docs
mcmc.sample(200000, 40000, 20)
我尝试移植到 PyMC3 :)
我附上了数据整理(wrangling)的代码。
我定义了自己的数据目录等
# Data wrangling for the first PyMC3 attempt.
# Assumes `pd`, `np` and `DATA_DIR` are already defined.
data_file = DATA_DIR + 'results_2014.csv'
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.

# Give every team an integer index `i`, taken from the unique home teams.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index

# Attach the home/away team index to every match row.  `axis=1` is passed by
# keyword because pandas 2.0 removed the positional form `drop('team', 1)`.
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)

# Starting values for the team effects, from the log of the mean away score
# grouped by away team (attack) and by home team (defence, negated).
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())
import theano.tensor as T
import pymc3 as pm3
# First PyMC3 attempt, kept exactly as posted because the traceback quoted
# after this snippet refers to it.  The indentation under `with` is not
# visible here, so the lines are left untouched.
#hyperpriors
x = att_starting_points.values
y = def_starting_points.values
# NOTE(review): `pm` is not imported in this snippet (only `pm3` is), and
# `model` is rebound by the `with` statement on the next line anyway.
model = pm.Model()
with pm3.Model() as model:
# NOTE(review): whether the second positional argument of pm3.Normal means
# tau or sd depends on the PyMC3 version -- confirm.
home3 = pm3.Normal('home', 0, .0001)
tau_att3 = pm3.Gamma('tau_att', .1, .1)
tau_def3 = pm3.Gamma('tau_def', .1, .1)
intercept3 = pm3.Normal('intercept', 0, .0001)
#team-specific parameters
# The starting points are passed as `observed=`; the traceback reports
# `atts_star3` as an `ObservedRV`, consistent with that.
atts_star3 = pm3.Normal("atts_star",
mu=0,
tau=tau_att3,
observed=x)
defs_star3 = pm3.Normal("defs_star",
mu=0,
tau=tau_def3,
observed=y)
#Seems to be the error here.
# `np.mean` is applied to a PyMC3 variable here; this is the statement the
# traceback points at.  `defs` is never defined in this snippet, and the name
# 'regression' is reused for every Deterministic.
atts = pm3.Deterministic('regression',
atts_star3 - np.mean(atts_star3))
home_theta3 = pm3.Deterministic('regression',
T.exp(intercept3 + atts[away_team] + defs[home_team]))
# The next two statements repeat the two above on single lines.
atts = pm3.Deterministic('regression', atts_star3 - np.mean(atts_star3))
home_theta3 = pm3.Deterministic('regression', T.exp(intercept3 + atts[away_team] + defs[home_team]))
# Unknown model parameters
# NOTE(review): both likelihoods use `home_theta3` as their mean; no separate
# away rate is defined in this snippet.
home_points3 = pm3.Poisson('home_points', mu=home_theta3, observed=observed_home_goals)
away_points3 = pm3.Poisson('away_points', mu=home_theta3, observed=observed_away_goals)
# Find a MAP starting point, then sample with NUTS and plot the trace.
start = pm3.find_MAP()
step = pm3.NUTS(state=start)
trace = pm3.sample(2000, step, start=start, progressbar=True)
pm3.traceplot(trace)
然后我得到一个错误,比如 values 不是 Theano 对象。
我认为这是上面的 .values 部分。但我对如何将其转换为 Theano 张量感到困惑。张量让我困惑 :)
为了清楚起见,错误是因为我误解了 PyMC3 语法中的某些内容。
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-71-ce51c1a64412> in <module>()
23
24 #Seems to be the error here.
---> 25 atts = pm3.Deterministic('regression', atts_star3 - np.mean(atts_star3))
26 home_theta3 = pm3.Deterministic('regression', T.exp(intercept3 + atts[away_team] + defs[home_team]))
27
/Users/peadarcoyle/anaconda/lib/python3.4/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
2733
2734 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2735 out=out, keepdims=keepdims)
2736
2737 def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
/Users/peadarcoyle/anaconda/lib/python3.4/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
71 ret = ret.dtype.type(ret / rcount)
72 else:
---> 73 ret = ret / rcount
74
75 return ret
TypeError: unsupported operand type(s) for /: 'ObservedRV' and 'int'
你的模型失败了,因为不能对 Theano 张量使用 NumPy 函数。因此 `np.mean(atts_star3)` 会报错。您可以删除 `atts_star3 = pm3.Normal("atts_star", ...)`,直接使用 NumPy 数组:`atts_star3 = x`。
我认为您也不需要显式建模 `tau_att3`、`tau_def3` 或 `defs_star`。
或者,如果您想保留这些变量,可以将 `np.mean` 替换为 `theano.tensor.mean`,这样应该就能正常工作。
所以我这样做了。它不是我之前版本的直接移植,但它给了我一个结果。有人有任何反馈吗?
import os
import math
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm3# I know folks are switching to "as pm" but I'm just not there yet
%matplotlib inline
import seaborn as sns
from IPython.core.pylabtools import figsize
import seaborn as sns
import theano.tensor as T
# Notebook setup and data wrangling for the second PyMC3 attempt.
figsize(12, 12)
DATA_DIR = os.path.join(os.getcwd(), 'data/')
data_file = DATA_DIR + 'results_2014.csv'
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.

# Give every team an integer index `i`, taken from the unique home teams.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index

# Attach the home/away team index to every match row.  `axis=1` is passed by
# keyword because pandas 2.0 removed the positional form `drop('team', 1)`.
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)

# Starting values for the team effects, from the log of the mean away score
# grouped by away team (attack) and by home team (defence, negated).
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())
import theano.tensor as T
import pymc3 as pm3
# Second PyMC3 attempt, kept exactly as posted (the asker notes it is not a
# direct port).  The indentation under `with` is not visible here, so the
# lines are left untouched.
#hyperpriors
# The string literal below holds disabled helper code; it is never executed.
'''
def atts3(atts_star3=atts_star3):
atts3 = atts_star.copy()
atts3 = atts3 - np.mean(atts_star)
return atts3
def defs3(defs_star3=defs_star3):
defs3 = defs_star3.copy()
defs3 = defs3 - np.mean(defs_star3)
return defs
'''
# `model` is rebound by the `with` statement on the next line.
model = pm3.Model()
with pm3.Model() as model:
# NOTE(review): whether the second positional argument of pm3.Normal means
# tau or sd depends on the PyMC3 version -- confirm.
home3 = pm3.Normal('home', 0, .0001)
tau_att3 = pm3.Gamma('tau_att', .1, .1)
tau_def3 = pm3.Gamma('tau_def', .1, .1)
intercept3 = pm3.Normal('intercept', 0, .0001)
#team-specific parameters
# The starting points are passed as `observed=` for both team-effect vectors.
atts_star3 = pm3.Normal("atts_star",
mu=0,
tau=tau_att3,
shape=num_teams,
observed=att_starting_points.values)
defs_star3 = pm3.Normal("defs_star",
mu=0,
tau=tau_def3,
shape=num_teams,
observed=def_starting_points.values)
#home_theta3 = atts3 + defs3
#away_theta3 = atts3 + defs3
# Unknown model parameters
# NOTE(review): both likelihoods use the constant mu=1, so they do not depend
# on any of the variables defined above.
home_points3 = pm3.Poisson('home_points', mu=1, observed=observed_home_goals)
away_points3 = pm3.Poisson('away_points', mu=1, observed=observed_away_goals)
# Find a MAP starting point, then sample with NUTS and plot the trace.
start = pm3.find_MAP()
step = pm3.NUTS(state=start)
trace = pm3.sample(2000, step, start=start, progressbar=True)
pm3.traceplot(trace)
这是我对你的 PyMC2 模型的翻译:
# PyMC3 translation of the PyMC2 model.
# Expects `num_teams`, `home_team`, `away_team`, `observed_home_goals` and
# `observed_away_goals` to have been prepared by the data-wrangling code.
import pymc3 as pm
import theano.tensor as tt

# The separate `model = pm.Model()` line was dropped: the `with` statement
# already creates the model and binds it to `model`.
with pm.Model() as model:
    # global model parameters
    # `tau` is passed by keyword so the value is read as a precision (as in
    # the PyMC2 model) whatever the positional order of the installed PyMC3.
    home = pm.Normal('home', mu=0, tau=.0001)
    tau_att = pm.Gamma('tau_att', alpha=.1, beta=.1)
    tau_def = pm.Gamma('tau_def', alpha=.1, beta=.1)
    intercept = pm.Normal('intercept', mu=0, tau=.0001)

    # team-specific model parameters
    atts_star = pm.Normal("atts_star",
                          mu=0,
                          tau=tau_att,
                          shape=num_teams)
    defs_star = pm.Normal("defs_star",
                          mu=0,
                          tau=tau_def,
                          shape=num_teams)

    # Sum-to-zero constraint: centre the effects with the Theano mean
    # (NumPy functions cannot be applied to PyMC3 variables).
    atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
    defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))

    # Expected scores; only the home side gets the `home` term.
    home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
    away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])

    # likelihood of observed data
    home_points = pm.Poisson('home_points', mu=home_theta,
                             observed=observed_home_goals)
    away_points = pm.Poisson('away_points', mu=away_theta,
                             observed=observed_away_goals)
在我看来,PyMC2 和 PyMC3 在模型构建上的最大区别在于:PyMC2 中与初始值有关的那一整套设置,在 PyMC3 中不再属于模型构建,而是被移到了代码的模型拟合部分。
这是一个笔记本,把这个模型、您的数据以及一些拟合代码放在了一起:http://nbviewer.ipython.org/gist/aflaxman/55e23195fe0a0b089103
我试过下面的代码,但运行时遇到了问题。我认为 .values 是问题所在,但我该如何将其编码为 Theano 对象?
以下是我的数据来源
home_team,away_team,home_score,away_score
Wales,Italy,23,15
France,England,26,24
Ireland,Scotland,28,6
Ireland,Wales,26,3
Scotland,England,0,20
France,Italy,30,10
Wales,France,27,6
Italy,Scotland,20,21
England,Ireland,13,10
Ireland,Italy,46,7
Scotland,France,17,19
England,Wales,29,18
Italy,England,11,52
Wales,Scotland,51,3
France,Ireland,20,22
这是有效的 PyMC2 代码:
data_file = DATA_DIR + 'results_2014.csv'
# Working PyMC2 version of the model (continues from `data_file` above).
# Assumes `pd`, `np` and `pymc` are already imported.
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.

# Give every team an integer index `i`, taken from the unique home teams.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index

# Attach the home/away team index to every match row.  `axis=1` is passed by
# keyword because pandas 2.0 removed the positional form `drop('team', 1)`.
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)

# Starting values for the team effects: log of the mean away score per away
# team (attack) and minus the log of the mean away score per home team
# (defence).
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

# hyperpriors
home = pymc.Normal('home', 0, .0001, value=0)
tau_att = pymc.Gamma('tau_att', .1, .1, value=10)
tau_def = pymc.Gamma('tau_def', .1, .1, value=10)
intercept = pymc.Normal('intercept', 0, .0001, value=0)

# team-specific parameters
atts_star = pymc.Normal("atts_star",
                        mu=0,
                        tau=tau_att,
                        size=num_teams,
                        value=att_starting_points.values)
defs_star = pymc.Normal("defs_star",
                        mu=0,
                        tau=tau_def,
                        size=num_teams,
                        value=def_starting_points.values)


# trick to code the sum to zero constraint
@pymc.deterministic
def atts(atts_star=atts_star):
    """Return the attack effects centred on their mean."""
    return atts_star - np.mean(atts_star)


@pymc.deterministic
def defs(defs_star=defs_star):
    """Return the defence effects centred on their mean."""
    return defs_star - np.mean(defs_star)


@pymc.deterministic
def home_theta(home_team=home_team,
               away_team=away_team,
               home=home,
               atts=atts,
               defs=defs,
               intercept=intercept):
    """Return the expected home score of every match."""
    return np.exp(intercept +
                  home +
                  atts[home_team] +
                  defs[away_team])


@pymc.deterministic
def away_theta(home_team=home_team,
               away_team=away_team,
               home=home,
               atts=atts,
               defs=defs,
               intercept=intercept):
    """Return the expected away score of every match.

    `home` is accepted as a parent but is not used in the expression.
    """
    return np.exp(intercept +
                  atts[away_team] +
                  defs[home_team])


# likelihood of the observed scores
home_points = pymc.Poisson('home_points',
                           mu=home_theta,
                           value=observed_home_goals,
                           observed=True)
away_points = pymc.Poisson('away_points',
                           mu=away_theta,
                           value=observed_away_goals,
                           observed=True)

mcmc = pymc.MCMC([home, intercept, tau_att, tau_def,
                  home_theta, away_theta,
                  atts_star, defs_star, atts, defs,
                  home_points, away_points])
# Fit the MAP estimate first, then sample.
map_ = pymc.MAP(mcmc)
map_.fit()
# presumably (iterations, burn-in, thinning) -- confirm against the PyMC2 docs
mcmc.sample(200000, 40000, 20)
我尝试移植到 PyMC3 :) 我附上了数据整理(wrangling)的代码。我定义了自己的数据目录等。
# Data wrangling for the first PyMC3 attempt.
# Assumes `pd`, `np` and `DATA_DIR` are already defined.
data_file = DATA_DIR + 'results_2014.csv'
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.

# Give every team an integer index `i`, taken from the unique home teams.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index

# Attach the home/away team index to every match row.  `axis=1` is passed by
# keyword because pandas 2.0 removed the positional form `drop('team', 1)`.
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)

# Starting values for the team effects, from the log of the mean away score
# grouped by away team (attack) and by home team (defence, negated).
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())
import theano.tensor as T
import pymc3 as pm3
# First PyMC3 attempt, kept exactly as posted because the traceback quoted
# after this snippet refers to it.  The indentation under `with` is not
# visible here, so the lines are left untouched.
#hyperpriors
x = att_starting_points.values
y = def_starting_points.values
# NOTE(review): `pm` is not imported in this snippet (only `pm3` is), and
# `model` is rebound by the `with` statement on the next line anyway.
model = pm.Model()
with pm3.Model() as model:
# NOTE(review): whether the second positional argument of pm3.Normal means
# tau or sd depends on the PyMC3 version -- confirm.
home3 = pm3.Normal('home', 0, .0001)
tau_att3 = pm3.Gamma('tau_att', .1, .1)
tau_def3 = pm3.Gamma('tau_def', .1, .1)
intercept3 = pm3.Normal('intercept', 0, .0001)
#team-specific parameters
# The starting points are passed as `observed=`; the traceback reports
# `atts_star3` as an `ObservedRV`, consistent with that.
atts_star3 = pm3.Normal("atts_star",
mu=0,
tau=tau_att3,
observed=x)
defs_star3 = pm3.Normal("defs_star",
mu=0,
tau=tau_def3,
observed=y)
#Seems to be the error here.
# `np.mean` is applied to a PyMC3 variable here; this is the statement the
# traceback points at.  `defs` is never defined in this snippet, and the name
# 'regression' is reused for every Deterministic.
atts = pm3.Deterministic('regression',
atts_star3 - np.mean(atts_star3))
home_theta3 = pm3.Deterministic('regression',
T.exp(intercept3 + atts[away_team] + defs[home_team]))
# The next two statements repeat the two above on single lines.
atts = pm3.Deterministic('regression', atts_star3 - np.mean(atts_star3))
home_theta3 = pm3.Deterministic('regression', T.exp(intercept3 + atts[away_team] + defs[home_team]))
# Unknown model parameters
# NOTE(review): both likelihoods use `home_theta3` as their mean; no separate
# away rate is defined in this snippet.
home_points3 = pm3.Poisson('home_points', mu=home_theta3, observed=observed_home_goals)
away_points3 = pm3.Poisson('away_points', mu=home_theta3, observed=observed_away_goals)
# Find a MAP starting point, then sample with NUTS and plot the trace.
start = pm3.find_MAP()
step = pm3.NUTS(state=start)
trace = pm3.sample(2000, step, start=start, progressbar=True)
pm3.traceplot(trace)
然后我得到一个错误,比如 values 不是 Theano 对象。 我认为这是上面的 .values 部分。但我对如何将其转换为 Theano 张量感到困惑。张量让我困惑 :)
为了清楚起见,错误是因为我误解了 PyMC3 语法中的某些内容。
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-71-ce51c1a64412> in <module>()
23
24 #Seems to be the error here.
---> 25 atts = pm3.Deterministic('regression', atts_star3 - np.mean(atts_star3))
26 home_theta3 = pm3.Deterministic('regression', T.exp(intercept3 + atts[away_team] + defs[home_team]))
27
/Users/peadarcoyle/anaconda/lib/python3.4/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
2733
2734 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2735 out=out, keepdims=keepdims)
2736
2737 def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
/Users/peadarcoyle/anaconda/lib/python3.4/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
71 ret = ret.dtype.type(ret / rcount)
72 else:
---> 73 ret = ret / rcount
74
75 return ret
TypeError: unsupported operand type(s) for /: 'ObservedRV' and 'int'
你的模型失败了,因为不能对 Theano 张量使用 NumPy 函数。因此 `np.mean(atts_star3)` 会报错。您可以删除 `atts_star3 = pm3.Normal("atts_star", ...)`,直接使用 NumPy 数组:`atts_star3 = x`。
我认为您也不需要显式建模 `tau_att3`、`tau_def3` 或 `defs_star`。
或者,如果您想保留这些变量,可以将 `np.mean` 替换为 `theano.tensor.mean`,这样应该就能正常工作。
所以我这样做了。它不是我之前版本的直接移植,但它给了我一个结果。有人有任何反馈吗?
import os
import math
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm3# I know folks are switching to "as pm" but I'm just not there yet
%matplotlib inline
import seaborn as sns
from IPython.core.pylabtools import figsize
import seaborn as sns
import theano.tensor as T
# Notebook setup and data wrangling for the second PyMC3 attempt.
figsize(12, 12)
DATA_DIR = os.path.join(os.getcwd(), 'data/')
data_file = DATA_DIR + 'results_2014.csv'
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.

# Give every team an integer index `i`, taken from the unique home teams.
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index

# Attach the home/away team index to every match row.  `axis=1` is passed by
# keyword because pandas 2.0 removed the positional form `drop('team', 1)`.
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)

# Starting values for the team effects, from the log of the mean away score
# grouped by away team (attack) and by home team (defence, negated).
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())
import theano.tensor as T
import pymc3 as pm3
# Second PyMC3 attempt, kept exactly as posted (the asker notes it is not a
# direct port).  The indentation under `with` is not visible here, so the
# lines are left untouched.
#hyperpriors
# The string literal below holds disabled helper code; it is never executed.
'''
def atts3(atts_star3=atts_star3):
atts3 = atts_star.copy()
atts3 = atts3 - np.mean(atts_star)
return atts3
def defs3(defs_star3=defs_star3):
defs3 = defs_star3.copy()
defs3 = defs3 - np.mean(defs_star3)
return defs
'''
# `model` is rebound by the `with` statement on the next line.
model = pm3.Model()
with pm3.Model() as model:
# NOTE(review): whether the second positional argument of pm3.Normal means
# tau or sd depends on the PyMC3 version -- confirm.
home3 = pm3.Normal('home', 0, .0001)
tau_att3 = pm3.Gamma('tau_att', .1, .1)
tau_def3 = pm3.Gamma('tau_def', .1, .1)
intercept3 = pm3.Normal('intercept', 0, .0001)
#team-specific parameters
# The starting points are passed as `observed=` for both team-effect vectors.
atts_star3 = pm3.Normal("atts_star",
mu=0,
tau=tau_att3,
shape=num_teams,
observed=att_starting_points.values)
defs_star3 = pm3.Normal("defs_star",
mu=0,
tau=tau_def3,
shape=num_teams,
observed=def_starting_points.values)
#home_theta3 = atts3 + defs3
#away_theta3 = atts3 + defs3
# Unknown model parameters
# NOTE(review): both likelihoods use the constant mu=1, so they do not depend
# on any of the variables defined above.
home_points3 = pm3.Poisson('home_points', mu=1, observed=observed_home_goals)
away_points3 = pm3.Poisson('away_points', mu=1, observed=observed_away_goals)
# Find a MAP starting point, then sample with NUTS and plot the trace.
start = pm3.find_MAP()
step = pm3.NUTS(state=start)
trace = pm3.sample(2000, step, start=start, progressbar=True)
pm3.traceplot(trace)
这是我对你的 PyMC2 模型的翻译:
# PyMC3 translation of the PyMC2 model.
# Expects `num_teams`, `home_team`, `away_team`, `observed_home_goals` and
# `observed_away_goals` to have been prepared by the data-wrangling code.
import pymc3 as pm
import theano.tensor as tt

# The separate `model = pm.Model()` line was dropped: the `with` statement
# already creates the model and binds it to `model`.
with pm.Model() as model:
    # global model parameters
    # `tau` is passed by keyword so the value is read as a precision (as in
    # the PyMC2 model) whatever the positional order of the installed PyMC3.
    home = pm.Normal('home', mu=0, tau=.0001)
    tau_att = pm.Gamma('tau_att', alpha=.1, beta=.1)
    tau_def = pm.Gamma('tau_def', alpha=.1, beta=.1)
    intercept = pm.Normal('intercept', mu=0, tau=.0001)

    # team-specific model parameters
    atts_star = pm.Normal("atts_star",
                          mu=0,
                          tau=tau_att,
                          shape=num_teams)
    defs_star = pm.Normal("defs_star",
                          mu=0,
                          tau=tau_def,
                          shape=num_teams)

    # Sum-to-zero constraint: centre the effects with the Theano mean
    # (NumPy functions cannot be applied to PyMC3 variables).
    atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
    defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))

    # Expected scores; only the home side gets the `home` term.
    home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
    away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])

    # likelihood of observed data
    home_points = pm.Poisson('home_points', mu=home_theta,
                             observed=observed_home_goals)
    away_points = pm.Poisson('away_points', mu=away_theta,
                             observed=observed_away_goals)
在我看来,PyMC2 和 PyMC3 在模型构建上的最大区别在于:PyMC2 中与初始值有关的那一整套设置,在 PyMC3 中不再属于模型构建,而是被移到了代码的模型拟合部分。
这是一个笔记本,把这个模型、您的数据以及一些拟合代码放在了一起:http://nbviewer.ipython.org/gist/aflaxman/55e23195fe0a0b089103