PyMC3:分层橄榄球模型后验?
PyMC3: Hierarchical rugby model posterior?
我刚刚开始阅读 PyMC3
documentation (I'm much more comfortable with sklearn
) and came across the Rugby hierarchical model example:
# Imports and Rugby data setup -- model in next section
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
import matplotlib.pyplot as plt
import seaborn as sns
games = [
['Wales', 'Italy', 23, 15],
['France', 'England', 26, 24],
['Ireland', 'Scotland', 28, 6],
['Ireland', 'Wales', 26, 3],
['Scotland', 'England', 0, 20],
['France', 'Italy', 30, 10],
['Wales', 'France', 27, 6],
['Italy', 'Scotland', 20, 21],
['England', 'Ireland', 13, 10],
['Ireland', 'Italy', 46, 7],
['Scotland', 'France', 17, 19],
['England', 'Wales', 29, 18],
['Italy', 'England', 11, 52],
['Wales', 'Scotland', 51, 3],
['France', 'Ireland', 20, 22],
]
columns = ['home_team', 'away_team', 'home_score', 'away_score']
df = pd.DataFrame(games, columns=columns)
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_home'}).drop('team', 1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_away'}).drop('team', 1)
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())
这是主要的 PyMC3
模型设置:
with pm.Model() as model:
# Global model parameters
home = pm.Normal('home', 0, tau=.0001)
tau_att = pm.Gamma('tau_att', .1, .1)
tau_def = pm.Gamma('tau_def', .1, .1)
intercept = pm.Normal('intercept', 0, tau=.0001)
# Team-specific model parameters
atts_star = pm.Normal('atts_star', mu=0, tau=tau_att, shape=num_teams)
defs_star = pm.Normal('defs_star', mu=0, tau=tau_def, shape=num_teams)
atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])
# Likelihood of observed data
home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
start = pm.find_MAP()
step = pm.NUTS(state=start)
trace = pm.sample(20000, step, init=start)
我知道如何绘制 trace
:
pm.traceplot(trace[5000:])
并生成posterior predictive samples:
ppc = pm.sample_ppc(trace[5000:], samples=500, model=model)
我不确定的地方:我该如何提问model/posterior?
例如,我假设 Wales vs Italy
场比赛的得分分布为:
# Wales vs Italy is the first matchup in our dataset
home_wales = ppc['home_points'][:, 0]
away_italy = ppc['away_points'][:, 0]
但是原始数据中没有记录的比赛呢?
- 如果意大利在主场迎战法国,他们的比分分布是怎样的?
- 如果意大利在主场迎战法国,两队得分都低于 15 分的频率是多少?
感谢任何help/insights。
我很确定在通读 PyMC3
Hierarchical Partial Pooling example 之后我能够弄明白这一点。按顺序回答问题:
是的,这就是 Wales vs Italy
场比赛的分布(因为这是观测数据中的第一场比赛)。
为了预测Italy vs France
(因为两支球队在我们的原始数据集中没有交手),我们需要预测thetas
.
这是更新模型的代码:
# Setup the model similarly to the previous one...
with pm.Model() as model:
# Global model parameters
home = pm.Normal('home', 0, tau=.0001)
tau_att = pm.Gamma('tau_att', .1, .1)
tau_def = pm.Gamma('tau_def', .1, .1)
intercept = pm.Normal('intercept', 0, tau=.0001)
# Team-specific model parameters
atts_star = pm.Normal('atts_star', mu=0, tau=tau_att, shape=num_teams)
defs_star = pm.Normal('defs_star', mu=0, tau=tau_def, shape=num_teams)
atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])
# Likelihood of observed data
home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
# Now for predictions with no games played...
with model:
# IDs from `teams` DataFrame
italy, france = 4, 1
# New `thetas` for Italy vs France predictions
pred_home_theta = tt.exp(intercept + home + atts[italy] + defs[france])
pred_away_theta = tt.exp(intercept + atts[france] + defs[italy])
pred_home_points = pm.Poisson('pred_home_points', mu=pred_home_theta)
pred_away_points = pm.Poisson('pred_away_points', mu=pred_away_theta)
# Sample the final model
with model:
start = pm.find_MAP()
step = pm.NUTS(state=start)
trace = pm.sample(20000, step, init=start)
trace
完成后,我们可以绘制预测:
# Use 5,000 as MCMC burn in
pred = pd.DataFrame({
"italy": trace["pred_home_points"][5000:],
"france": trace["pred_away_points"][5000:],
})
# Plot the distributions
sns.kdeplot(pred.italy, shade=True, label="Italy")
sns.kdeplot(pred.france, shade=True, label="France")
plt.show()
意大利在主场赢球的频率是多少?
# 19% of the time
(pred.italy > pred.france).mean()
两支球队得分低于 15 分的频率是多少?
# 0.7% of the time
1.0 * len(pred[(pred.italy <= 15) & (pred.france <= 15)]) / len(pred)
我刚刚开始阅读 PyMC3
documentation (I'm much more comfortable with sklearn
) and came across the Rugby hierarchical model example:
# Imports and Rugby data setup -- model in next section
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
import matplotlib.pyplot as plt
import seaborn as sns
games = [
['Wales', 'Italy', 23, 15],
['France', 'England', 26, 24],
['Ireland', 'Scotland', 28, 6],
['Ireland', 'Wales', 26, 3],
['Scotland', 'England', 0, 20],
['France', 'Italy', 30, 10],
['Wales', 'France', 27, 6],
['Italy', 'Scotland', 20, 21],
['England', 'Ireland', 13, 10],
['Ireland', 'Italy', 46, 7],
['Scotland', 'France', 17, 19],
['England', 'Wales', 29, 18],
['Italy', 'England', 11, 52],
['Wales', 'Scotland', 51, 3],
['France', 'Ireland', 20, 22],
]
columns = ['home_team', 'away_team', 'home_score', 'away_score']
df = pd.DataFrame(games, columns=columns)
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_home'}).drop('team', 1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_away'}).drop('team', 1)
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())
这是主要的 PyMC3
模型设置:
with pm.Model() as model:
# Global model parameters
home = pm.Normal('home', 0, tau=.0001)
tau_att = pm.Gamma('tau_att', .1, .1)
tau_def = pm.Gamma('tau_def', .1, .1)
intercept = pm.Normal('intercept', 0, tau=.0001)
# Team-specific model parameters
atts_star = pm.Normal('atts_star', mu=0, tau=tau_att, shape=num_teams)
defs_star = pm.Normal('defs_star', mu=0, tau=tau_def, shape=num_teams)
atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])
# Likelihood of observed data
home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
start = pm.find_MAP()
step = pm.NUTS(state=start)
trace = pm.sample(20000, step, init=start)
我知道如何绘制 trace
:
pm.traceplot(trace[5000:])
并生成posterior predictive samples:
ppc = pm.sample_ppc(trace[5000:], samples=500, model=model)
我不确定的地方:我该如何提问model/posterior?
例如,我假设 Wales vs Italy
场比赛的得分分布为:
# Wales vs Italy is the first matchup in our dataset
home_wales = ppc['home_points'][:, 0]
away_italy = ppc['away_points'][:, 0]
但是原始数据中没有记录的比赛呢?
- 如果意大利在主场迎战法国,他们的比分分布是怎样的?
- 如果意大利在主场迎战法国,两队得分都低于 15 分的频率是多少?
感谢任何help/insights。
我很确定在通读 PyMC3
Hierarchical Partial Pooling example 之后我能够弄明白这一点。按顺序回答问题:
是的,这就是
Wales vs Italy
场比赛的分布(因为这是观测数据中的第一场比赛)。为了预测
Italy vs France
(因为两支球队在我们的原始数据集中没有交手),我们需要预测thetas
.
这是更新模型的代码:
# Setup the model similarly to the previous one...
with pm.Model() as model:
# Global model parameters
home = pm.Normal('home', 0, tau=.0001)
tau_att = pm.Gamma('tau_att', .1, .1)
tau_def = pm.Gamma('tau_def', .1, .1)
intercept = pm.Normal('intercept', 0, tau=.0001)
# Team-specific model parameters
atts_star = pm.Normal('atts_star', mu=0, tau=tau_att, shape=num_teams)
defs_star = pm.Normal('defs_star', mu=0, tau=tau_def, shape=num_teams)
atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])
# Likelihood of observed data
home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
# Now for predictions with no games played...
with model:
# IDs from `teams` DataFrame
italy, france = 4, 1
# New `thetas` for Italy vs France predictions
pred_home_theta = tt.exp(intercept + home + atts[italy] + defs[france])
pred_away_theta = tt.exp(intercept + atts[france] + defs[italy])
pred_home_points = pm.Poisson('pred_home_points', mu=pred_home_theta)
pred_away_points = pm.Poisson('pred_away_points', mu=pred_away_theta)
# Sample the final model
with model:
start = pm.find_MAP()
step = pm.NUTS(state=start)
trace = pm.sample(20000, step, init=start)
trace
完成后,我们可以绘制预测:
# Use 5,000 as MCMC burn in
pred = pd.DataFrame({
"italy": trace["pred_home_points"][5000:],
"france": trace["pred_away_points"][5000:],
})
# Plot the distributions
sns.kdeplot(pred.italy, shade=True, label="Italy")
sns.kdeplot(pred.france, shade=True, label="France")
plt.show()
意大利在主场赢球的频率是多少?
# 19% of the time
(pred.italy > pred.france).mean()
两支球队得分低于 15 分的频率是多少?
# 0.7% of the time
1.0 * len(pred[(pred.italy <= 15) & (pred.france <= 15)]) / len(pred)