在 StatsModels 中返回样本外预测的标准差和置信区间
Return std and confidence intervals for out-of-sample prediction in StatsModels
我想从 OLS 模型中找出样本外预测的标准差和置信区间。
这个问题类似于 Confidence intervals for model prediction,但明确关注使用样本外数据。
我设想的是一个类似 wls_prediction_std(lm, data_to_use_for_prediction=out_of_sample_df) 的函数,
它能针对该样本外数据帧返回 prstd, iv_l, iv_u。
例如:
import pandas as pd
import random
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Training data: the response y plus two regressors with random noise added.
df = pd.DataFrame({
    "y": list(range(10)),
    "x1": [(x * 5 + random.random() * 2) for x in range(10)],
    "x2": [(x * 2.1 + random.random()) for x in range(10)],
})
# Out-of-sample data: regressors only, there is no "y" column.
out_of_sample_df = pd.DataFrame({
    "x1": [(x * 3 + random.random() * 2) for x in range(10)],
    "x2": [(x + random.random()) for x in range(10)],
})
formula_string = "y ~ x1 + x2"
lm = smf.ols(formula=formula_string, data=df).fit()

# Prediction works fine:
print(lm.predict(out_of_sample_df))

# I can also get std and CI for in-sample data:
prstd, iv_l, iv_u = wls_prediction_std(lm)
print(prstd)

# I cannot figure out how to get std and CI for out-of-sample data:
try:
    print(wls_prediction_std(lm, exog=out_of_sample_df))
except ValueError as e:
    print(str(e))
# returns "ValueError: wrong shape of exog"

# trying to concatenate the DFs:
df_both = pd.concat([df, out_of_sample_df], ignore_index=True)
# Only returns results for the data from df, not from out_of_sample_df
lm2 = smf.ols(formula=formula_string, data=df_both).fit()
prstd2, iv_l2, iv_u2 = wls_prediction_std(lm2)
print(prstd2)
看起来问题出在 exog 参数的格式上。下面的方法完全照搬自 GitHub 用户 thatneat 提供的变通方案(workaround);之所以需要它,是因为存在一个已知的 bug。
def transform_exog_to_model(fit, exog):
    """Shape raw predictor data the way ``fit``'s model expects its exog.

    The logic mirrors what ``statsmodels.base.model.Results.predict()`` does
    to its ``exog`` argument before predicting.

    Parameters
    ----------
    fit
        Fitted results object; only ``fit.model`` is read.
    exog
        New predictor data (e.g. a DataFrame or array-like), or ``None``.

    Returns
    -------
    A 2-D numpy array, or ``None`` when ``exog`` is ``None``.
    """
    # Local import: numpy is not imported at the top of this snippet.
    import numpy as np

    model = fit.model
    if exog is None:
        return None

    if hasattr(model, 'formula'):
        # Formula-based model: rebuild the design matrix from the design
        # info stored on the model's original exog.
        # NOTE(review): `.builder` may be deprecated in newer patsy releases
        # in favour of passing `design_info` directly -- confirm.
        from patsy import dmatrix
        exog = dmatrix(model.data.orig_exog.design_info.builder, exog)

    exog = np.asarray(exog)
    # A 1-D input for a single-column (or 1-D) model exog is treated as a
    # column of observations rather than a single row.
    if exog.ndim == 1 and (model.exog.ndim == 1 or model.exog.shape[1] == 1):
        exog = exog[:, None]
    return np.atleast_2d(exog)  # needed in count model shape[1]
# Convert the out-of-sample frame into the model's design-matrix layout,
# then ask for the prediction std and interval bounds on that matrix.
transformed_exog = transform_exog_to_model(lm, out_of_sample_df)
print(transformed_exog)
prstd2, iv_l2, iv_u2 = wls_prediction_std(
    lm,
    exog=transformed_exog,
    weights=[1],
)
print(prstd2)
另外,你也可以尝试使用 get_prediction 方法。
# Uses `lm`, the fitted OLS results object from the example script; the
# name `result` used originally is not defined anywhere in this snippet.
predictions = lm.get_prediction(out_of_sample_df)
# Print the frame so it is visible when run as a script, not only in a REPL.
print(predictions.summary_frame(alpha=0.05))
该方法会返回置信区间和预测区间。summary_frame() 方法的文档藏得比较深(原文此处附有链接),get_prediction() 方法的文档同样可以通过原文中的链接找到。你可以通过修改 "alpha" 参数来更改置信区间和预测区间的显著性水平。
我想从 OLS 模型中找出样本外预测的标准差和置信区间。
这个问题类似于 Confidence intervals for model prediction,但明确关注使用样本外数据。
我设想的是一个类似 wls_prediction_std(lm, data_to_use_for_prediction=out_of_sample_df) 的函数,
它能针对该样本外数据帧返回 prstd, iv_l, iv_u。
例如:
import pandas as pd
import random
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Training data: the response y plus two regressors with random noise added.
df = pd.DataFrame({
    "y": list(range(10)),
    "x1": [(x * 5 + random.random() * 2) for x in range(10)],
    "x2": [(x * 2.1 + random.random()) for x in range(10)],
})
# Out-of-sample data: regressors only, there is no "y" column.
out_of_sample_df = pd.DataFrame({
    "x1": [(x * 3 + random.random() * 2) for x in range(10)],
    "x2": [(x + random.random()) for x in range(10)],
})
formula_string = "y ~ x1 + x2"
lm = smf.ols(formula=formula_string, data=df).fit()

# Prediction works fine:
print(lm.predict(out_of_sample_df))

# I can also get std and CI for in-sample data:
prstd, iv_l, iv_u = wls_prediction_std(lm)
print(prstd)

# I cannot figure out how to get std and CI for out-of-sample data:
try:
    print(wls_prediction_std(lm, exog=out_of_sample_df))
except ValueError as e:
    print(str(e))
# returns "ValueError: wrong shape of exog"

# trying to concatenate the DFs:
df_both = pd.concat([df, out_of_sample_df], ignore_index=True)
# Only returns results for the data from df, not from out_of_sample_df
lm2 = smf.ols(formula=formula_string, data=df_both).fit()
prstd2, iv_l2, iv_u2 = wls_prediction_std(lm2)
print(prstd2)
看起来问题出在 exog 参数的格式上。下面的方法完全照搬自 GitHub 用户 thatneat 提供的变通方案(workaround);之所以需要它,是因为存在一个已知的 bug。
def transform_exog_to_model(fit, exog):
    """Shape raw predictor data the way ``fit``'s model expects its exog.

    The logic mirrors what ``statsmodels.base.model.Results.predict()`` does
    to its ``exog`` argument before predicting.

    Parameters
    ----------
    fit
        Fitted results object; only ``fit.model`` is read.
    exog
        New predictor data (e.g. a DataFrame or array-like), or ``None``.

    Returns
    -------
    A 2-D numpy array, or ``None`` when ``exog`` is ``None``.
    """
    # Local import: numpy is not imported at the top of this snippet.
    import numpy as np

    model = fit.model
    if exog is None:
        return None

    if hasattr(model, 'formula'):
        # Formula-based model: rebuild the design matrix from the design
        # info stored on the model's original exog.
        # NOTE(review): `.builder` may be deprecated in newer patsy releases
        # in favour of passing `design_info` directly -- confirm.
        from patsy import dmatrix
        exog = dmatrix(model.data.orig_exog.design_info.builder, exog)

    exog = np.asarray(exog)
    # A 1-D input for a single-column (or 1-D) model exog is treated as a
    # column of observations rather than a single row.
    if exog.ndim == 1 and (model.exog.ndim == 1 or model.exog.shape[1] == 1):
        exog = exog[:, None]
    return np.atleast_2d(exog)  # needed in count model shape[1]
# Convert the out-of-sample frame into the model's design-matrix layout,
# then ask for the prediction std and interval bounds on that matrix.
transformed_exog = transform_exog_to_model(lm, out_of_sample_df)
print(transformed_exog)
prstd2, iv_l2, iv_u2 = wls_prediction_std(
    lm,
    exog=transformed_exog,
    weights=[1],
)
print(prstd2)
另外,你也可以尝试使用 get_prediction 方法。
# Uses `lm`, the fitted OLS results object from the example script; the
# name `result` used originally is not defined anywhere in this snippet.
predictions = lm.get_prediction(out_of_sample_df)
# Print the frame so it is visible when run as a script, not only in a REPL.
print(predictions.summary_frame(alpha=0.05))
该方法会返回置信区间和预测区间。summary_frame() 方法的文档藏得比较深(原文此处附有链接),get_prediction() 方法的文档同样可以通过原文中的链接找到。你可以通过修改 "alpha" 参数来更改置信区间和预测区间的显著性水平。