在 pandas 中简单应用滚动回归
rolling regression with a simple apply in pandas
考虑这个简单的例子
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
df = pd.DataFrame({'a':[1,3,5,7,4,5,6,4,7,8,9,1,3,5,7,4,5,6,4,7,8,9],
'b':[3,5,6,2,4,6,2,5,7,1,9,5,3,2,5,4,3,6,4,1,1,9]})
我正在尝试执行 a 对 b 的滚动回归。我想使用 pandas 中最简单的可用工具:apply。
之所以想用 apply,是因为我希望保留返回任意回归参数的灵活性。
然而,下面的简单代码不起作用
df.rolling(10).apply(lambda x: smf.ols('a ~ b', data = x).fit())
File "<string>", line 1, in <module>
PatsyError: Error evaluating factor: NameError: name 'b' is not defined
a ~ b
^
问题是什么?
谢谢!
rolling apply 无法同时与多个列交互,也无法生成非数字值。因此,我们需要利用 rolling 对象的可迭代特性。
我们还需要自己处理 min_periods,因为迭代 rolling 对象时会生成所有窗口的结果,而不受其他 rolling 参数的影响。
然后我们可以创建一些函数来生成回归结果中的每一行,以执行如下操作:
def process(x, min_periods=10):
    """Fit the OLS model ``a ~ b`` on one window and summarise the ``b`` term.

    Args:
        x: One window of data exposing columns ``a`` and ``b`` (for example a
            DataFrame yielded while iterating ``df.rolling(...)``).
        min_periods: Smallest number of rows a window must contain before a
            regression is fitted. Defaults to 10, the window size used in
            this example.

    Returns:
        A 4-element list ``[coef, t, lower, upper]`` for ``b``: its estimated
        parameter, its t-value, and the two bounds from ``reg.conf_int()``.
        Windows shorter than ``min_periods`` yield four NaNs instead.
    """
    if len(x) < min_periods:
        # Keep the same width as a real result so every row fits the
        # same four output columns.
        return [np.nan] * 4

    reg = smf.ols('a ~ b', data=x).fit()
    return [
        # b from params
        reg.params['b'],
        # b from tvalues
        reg.tvalues['b'],
        # Both lower and upper b from conf_int()
        *reg.conf_int().loc['b', :].tolist(),
    ]
# Run one regression per rolling window and attach the results to df.
# Iterating the rolling object yields every window, including the short
# leading ones, so process() is what turns those into NaN rows.
df = df.join(
    # join new DataFrame back to original
    pd.DataFrame(
        [process(window) for window in df.rolling(10)],
        columns=['coef', 't', 'lower', 'upper'],
        # Reuse df's own index: join aligns on index labels, so a default
        # 0..n-1 index would only line up when df has a default index too.
        index=df.index,
    )
)
df
:
a b coef t lower upper
0 1 3 NaN NaN NaN NaN
1 3 5 NaN NaN NaN NaN
2 5 6 NaN NaN NaN NaN
3 7 2 NaN NaN NaN NaN
4 4 4 NaN NaN NaN NaN
5 5 6 NaN NaN NaN NaN
6 6 2 NaN NaN NaN NaN
7 4 5 NaN NaN NaN NaN
8 7 7 NaN NaN NaN NaN
9 8 1 -0.216802 -0.602168 -1.047047 0.613442
10 9 9 0.042781 0.156592 -0.587217 0.672778
11 1 5 0.032086 0.097763 -0.724742 0.788913
12 3 3 0.113475 0.329006 -0.681872 0.908822
13 5 2 0.198582 0.600297 -0.564258 0.961421
14 7 5 0.203540 0.611002 -0.564646 0.971726
15 4 4 0.236599 0.686744 -0.557872 1.031069
16 5 3 0.293651 0.835945 -0.516403 1.103704
17 6 6 0.314286 0.936382 -0.459698 1.088269
18 4 4 0.276316 0.760812 -0.561191 1.113823
19 7 1 0.346491 1.028220 -0.430590 1.123572
20 8 1 -0.492424 -1.234601 -1.412181 0.427332
21 9 9 0.235075 0.879433 -0.381326 0.851476
设置:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
df = pd.DataFrame({
'a': [1, 3, 5, 7, 4, 5, 6, 4, 7, 8, 9, 1, 3, 5, 7, 4, 5, 6, 4, 7, 8, 9],
'b': [3, 5, 6, 2, 4, 6, 2, 5, 7, 1, 9, 5, 3, 2, 5, 4, 3, 6, 4, 1, 1, 9]
})
Rolling.apply 会分别对每一列应用滚动操作(参见相关问题)。
按照 user3226167 的回答,要实现你想要的效果,最简单的方法似乎是使用 statsmodels.regression.rolling 中的 RollingOLS.from_formula。
# Alternative to looping over windows by hand: RollingOLS fits the formula
# 'a ~ b' over a moving 10-row window in a single call.
from statsmodels.regression.rolling import RollingOLS
df = pd.DataFrame({'a':[1,3,5,7,4,5,6,4,7,8,9,1,3,5,7,4,5,6,4,7,8,9],
'b':[3,5,6,2,4,6,2,5,7,1,9,5,3,2,5,4,3,6,4,1,1,9]})
model = RollingOLS.from_formula('a ~ b', data = df, window=10)
reg_obj = model.fit()
# estimated coefficient of b for each window, renamed to the output column 'coef'
b_coeff = reg_obj.params['b'].rename('coef')
# b t-value for each window, renamed to the output column 't'
b_t_val = reg_obj.tvalues['b'].rename('t')
# 95 % confidence interval of b
# cols=[1] picks the term by position -- presumably index 0 is the intercept
# and 1 is b; droplevel then removes the outer column level so the remaining
# column names are just the bound labels (shown as 'lower'/'upper' in the output).
# NOTE(review): the bounds printed for this snippet differ from the ones the
# per-window smf.ols fits produce, although coef and t agree. Presumably
# RollingOLS.fit() defaults to normal-based intervals (use_t=False) while
# OLS uses the t distribution -- confirm in the statsmodels docs and pass
# use_t=True to fit() if t-based intervals are wanted.
b_conf_int = reg_obj.conf_int(cols=[1]).droplevel(level=0, axis=1)
# join all the desired information to the original df
df = df.join([b_coeff, b_t_val, b_conf_int])
其中 reg_obj 是一个 RollingRegressionResults 对象,它包含大量有关回归的信息(所有可用属性请参阅文档)。
输出
>>> type(reg_obj)
<class 'statsmodels.regression.rolling.RollingRegressionResults'>
>>> df
a b coef t lower upper
0 1 3 NaN NaN NaN NaN
1 3 5 NaN NaN NaN NaN
2 5 6 NaN NaN NaN NaN
3 7 2 NaN NaN NaN NaN
4 4 4 NaN NaN NaN NaN
5 5 6 NaN NaN NaN NaN
6 6 2 NaN NaN NaN NaN
7 4 5 NaN NaN NaN NaN
8 7 7 NaN NaN NaN NaN
9 8 1 -0.216802 -0.602168 -0.922460 0.488856
10 9 9 0.042781 0.156592 -0.492679 0.578240
11 1 5 0.032086 0.097763 -0.611172 0.675343
12 3 3 0.113475 0.329006 -0.562521 0.789472
13 5 2 0.198582 0.600297 -0.449786 0.846949
14 7 5 0.203540 0.611002 -0.449372 0.856452
15 4 4 0.236599 0.686744 -0.438653 0.911851
16 5 3 0.293651 0.835945 -0.394846 0.982147
17 6 6 0.314286 0.936382 -0.343553 0.972125
18 4 4 0.276316 0.760812 -0.435514 0.988146
19 7 1 0.346491 1.028220 -0.313981 1.006963
20 8 1 -0.492424 -1.234601 -1.274162 0.289313
21 9 9 0.235075 0.879433 -0.288829 0.758978
考虑这个简单的例子
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
df = pd.DataFrame({'a':[1,3,5,7,4,5,6,4,7,8,9,1,3,5,7,4,5,6,4,7,8,9],
'b':[3,5,6,2,4,6,2,5,7,1,9,5,3,2,5,4,3,6,4,1,1,9]})
我正在尝试执行 a 对 b 的滚动回归。我想使用 pandas 中最简单的可用工具:apply。
之所以想用 apply,是因为我希望保留返回任意回归参数的灵活性。
然而,下面的简单代码不起作用
df.rolling(10).apply(lambda x: smf.ols('a ~ b', data = x).fit())
File "<string>", line 1, in <module>
PatsyError: Error evaluating factor: NameError: name 'b' is not defined
a ~ b
^
问题是什么? 谢谢!
rolling apply 无法同时与多个列交互,也无法生成非数字值。因此,我们需要利用 rolling 对象的可迭代特性。
我们还需要自己处理 min_periods,因为迭代 rolling 对象时会生成所有窗口的结果,而不受其他 rolling 参数的影响。
然后我们可以创建一些函数来生成回归结果中的每一行,以执行如下操作:
def process(x, min_periods=10):
    """Fit the OLS model ``a ~ b`` on one window and summarise the ``b`` term.

    Args:
        x: One window of data exposing columns ``a`` and ``b`` (for example a
            DataFrame yielded while iterating ``df.rolling(...)``).
        min_periods: Smallest number of rows a window must contain before a
            regression is fitted. Defaults to 10, the window size used in
            this example.

    Returns:
        A 4-element list ``[coef, t, lower, upper]`` for ``b``: its estimated
        parameter, its t-value, and the two bounds from ``reg.conf_int()``.
        Windows shorter than ``min_periods`` yield four NaNs instead.
    """
    if len(x) < min_periods:
        # Keep the same width as a real result so every row fits the
        # same four output columns.
        return [np.nan] * 4

    reg = smf.ols('a ~ b', data=x).fit()
    return [
        # b from params
        reg.params['b'],
        # b from tvalues
        reg.tvalues['b'],
        # Both lower and upper b from conf_int()
        *reg.conf_int().loc['b', :].tolist(),
    ]
# Run one regression per rolling window and attach the results to df.
# Iterating the rolling object yields every window, including the short
# leading ones, so process() is what turns those into NaN rows.
df = df.join(
    # join new DataFrame back to original
    pd.DataFrame(
        [process(window) for window in df.rolling(10)],
        columns=['coef', 't', 'lower', 'upper'],
        # Reuse df's own index: join aligns on index labels, so a default
        # 0..n-1 index would only line up when df has a default index too.
        index=df.index,
    )
)
df
:
a b coef t lower upper
0 1 3 NaN NaN NaN NaN
1 3 5 NaN NaN NaN NaN
2 5 6 NaN NaN NaN NaN
3 7 2 NaN NaN NaN NaN
4 4 4 NaN NaN NaN NaN
5 5 6 NaN NaN NaN NaN
6 6 2 NaN NaN NaN NaN
7 4 5 NaN NaN NaN NaN
8 7 7 NaN NaN NaN NaN
9 8 1 -0.216802 -0.602168 -1.047047 0.613442
10 9 9 0.042781 0.156592 -0.587217 0.672778
11 1 5 0.032086 0.097763 -0.724742 0.788913
12 3 3 0.113475 0.329006 -0.681872 0.908822
13 5 2 0.198582 0.600297 -0.564258 0.961421
14 7 5 0.203540 0.611002 -0.564646 0.971726
15 4 4 0.236599 0.686744 -0.557872 1.031069
16 5 3 0.293651 0.835945 -0.516403 1.103704
17 6 6 0.314286 0.936382 -0.459698 1.088269
18 4 4 0.276316 0.760812 -0.561191 1.113823
19 7 1 0.346491 1.028220 -0.430590 1.123572
20 8 1 -0.492424 -1.234601 -1.412181 0.427332
21 9 9 0.235075 0.879433 -0.381326 0.851476
设置:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
df = pd.DataFrame({
'a': [1, 3, 5, 7, 4, 5, 6, 4, 7, 8, 9, 1, 3, 5, 7, 4, 5, 6, 4, 7, 8, 9],
'b': [3, 5, 6, 2, 4, 6, 2, 5, 7, 1, 9, 5, 3, 2, 5, 4, 3, 6, 4, 1, 1, 9]
})
Rolling.apply 会分别对每一列应用滚动操作(参见相关问题)。
按照 user3226167 的回答,要实现你想要的效果,最简单的方法似乎是使用 statsmodels.regression.rolling 中的 RollingOLS.from_formula。
# Alternative to looping over windows by hand: RollingOLS fits the formula
# 'a ~ b' over a moving 10-row window in a single call.
from statsmodels.regression.rolling import RollingOLS
df = pd.DataFrame({'a':[1,3,5,7,4,5,6,4,7,8,9,1,3,5,7,4,5,6,4,7,8,9],
'b':[3,5,6,2,4,6,2,5,7,1,9,5,3,2,5,4,3,6,4,1,1,9]})
model = RollingOLS.from_formula('a ~ b', data = df, window=10)
reg_obj = model.fit()
# estimated coefficient of b for each window, renamed to the output column 'coef'
b_coeff = reg_obj.params['b'].rename('coef')
# b t-value for each window, renamed to the output column 't'
b_t_val = reg_obj.tvalues['b'].rename('t')
# 95 % confidence interval of b
# cols=[1] picks the term by position -- presumably index 0 is the intercept
# and 1 is b; droplevel then removes the outer column level so the remaining
# column names are just the bound labels (shown as 'lower'/'upper' in the output).
# NOTE(review): the bounds printed for this snippet differ from the ones the
# per-window smf.ols fits produce, although coef and t agree. Presumably
# RollingOLS.fit() defaults to normal-based intervals (use_t=False) while
# OLS uses the t distribution -- confirm in the statsmodels docs and pass
# use_t=True to fit() if t-based intervals are wanted.
b_conf_int = reg_obj.conf_int(cols=[1]).droplevel(level=0, axis=1)
# join all the desired information to the original df
df = df.join([b_coeff, b_t_val, b_conf_int])
其中 reg_obj 是一个 RollingRegressionResults 对象,它包含大量有关回归的信息(所有可用属性请参阅文档)。
输出
>>> type(reg_obj)
<class 'statsmodels.regression.rolling.RollingRegressionResults'>
>>> df
a b coef t lower upper
0 1 3 NaN NaN NaN NaN
1 3 5 NaN NaN NaN NaN
2 5 6 NaN NaN NaN NaN
3 7 2 NaN NaN NaN NaN
4 4 4 NaN NaN NaN NaN
5 5 6 NaN NaN NaN NaN
6 6 2 NaN NaN NaN NaN
7 4 5 NaN NaN NaN NaN
8 7 7 NaN NaN NaN NaN
9 8 1 -0.216802 -0.602168 -0.922460 0.488856
10 9 9 0.042781 0.156592 -0.492679 0.578240
11 1 5 0.032086 0.097763 -0.611172 0.675343
12 3 3 0.113475 0.329006 -0.562521 0.789472
13 5 2 0.198582 0.600297 -0.449786 0.846949
14 7 5 0.203540 0.611002 -0.449372 0.856452
15 4 4 0.236599 0.686744 -0.438653 0.911851
16 5 3 0.293651 0.835945 -0.394846 0.982147
17 6 6 0.314286 0.936382 -0.343553 0.972125
18 4 4 0.276316 0.760812 -0.435514 0.988146
19 7 1 0.346491 1.028220 -0.313981 1.006963
20 8 1 -0.492424 -1.234601 -1.274162 0.289313
21 9 9 0.235075 0.879433 -0.288829 0.758978