如何通过 xarray 在 Python 中进行回归?
How to do regression in Python via xarray?
我正在尝试对我的时间序列数据 X 和 Y 进行逐日回归,即用前一个日期的 X 数据对当前日期的 Y 值做回归。X 是一个三维 DataArray,维度为日期、股票和因子;Y 是一个二维 DataArray,维度为日期和股票。有人能告诉我如何高效地做到这一点吗?
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import xarray as xr
import os
import warnings
from functools import reduce
import math as mt
import statsmodels.api as sm
from lib.gftTools import gftIO
import datetime
import logging
# Day-by-day robust regression of Y(t) on X(t-1).
#
# X has dims (date, stock, factor) and Y has dims (date, stock).  For every
# date t except the first, the cross-section Y[t] is regressed on the factor
# values X[t-1] over the stocks that have no missing value on either side.
# The fitted coefficients go to `params` (one row per date, one column per
# factor) and the residuals to `residuals` (one row per date, one column per
# stock).  Dates or stocks that could not be fitted stay NaN.
logger = logging.getLogger(__name__)

# Random sample data standing in for the real inputs.
time = pd.date_range('2000-01-01', freq='D', periods=365)
X = xr.DataArray(
    np.random.randn(365, 10, 3),
    [('date', time), ('stock', list('abcdefghij')), ('factor', list('xyz'))])
Y = xr.DataArray(
    np.random.randn(365, 10),
    [('date', time), ('stock', list('abcdefghij'))])

# pandas indexes of the labelled dimensions
idx_date = Y.get_index('date')
idx_stock = X.get_index('stock')
idx_factor = X.get_index('factor')

# create the regression result dataframes (NaN until a date is fitted)
params = pd.DataFrame(index=idx_date, columns=idx_factor, dtype=float)
residuals = pd.DataFrame(index=idx_date, columns=idx_stock, dtype=float)

# NOTE: the previous day's X is looked up by position, so X and Y are assumed
# to share the same date index (true for the sample data above).
for dt_pos, cur_date in enumerate(idx_date):
    logger.debug('regression on %s', cur_date)
    if dt_pos == 0:
        # the first date has no previous-day X to regress on
        continue

    # previous day's factor values as (stock, factor); today's Y as (stock,)
    x_prev = X.isel(date=dt_pos - 1).transpose('stock', 'factor')
    y_cur = Y.isel(date=dt_pos)

    # stocks having valid values (not NaN) in every factor and in Y
    x_ok = x_prev.notnull().all(dim='factor').values
    y_ok = y_cur.notnull().values
    valid_stock = np.intersect1d(x_prev['stock'].values[x_ok],
                                 y_cur['stock'].values[y_ok])
    if valid_stock.size == 0:
        logger.debug('no valid stocks on %s, skipped', cur_date)
        continue

    try:
        # endog is (n_stocks,), exog is (n_stocks, n_factors)
        results = sm.RLM(
            y_cur.sel(stock=valid_stock).values,
            x_prev.sel(stock=valid_stock).values,
            M=sm.robust.norms.HuberT()).fit()
    except ValueError:
        # best effort: leave this date as NaN, but record why it was skipped
        logger.warning('regression failed on %s', cur_date, exc_info=True)
        continue

    params.loc[cur_date] = results.params
    residuals.loc[cur_date, valid_stock] = results.resid
我正在尝试对我的时间序列数据 X 和 Y 进行逐日回归,即用前一个日期的 X 数据对当前日期的 Y 值做回归。X 是一个三维 DataArray,维度为日期、股票和因子;Y 是一个二维 DataArray,维度为日期和股票。有人能告诉我如何高效地做到这一点吗?
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import xarray as xr
import os
import warnings
from functools import reduce
import math as mt
import statsmodels.api as sm
from lib.gftTools import gftIO
import datetime
import logging
# Day-by-day robust regression of Y(t) on X(t-1).
#
# X has dims (date, stock, factor) and Y has dims (date, stock).  For every
# date t except the first, the cross-section Y[t] is regressed on the factor
# values X[t-1] over the stocks that have no missing value on either side.
# The fitted coefficients go to `params` (one row per date, one column per
# factor) and the residuals to `residuals` (one row per date, one column per
# stock).  Dates or stocks that could not be fitted stay NaN.
logger = logging.getLogger(__name__)

# Random sample data standing in for the real inputs.
time = pd.date_range('2000-01-01', freq='D', periods=365)
X = xr.DataArray(
    np.random.randn(365, 10, 3),
    [('date', time), ('stock', list('abcdefghij')), ('factor', list('xyz'))])
Y = xr.DataArray(
    np.random.randn(365, 10),
    [('date', time), ('stock', list('abcdefghij'))])

# pandas indexes of the labelled dimensions
idx_date = Y.get_index('date')
idx_stock = X.get_index('stock')
idx_factor = X.get_index('factor')

# create the regression result dataframes (NaN until a date is fitted)
params = pd.DataFrame(index=idx_date, columns=idx_factor, dtype=float)
residuals = pd.DataFrame(index=idx_date, columns=idx_stock, dtype=float)

# NOTE: the previous day's X is looked up by position, so X and Y are assumed
# to share the same date index (true for the sample data above).
for dt_pos, cur_date in enumerate(idx_date):
    logger.debug('regression on %s', cur_date)
    if dt_pos == 0:
        # the first date has no previous-day X to regress on
        continue

    # previous day's factor values as (stock, factor); today's Y as (stock,)
    x_prev = X.isel(date=dt_pos - 1).transpose('stock', 'factor')
    y_cur = Y.isel(date=dt_pos)

    # stocks having valid values (not NaN) in every factor and in Y
    x_ok = x_prev.notnull().all(dim='factor').values
    y_ok = y_cur.notnull().values
    valid_stock = np.intersect1d(x_prev['stock'].values[x_ok],
                                 y_cur['stock'].values[y_ok])
    if valid_stock.size == 0:
        logger.debug('no valid stocks on %s, skipped', cur_date)
        continue

    try:
        # endog is (n_stocks,), exog is (n_stocks, n_factors)
        results = sm.RLM(
            y_cur.sel(stock=valid_stock).values,
            x_prev.sel(stock=valid_stock).values,
            M=sm.robust.norms.HuberT()).fit()
    except ValueError:
        # best effort: leave this date as NaN, but record why it was skipped
        logger.warning('regression failed on %s', cur_date, exc_info=True)
        continue

    params.loc[cur_date] = results.params
    residuals.loc[cur_date, valid_stock] = results.resid