三次样条回归与sklearn?

cubic spline regression with sklearn?

我想非常准确地回归一个非线性依赖于单个变量 x 的目标。在我的 sklearn 管道中,我使用:

pipe = Pipeline([('poly', PolynomialFeatures(3, include_bias=False)), \
                     ('regr', ElasticNet(random_state=0))])

这似乎在准确性方面给出了与 np.polyfit(x, y, 3) 相似的结果。但是,我基本上可以通过使用三次样条来提高机器精度。请参见下图,其中我显示了数据和各种拟合以及残差。 [注意:下面的示例有 50 个样本。我现实中有2000个样本]

我有两个问题:

  1. 知道为什么 polyfitpolyfeat + Elasticnet 无法达到相同的准确度水平吗?
  2. scikit-learn中有三次样条目标回归的例子吗?
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.interpolate import interp1d
    %matplotlib inline
    
    data = pd.read_csv('example.txt') # added to this post below
    
    p = np.polyfit(data['x'], data['y'], 3)
    data['polyfit'] = np.poly1d(p)(data['x'])
    f = interp1d(data['x'], data['y'], kind='cubic')
    data['spline'] = f(data['x'])
    
    fig, axes = plt.subplots(nrows=3, sharex=True)
    
    axes[0].plot(data['x'], data['polyfit'],'.', label='polyfit')
    axes[0].plot(data['x'], data['spline'],'.', label='spline')
    axes[0].plot(data['x'], data['y'],'.', label='true')
    axes[0].legend()
    
    axes[1].plot(data['x'], data['polyfit']-data['y'],'.', label='error polyfit')
    axes[1].legend()
    
    axes[2].plot(data['x'], data['spline']-data['y'],'.', label='error spline')
    axes[2].legend()
    
    plt.show()

数据如下:

example.txt:

    ,x,y
257,6.26028462060192,-1233.748349982897
944,4.557099191827032,928.1430280794456
1560,6.765081341690966,-1807.9090703632864
504,4.0015671921214775,1683.311523022658
1499,3.0496689401255783,3055.291788377236
1247,5.608726443314061,-441.9226126757499
1856,4.6124942196224845,845.129184983355
1495,1.273838638033053,5479.078773760113
1052,5.353775782315625,-115.14032709875217
247,2.6495185259267076,3656.7467318648232
1841,9.73337795053495,-4884.806993807511
1574,1.1772247845133335,5544.080005636716
1116,5.698561786140379,-555.3435567718
1489,4.184371293153768,1427.6922357286753
603,1.568868565047676,5179.156099377357
358,4.534081088923849,960.3983442022857
774,9.304809492028289,-4468.215701489676
1525,9.17423541311121,-4340.565494266174
1159,6.705834877066449,-1750.189447626367
1959,3.0431599461645207,3065.358649171256
1086,1.3861557136230234,5378.274828554064
81,4.728366950632029,682.7245723055514
1791,6.954198834068505,-2027.0414501796324
234,2.8672306789699844,3330.7282514295102
1850,2.0086469278742363,4603.0931759401155
1531,9.843164998128215,-4973.735518791005
903,1.534448692052103,5220.331847067942
1258,7.243723209152924,-2354.629822080041
645,2.3302780902754514,4128.077572586273
1425,3.295574067849755,2694.766296765896
311,2.3225198086033756,4152.206609684557
219,8.479436097125713,-3665.2515034579396
1917,7.1524135031820135,-2253.3455629418195
1412,6.79800860136838,-1861.3756670478142
705,1.9001265482939966,4756.283634364785
663,3.441268690856777,2489.7632239249424
1871,6.473544271091015,-1480.6593600880415
1897,8.217615163361007,-3386.5427698021977
558,6.609652057181176,-1634.1672307700298
553,5.679571371137544,-524.352981663938
1847,6.487178186324092,-1500.1891501936236
752,9.377368455681758,-4548.188126821915
1469,8.586759667609758,-3771.691600599668
1794,6.649801445466815,-1674.4870918398076
968,1.6226439291315056,5117.8804886837
108,3.0077346937655647,3118.0786841570025
96,6.278616413290749,-1245.4758811316083
994,7.631678455127069,-2767.3224262153176
871,2.6696610777085863,3630.02481913033
1405,9.209358577104299,-4368.622350004463

前两种方法对您的数据进行单个三次方程,但是(顾名思义)interp1d 插值 具有三次样条的数据:即, 每个 对连续的点都有一条三次曲线,因此可以保证完美拟合(达到计算精度)。

我想 post 跟进我之前的问题。使用 scikit-learn 可以拟合基于 B 样条的模型,其复杂度有限(预定义的样条数量——不像 interp1d 那样随着点数的增加而增长)。以下代码摘自robust_splines_sklearn.py.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import splrep, splev

from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, ElasticNet

from sklearn.metrics import mean_squared_error

def get_bspline_basis(knots, degree=3, periodic=False):
    """Get spline coefficients for each basis spline."""
    nknots = len(knots)
    y_dummy = np.zeros(nknots)

    knots, coeffs, degree = splrep(knots, y_dummy, k=degree,
                                      per=periodic)
    ncoeffs = len(coeffs)
    bsplines = []
    for ispline in range(nknots):
        coeffs = [1.0 if ispl == ispline else 0.0 for ispl in range(ncoeffs)]
        bsplines.append((knots, coeffs, degree))
    return bsplines

class BSplineFeatures(TransformerMixin):
    def __init__(self, knots, degree=3, periodic=False):
        self.bsplines = get_bspline_basis(knots, degree, periodic=periodic)
        self.nsplines = len(self.bsplines)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        nsamples, nfeatures = X.shape
        features = np.zeros((nsamples, nfeatures * self.nsplines))
        for ispline, spline in enumerate(self.bsplines):
            istart = ispline * nfeatures
            iend = (ispline + 1) * nfeatures
            features[:, istart:iend] = splev(X, spline)
        return features

    
data = pd.read_csv('example.txt') 
knots = np.array([1, 1.5, 2, 3, 4, 5, 7.5, 10])

bspline_features = BSplineFeatures(knots, degree=3, periodic=False)
base_regressor = LinearRegression()


pipe1 = Pipeline([('poly', PolynomialFeatures(3, include_bias=False)), \
                  ('regr', ElasticNet(random_state=0))])

pipe2 = Pipeline([('feature', bspline_features), \
                  ('regression', base_regressor)])

models = {"polyfit": pipe1, "spline": pipe2}
    
X = data['x'].to_numpy().reshape((data.shape[0], 1))
y = data['y']

for label, model in models.items():
    model.fit(X, y)
    data[label] = model.predict(X)
    
fig, axes = plt.subplots(nrows=3, sharex=True)

axes[0].plot(data['x'], data['polyfit'],'.', label='polyfit')
axes[0].plot(data['x'], data['spline'],'.', label='spline')
axes[0].plot(data['x'], data['y'],'.', label='true')
axes[0].legend()

axes[1].plot(data['x'], data['polyfit']-data['y'],'.', label='error polyfit')
axes[1].legend()

axes[2].plot(data['x'], data['spline']-data['y'],'.', label='error spline')
axes[2].legend()

plt.show()