使用 UnivariateSpline 紧密拟合数据

Use UnivariateSpline to fit data tightly

我有一堆代表 sigmoidal 函数的 x,y 点:

x=[ 1.00094909  1.08787635  1.17481363  1.2617564   1.34867881  1.43562284
  1.52259341  1.609522    1.69631283  1.78276102  1.86426648  1.92896789
  1.9464453   1.94941586  2.00062852  2.073691    2.14982808  2.22808316
  2.30634034  2.38456905  2.46280126  2.54106611  2.6193345   2.69748825]
y=[-0.10057627 -0.10172142 -0.10320428 -0.10378959 -0.10348456 -0.10312503
 -0.10276956 -0.10170055 -0.09778279 -0.08608644 -0.05797392  0.00063599
  0.08732999  0.16429878  0.2223306   0.25368884  0.26830932  0.27313931
  0.27308756  0.27048902  0.26626313  0.26139534  0.25634544  0.2509893 ]

我使用 scipy.interpolate.UnivariateSpline() 来拟合一些三次样条,如下所示:

from scipy.interpolate import UnivariateSpline
s = UnivariateSpline(x, y, k=3, s=0)

xfit = np.linspace(x.min(), x.max(), 200)
plt.scatter(x,y)
plt.plot(xfit, s(xfit))
plt.show()

这是我得到的:

由于我指定了 s=0,样条曲线完全符合数据,但有太多摆动。使用更高的 k 值会导致更多的摆动。

所以我的问题是——

  1. 我应该如何正确使用 scipy.interpolate.UnivariateSpline() 来拟合我的数据?更准确地说,如何使样条曲线的摆动最小化?
  2. 对于这种 S 型函数,这甚至是正确的选择吗?我应该使用 scipy.optimize.curve_fit() 之类的东西和试用 tanh(x) 函数吗?

有几个选项,我在下面列出几个。最后一个似乎给出了最好的输出。您应该使用样条曲线还是实际函数取决于您要对输出做什么;我在下面列出了两个可以使用的分析函数,但我不知道数据是在哪个上下文中得出的,因此很难找到最适合您的函数。

你可以玩 s,例如对于 s=0.005,情节看起来像这样(仍然不是很漂亮,但您可以进一步调整):

但我确实会使用 "proper" 函数并适合使用例如curve_fit。下面的函数仍然不理想,因为它是单调递增的,所以我们错过了最后的递减;情节如下所示:

这是样条曲线和实际拟合的完整代码:

from scipy.interpolate import UnivariateSpline
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit


def func(x, ymax, n, k, c):
    return ymax * x ** n / (k ** n + x ** n) + c

x=np.array([ 1.00094909,  1.08787635,  1.17481363,  1.2617564,   1.34867881,  1.43562284,
  1.52259341,  1.609522,    1.69631283,  1.78276102,  1.86426648,  1.92896789,
  1.9464453,   1.94941586,  2.00062852,  2.073691,    2.14982808,  2.22808316,
  2.30634034,  2.38456905,  2.46280126,  2.54106611,  2.6193345,   2.69748825])
y=np.array([-0.10057627, -0.10172142, -0.10320428, -0.10378959, -0.10348456, -0.10312503,
 -0.10276956, -0.10170055, -0.09778279, -0.08608644, -0.05797392,  0.00063599,
  0.08732999,  0.16429878,  0.2223306,   0.25368884,  0.26830932,  0.27313931,
  0.27308756,  0.27048902,  0.26626313,  0.26139534,  0.25634544,  0.2509893 ])


popt, pcov = curve_fit(func, x, y, p0=[y.max(), 2, 2, -0.1], bounds=([0, 0, 0, -0.2], [0.4, 45, 2000, 10]))
xfit = np.linspace(x.min(), x.max(), 200)
plt.scatter(x, y)
plt.plot(xfit, func(xfit, *popt))
plt.show()

s = UnivariateSpline(x, y, k=3, s=0.005)

xfit = np.linspace(x.min(), x.max(), 200)
plt.scatter(x, y)
plt.plot(xfit, s(xfit))
plt.show()

第三种选择是使用更高级的函数,该函数还可以重现末尾的减少和 differential_evolution 的拟合;这似乎是最合适的:

代码如下(使用与上述相同的数据):

from scipy.optimize import curve_fit, differential_evolution    

def sigmoid_with_decay(x, a, b, c, d, e, f):

    return a * (1. / (1. + np.exp(-b * (x - c)))) * (1. / (1. + np.exp(d * (x - e)))) + f

def error_sigmoid_with_decay(parameters, x_data, y_data):

    return np.sum((y_data - sigmoid_with_decay(x_data, *parameters)) ** 2)

res = differential_evolution(error_sigmoid_with_decay,
                             bounds=[(0, 10), (0, 25), (0, 10), (0, 10), (0, 10), (-1, 0.1)],
                             args=(x, y),
                             seed=42)

xfit = np.linspace(x.min(), x.max(), 200)
plt.scatter(x, y)
plt.plot(xfit, sigmoid_with_decay(xfit, *res.x))
plt.show()

拟合对边界非常敏感,所以当你玩那个的时候要小心......

这说明了将两半数据拟合到不同函数的结果,下半部分拟合 X < 2.0 的所有数据,上半部分拟合 X >= 1.9 的所有数据,因此在拟合曲线的数据。代码在重叠区域的中心从一个方程切换到另一个方程,X = 1.95.

import numpy, matplotlib
import matplotlib.pyplot as plt

xData=numpy.array([ 1.00094909,  1.08787635,  1.17481363,  1.2617564,   1.34867881,  1.43562284,
  1.52259341,  1.609522,    1.69631283,  1.78276102,  1.86426648,  1.92896789,
  1.9464453,   1.94941586,  2.00062852,  2.073691,    2.14982808,  2.22808316,
  2.30634034,  2.38456905,  2.46280126,  2.54106611,  2.6193345,   2.69748825])
yData=numpy.array([-0.10057627, -0.10172142, -0.10320428, -0.10378959, -0.10348456, -0.10312503,
 -0.10276956, -0.10170055, -0.09778279, -0.08608644, -0.05797392,  0.00063599,
  0.08732999,  0.16429878,  0.2223306,   0.25368884,  0.26830932,  0.27313931,
  0.27308756,  0.27048902,  0.26626313,  0.26139534,  0.25634544,  0.2509893 ])


# function for x < 1.95 (fitted up to 2.0 for overlap)
def lowerFunc(x_in): # Bleasdale-Nelder Power With Offset
    # coefficients
    a = -1.1431476643503597E+03
    b = 3.3819340844164983E+21
    c = -6.3633178925040745E+01
    d = 3.1481973843740194E+00
    Offset = -1.0300724909782859E-01

    temp = numpy.power(a + b * numpy.power(x_in, c), -1.0 / d)
    temp += Offset
    return temp

# function for x >= 1.95 (fitted down to 1.9 for overlap)
def upperFunc(x_in): # rational equation with Offset
    # coefficients
    a = -2.5294212380048242E-01
    b = 1.4262697377369586E+00
    c = -2.6141935706529118E-01
    d = -8.8730045918252121E-02
    Offset = -4.8283287597672708E-01

    temp = (a * numpy.power(x_in, 2) + b * numpy.log(x_in)) # numerator
    temp /= (1.0 + c * numpy.power(numpy.log(x_in), -1) + d * numpy.exp(x_in)) # denominator
    temp += Offset
    return temp


def combinedFunc(x_in):
    returnVal = []
    for x in x_in:
        if x < 1.95:
            returnVal.append(lowerFunc(x))
        else:
            returnVal.append(upperFunc(x))
    return returnVal


modelPredictions = combinedFunc(xData) 

absError = modelPredictions - yData

SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)


##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData,  'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData))
    yModel = combinedFunc(xModel)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot

graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)