如何设置 GEKKO 以从多个独立数据集进行参数估计?

How to set up GEKKO for parameter estimation from multiple independent sets of data?

我正在学习如何根据实验室间歇反应器数据使用 GEKKO 进行动力学参数估计,这些数据主要由三种物质 A、C 和 P 的浓度分布组成。为了我的问题,我正在使用我之前在与单个数据集的参数估计相关的 中介绍的模型。

我的最终目标是能够使用多个实验 运行 进行参数估计,利用可能在不同温度、物种浓度等下收集的数据。由于单个间歇式反应器的独立性实验中,每个数据集都包含在不同时间点收集的样本。这些不同的时间点(以及将来,例如不同的温度)对我来说很难实现到 GEKKO 模型中,因为我之前使用实验数据收集时间点作为 GEKKO 模型的 m.time 参数。 (请参阅代码 post 的末尾)我过去曾使用 gPROMS 和 Athena Visual Studio.

解决过此类问题

为了说明我的问题,我通过在物种浓度分布中引入噪声并稍微移动实验时间点,从我的原始模型生成了一个包含 'experimental' 数据的人工数据集。然后,我将同一实验物种的所有数据集组合成具有多列的新数组。我这里的思路是,GEKKO会利用数组对应的每一列的实验数据进行参数估计,这样times_comb[:,0]就会和A_comb[:,0]相关,而times_comb[:,1]就会与 A_comb[:,1].

有关

当我尝试 运行 GEKKO 模型时,系统确实获得了参数估计的解决方案,但我不清楚问题解决方案是否合理,因为我注意到 GEKKO 变量 A 、B、C、P是34个元素向量,是每个实验数据集中元素的两倍。我假设 GEKKO 在模型设置期间以某种方式组合了时间和参数向量的两列,从而导致了这 34 个元素变量?我还担心在每个输入参数的列组合过程中,某个时间点与收集的物种信息之间的关系丢失了。

考虑到每个数据集的时间点可能不同,如何改进 GEKKO 可以同时用于参数估计的多个数据集的使用?我查看了 GEKKO 文档示例以及 APMonitor 网站,但我找不到可用于指导的包含多个数据集的示例,因为我对 GEKKO 包还很陌生。

感谢您花时间阅读我的问题以及您可能有的任何 help/ideas。

代码如下:

import numpy as np
import matplotlib.pyplot as plt
from gekko import GEKKO

#Experimental data
times  = np.array([0.0, 0.071875, 0.143750, 0.215625, 0.287500, 0.359375, 0.431250,
                      0.503125, 0.575000, 0.646875, 0.718750, 0.790625, 0.862500,
                      0.934375, 1.006250, 1.078125, 1.150000])
A_obs = np.array([1.0, 0.552208, 0.300598, 0.196879, 0.101175, 0.065684, 0.045096,
                      0.028880, 0.018433, 0.011509, 0.006215, 0.004278, 0.002698,
                      0.001944, 0.001116, 0.000732, 0.000426])
C_obs = np.array([0.0, 0.187768, 0.262406, 0.350412, 0.325110, 0.367181, 0.348264,
                      0.325085, 0.355673, 0.361805, 0.363117, 0.327266, 0.330211,
                      0.385798, 0.358132, 0.380497, 0.383051])
P_obs = np.array([0.0, 0.117684, 0.175074, 0.236679, 0.234442, 0.270303, 0.272637,
                      0.274075, 0.278981, 0.297151, 0.297797, 0.298722, 0.326645,
                      0.303198, 0.277822, 0.284194, 0.301471])

#Generate second set of 'experimental data'
times_new = times + np.random.uniform(0.0,0.01)
P_obs_noisy = P_obs+np.random.normal(0,0.05,P_obs.shape)
A_obs_noisy = A_obs+np.random.normal(0,0.05,A_obs.shape)
C_obs_noisy = A_obs+np.random.normal(0,0.05,C_obs.shape)

#Combine two data sets into multi-column arrays
times_comb = np.array([times, times_new]).T
P_comb = np.array([P_obs, P_obs_noisy]).T
A_comb = np.array([A_obs, A_obs_noisy]).T
C_comb = np.array([C_obs, C_obs_noisy]).T

m = GEKKO(remote=False)

t = m.time = times_comb #using two column time array

Am = m.Param(value=A_comb) #Using the two column data as observed parameter
Cm = m.Param(value=C_comb)
Pm = m.Param(value=P_comb)

A = m.Var(1, lb = 0)
B = m.Var(0, lb = 0)
C = m.Var(0, lb = 0)
P = m.Var(0, lb = 0)

k = m.Array(m.FV,6,value=1,lb=0)  
for ki in k:
    ki.STATUS = 1
k1,k2,k3,k4,k5,k6 = k

r1 = m.Var(0, lb = 0)
r2 = m.Var(0, lb = 0)
r3 = m.Var(0, lb = 0)
r4 = m.Var(0, lb = 0)
r5 = m.Var(0, lb = 0)
r6 = m.Var(0, lb = 0)
   
m.Equation(r1 == k1 * A)
m.Equation(r2 == k2 * A * B)
m.Equation(r3 == k3 * C * B)
m.Equation(r4 == k4 * A)
m.Equation(r5 == k5 * A)
m.Equation(r6 == k6 * A * B)

#mass balance diff eqs, function calls rxn function 
m.Equation(A.dt() == - r1 - r2 - r4 - r5 - r6)
m.Equation(B.dt() ==  r1 - r2 - r3 - r6)
m.Equation(C.dt() ==  r2 - r3 + r4)
m.Equation(P.dt() ==  r3 + r5 + r6)

m.Minimize((A-Am)**2)
m.Minimize((P-Pm)**2)
m.Minimize((C-Cm)**2)

m.options.IMODE = 5
m.options.SOLVER = 3 #IPOPT optimizer
m.options.NODES = 6
m.solve()

k_opt = []
for ki in k:
    k_opt.append(ki.value[0])
print(k_opt)

plt.plot(t,A)
plt.plot(t,C)
plt.plot(t,P)
plt.plot(t,B)
plt.plot(times,A_obs,'bo')
plt.plot(times,C_obs,'gx')
plt.plot(times,P_obs,'rs')
plt.plot(times_new, A_obs_noisy,'b*')
plt.plot(times_new, C_obs_noisy,'g*')
plt.plot(times_new, P_obs_noisy,'r*')
plt.show()

要拥有多个具有不同时间和数据点的数据集,您可以将数据集作为 pandas 数据框加入。这是一个简单的例子:

# data set 1
t_data1 = [0.0,  0.1,  0.2, 0.4, 0.8, 1.00]
x_data1 = [2.0,  1.6,  1.2, 0.7, 0.3, 0.15]

# data set 2
t_data2 = [0.0,  0.15, 0.25, 0.45, 0.85, 0.95]
x_data2 = [3.6,  2.25, 1.75, 1.00, 0.35, 0.20]

合并后的数据有NaN处数据缺失:

       x1    x2
Time           
0.00  2.0  3.60
0.10  1.6   NaN
0.15  NaN  2.25
0.20  1.2   NaN
0.25  NaN  1.75

记下数据丢失的地方,1=已测量,0=未测量。

# indicate which points are measured
z1 = (data['x1']==data['x1']).astype(int) # 0 if NaN
z2 = (data['x2']==data['x2']).astype(int) # 1 if number

最后一步是设置 Gekko 变量、方程和 objective 以容纳数据集。

xm = m.Array(m.Param,2)
zm = m.Array(m.Param,2)
for i in range(2):
    m.Equation(x[i].dt()== -k * x[i])  # differential equations
    m.Minimize(zm[i]*(x[i]-xm[i])**2)  # objectives

你也可以用m.free_initial(x[i])计算初始条件。这给出了 2 个数据集上一个参数值 (k) 的最优解。这种方法可以扩展到多个变量或多个不同时间的数据集。

from gekko import GEKKO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# data set 1
t_data1 = [0.0,  0.1,  0.2, 0.4, 0.8, 1.00]
x_data1 = [2.0,  1.6,  1.2, 0.7, 0.3, 0.15]

# data set 2
t_data2 = [0.0,  0.15, 0.25, 0.45, 0.85, 0.95]
x_data2 = [3.6,  2.25, 1.75, 1.00, 0.35, 0.20]

# combine with dataframe join
data1 = pd.DataFrame({'Time':t_data1,'x1':x_data1})
data2 = pd.DataFrame({'Time':t_data2,'x2':x_data2})
data1.set_index('Time', inplace=True)
data2.set_index('Time', inplace=True)
data = data1.join(data2,how='outer')
print(data.head())

# indicate which points are measured
z1 = (data['x1']==data['x1']).astype(int) # 0 if NaN
z2 = (data['x2']==data['x2']).astype(int) # 1 if number

# replace NaN with any number (0)
data.fillna(0,inplace=True)

m = GEKKO(remote=False)

# measurements
xm = m.Array(m.Param,2)
xm[0].value = data['x1'].values
xm[1].value = data['x2'].values

# index for objective (0=not measured, 1=measured)
zm = m.Array(m.Param,2)
zm[0].value=z1
zm[1].value=z2

m.time = data.index
x = m.Array(m.Var,2)                   # fit to measurement
x[0].value=x_data1[0]; x[1].value=x_data2[0]

k = m.FV(); k.STATUS = 1               # adjustable parameter
for i in range(2):
    m.free_initial(x[i])               # calculate initial condition
    m.Equation(x[i].dt()== -k * x[i])  # differential equations
    m.Minimize(zm[i]*(x[i]-xm[i])**2)  # objectives

m.options.IMODE = 5   # dynamic estimation
m.options.NODES = 2   # collocation nodes
m.solve(disp=True)    # solve
k = k.value[0]
print('k = '+str(k))

# plot solution
plt.plot(m.time,x[0].value,'b.--',label='Predicted 1')
plt.plot(m.time,x[1].value,'r.--',label='Predicted 2')
plt.plot(t_data1,x_data1,'bx',label='Measured 1')
plt.plot(t_data2,x_data2,'rx',label='Measured 2')
plt.legend(); plt.xlabel('Time'); plt.ylabel('Value')
plt.xlabel('Time'); 
plt.show()

包括我更新的代码(没有完全清理以尽量减少变量的数量)并结合我的问题的选定答案以供参考。该模型在两个单独的 'datasets.'

中对 3 个测量物种进行回归
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from gekko import GEKKO

#Experimental data
times  = np.array([0.0, 0.071875, 0.143750, 0.215625, 0.287500, 0.359375, 0.431250,
                      0.503125, 0.575000, 0.646875, 0.718750, 0.790625, 0.862500,
                      0.934375, 1.006250, 1.078125, 1.150000])
A_obs = np.array([1.0, 0.552208, 0.300598, 0.196879, 0.101175, 0.065684, 0.045096,
                      0.028880, 0.018433, 0.011509, 0.006215, 0.004278, 0.002698,
                      0.001944, 0.001116, 0.000732, 0.000426])
C_obs = np.array([0.0, 0.187768, 0.262406, 0.350412, 0.325110, 0.367181, 0.348264,
                      0.325085, 0.355673, 0.361805, 0.363117, 0.327266, 0.330211,
                      0.385798, 0.358132, 0.380497, 0.383051])
P_obs = np.array([0.0, 0.117684, 0.175074, 0.236679, 0.234442, 0.270303, 0.272637,
                      0.274075, 0.278981, 0.297151, 0.297797, 0.298722, 0.326645,
                      0.303198, 0.277822, 0.284194, 0.301471])

#Generate second set of 'experimental data'
times_new = times + np.random.uniform(0.0,0.01)
P_obs_noisy = (P_obs+ np.random.normal(0,0.05,P_obs.shape))
A_obs_noisy = (A_obs+np.random.normal(0,0.05,A_obs.shape))
C_obs_noisy = (C_obs+np.random.normal(0,0.05,C_obs.shape))

#Combine two data sets into multi-column arrays using pandas DataFrames
#Set dataframe index to be combined time discretization of both data sets
exp1 = pd.DataFrame({'Time':times,'A':A_obs,'C':C_obs,'P':P_obs})
exp2 = pd.DataFrame({'Time':times_new,'A':A_obs_noisy,'C':C_obs_noisy,'P':P_obs_noisy})
exp1.set_index('Time',inplace=True)
exp2.set_index('Time',inplace=True)
exps = exp1.join(exp2, how ='outer',lsuffix = '_1',rsuffix = '_2')
#print(exps.head())

#Combine both data sets into a single data frame
meas_data = pd.DataFrame().reindex_like(exps)

#define measurement locations  for each data set, with NaN written for time points 
#not common in both data sets
for cols in exps:
    meas_data[cols] = (exps[cols]==exps[cols]).astype(int) 

exps.fillna(0,inplace = True) #replace NaN with 0

m = GEKKO(remote=False)

t = m.time = exps.index #set GEKKO time domain to use experimental time points

#Generate two-column GEKKO arrays to store observed values of each species, A, C and P
Am = m.Array(m.Param,2)
Cm = m.Array(m.Param,2)
Pm = m.Array(m.Param,2)

Am[0].value = exps['A_1'].values
Am[1].value = exps['A_2'].values
Cm[0].value = exps['C_1'].values
Cm[1].value = exps['C_2'].values
Pm[0].value = exps['P_1'].values
Pm[1].value = exps['P_2'].values

#Define GEKKO variables that determine if time point contatins data to be used in regression
#If time point contains species data, meas_ variable = 1, else = 0
meas_A = m.Array(m.Param,2)
meas_C = m.Array(m.Param,2)
meas_P = m.Array(m.Param,2)

meas_A[0].value = meas_data['A_1'].values
meas_A[1].value = meas_data['A_2'].values
meas_C[0].value = meas_data['C_1'].values
meas_C[1].value = meas_data['C_2'].values
meas_P[0].value = meas_data['P_1'].values
meas_P[1].value = meas_data['P_2'].values

#Define Variables for differential equations A, B, C, P, with initial conditions set by experimental observation at first time point
A = m.Array(m.Var,2, lb = 0)
B = m.Array(m.Var,2, lb = 0)
C = m.Array(m.Var,2, lb = 0)
P = m.Array(m.Var,2, lb = 0)

A[0].value = exps['A_1'][0] ; A[1].value = exps['A_2'][0]
B[0].value = 0 ; B[1].value = 0
C[0].value = exps['C_1'][0] ; C[1].value = exps['C_2'][0]
P[0].value = exps['P_1'][0] ; P[1].value = exps['P_2'][0]


#Define kinetic coefficients, k1-k6 as regression FV's
k = m.Array(m.FV,6,value=1,lb=0,ub = 20)
   
for ki in k:
    ki.STATUS = 1
k1,k2,k3,k4,k5,k6 = k

#If doing paramrter estimation, enable free_initial condition, else not include them in model to reduce DOFs (for simulation, for example)
if k1.STATUS == 1:
    for i in range(2):
        m.free_initial(A[i])
        m.free_initial(B[i])
        m.free_initial(C[i])
        m.free_initial(P[i])

#Define reaction rate variables
r1 = m.Array(m.Var,2, value = 1, lb = 0)
r2 = m.Array(m.Var,2, value = 1, lb = 0)
r3 = m.Array(m.Var,2, value = 1, lb = 0)
r4 = m.Array(m.Var,2, value = 1, lb = 0)
r5 = m.Array(m.Var,2, value = 1, lb = 0)
r6 = m.Array(m.Var,2, value = 1, lb = 0)

#Model Equations
for i in range(2):
#Rate equations
    m.Equation(r1[i] == k1 * A[i])
    m.Equation(r2[i] == k2 * A[i] * B[i])
    m.Equation(r3[i] == k3 * C[i] * B[i])
    m.Equation(r4[i] == k4 * A[i])
    m.Equation(r5[i] == k5 * A[i])
    m.Equation(r6[i] == k6 * A[i] * B[i])
#Differential species balances
    m.Equation(A[i].dt() == - r1[i] - r2[i] - r4[i] - r5[i] - r6[i])
    m.Equation(B[i].dt() ==  r1[i] - r2[i] - r3[i] - r6[i])
    m.Equation(C[i].dt() ==  r2[i] - r3[i] + r4[i])
    m.Equation(P[i].dt() ==  r3[i] + r5[i] + r6[i])
#Minimization objective functions
    m.Obj(meas_A[i]*(A[i]-Am[i])**2)
    m.Obj(meas_P[i]*(P[i]-Pm[i])**2)
    m.Obj(meas_C[i]*(C[i]-Cm[i])**2)
    
#Solver options
m.options.IMODE = 5
m.options.SOLVER = 3 #APOPT optimizer
m.options.NODES = 6
m.solve()

k_opt = []
for ki in k:
    k_opt.append(ki.value[0])
print(k_opt)

plt.plot(t,A[0],'b-')
plt.plot(t,A[1],'b--')
plt.plot(t,C[0],'g-')
plt.plot(t,C[1],'g--')
plt.plot(t,P[0],'r-')
plt.plot(t,P[1],'r--')
plt.plot(times,A_obs,'bo')
plt.plot(times,C_obs,'gx')
plt.plot(times,P_obs,'rs')
plt.plot(times_new, A_obs_noisy,'b*')
plt.plot(times_new, C_obs_noisy,'g*')
plt.plot(times_new, P_obs_noisy,'r*')
plt.show()