添加了 Standardscaler 但在交叉验证和相关矩阵中收到错误
added Standardscaler but receive errors in Cross Validation and the correlation matrix
这是我为应用多元线性回归而构建的代码。我添加了标准缩放器来修复 Y 截距 p 值,该值并不重要,但问题是 CV RMSE 的结果最终发生了变化并且不再有意义,并且在绘制相关矩阵的代码中收到错误消息:AttributeError:'numpy.ndarray' 对象没有属性 'corr'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
from scipy.stats.stats import pearsonr
# Import Excel File
data = pd.read_excel("C:\Users\AchourAh\Desktop\Multiple_Linear_Regression\SP Level Reasons Excels\SP000273701_PL14_IPC_03_09_2018_Reasons.xlsx",'Sheet1') #Import Excel file
# Replace null values of the whole dataset with 0
data1 = data.fillna(0)
print(data1)
# Extraction of the independent and dependent variables
X = data1.iloc[0:len(data1),[1,2,3,4,5,6,7]] #Extract the column of the COPCOR SP we are going to check its impact
Y = data1.iloc[0:len(data1),9] #Extract the column of the PAUS SP
# Data Splitting to train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size =0.25,random_state=1)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# Statistical Analysis of the training set with Statsmodels
X = sm.add_constant(X_train) # add a constant to the model
est = sm.OLS(Y_train, X).fit()
print(est.summary()) # print the results
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm = LinearRegression() # create an lm object of LinearRegression Class
lm.fit(X_train,Y_train) # train our LinearRegression model using the training set of data - dependent and independent variables as parameters. Teaching lm that Y_train values are all corresponding to X_train.
print(lm.intercept_)
print(lm.coef_)
mse_test = mean_squared_error(Y_test, lm.predict(X_test))
print(math.sqrt(mse_test))
# Data Splitting to train and test set of the reduced data
X_1 = data1.iloc[0:len(data1),[1,2]] #Extract the column of the COPCOR SP we are going to check its impact
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_1, Y, test_size =0.25,random_state=1)
X_train2 = ss.fit_transform(X_train2)
X_test2 = ss.transform(X_test2)
# Statistical Analysis of the reduced model with Statsmodels
X_reduced = sm.add_constant(X_train2) # add a constant to the model
est_reduced = sm.OLS(Y_train2, X_reduced).fit()
print(est_reduced.summary()) # print the results
# Fitting a Linear Model for the reduced model with Scikit-Learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm1 = LinearRegression() #create an lm object of LinearRegression Class
lm1.fit(X_train2, Y_train2)
print(lm1.intercept_)
print(lm1.coef_)
mse_test1 = mean_squared_error(Y_test2, lm1.predict(X_test2))
print(math.sqrt(mse_test1))
#Cross Validation and Training again the model
from sklearn.model_selection import KFold
from sklearn import model_selection
kf = KFold(n_splits=6, random_state=1)
for train_index, test_index in kf.split(X_train2):
print("Train:", train_index, "Validation:",test_index)
X_train1, X_test1 = X[train_index], X[test_index]
Y_train1, Y_test1 = Y[train_index], Y[test_index]
results = -1 * model_selection.cross_val_score(lm1, X_train1, Y_train1,scoring='neg_mean_squared_error', cv=kf)
print(np.sqrt(results))
#RMSE values interpretation
print(math.sqrt(mse_test1))
print(math.sqrt(results.mean()))
#Good model built no overfitting or underfitting (Barely Same for test and training :Goal of Cross validation but low prediction accuracy = Value is big
import seaborn
Corr=X_train2.corr(method='pearson')
mask=np.zeros_like(Corr)
mask[np.triu_indices_from(mask)]=True
seaborn.heatmap(Corr,cmap='RdYlGn_r',vmax=1.0,vmin=-1.0,mask=mask, linewidths=2.5)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()
enter code here
你知道如何解决这个问题吗?
我猜问题在于:
Corr=X_train2.corr(method='pearson')
.corr 是一个 pandas 数据框方法,但 X_train2 在那个阶段是一个 numpy 数组。如果将 dataframe/series 传递给 StandardScaler,则会返回一个 numpy 数组。尝试将上面的替换为:
Corr=pd.DataFrame(X_train2).corr(method='pearson')
或使用 numpy.corrcoef 或 numpy.correlate 各自的形式。
这是我为应用多元线性回归而构建的代码。我添加了标准缩放器来修复 Y 截距 p 值,该值并不重要,但问题是 CV RMSE 的结果最终发生了变化并且不再有意义,并且在绘制相关矩阵的代码中收到错误消息:AttributeError:'numpy.ndarray' 对象没有属性 'corr'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
from scipy.stats.stats import pearsonr
# Import Excel File
data = pd.read_excel("C:\Users\AchourAh\Desktop\Multiple_Linear_Regression\SP Level Reasons Excels\SP000273701_PL14_IPC_03_09_2018_Reasons.xlsx",'Sheet1') #Import Excel file
# Replace null values of the whole dataset with 0
data1 = data.fillna(0)
print(data1)
# Extraction of the independent and dependent variables
X = data1.iloc[0:len(data1),[1,2,3,4,5,6,7]] #Extract the column of the COPCOR SP we are going to check its impact
Y = data1.iloc[0:len(data1),9] #Extract the column of the PAUS SP
# Data Splitting to train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size =0.25,random_state=1)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# Statistical Analysis of the training set with Statsmodels
X = sm.add_constant(X_train) # add a constant to the model
est = sm.OLS(Y_train, X).fit()
print(est.summary()) # print the results
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm = LinearRegression() # create an lm object of LinearRegression Class
lm.fit(X_train,Y_train) # train our LinearRegression model using the training set of data - dependent and independent variables as parameters. Teaching lm that Y_train values are all corresponding to X_train.
print(lm.intercept_)
print(lm.coef_)
mse_test = mean_squared_error(Y_test, lm.predict(X_test))
print(math.sqrt(mse_test))
# Data Splitting to train and test set of the reduced data
X_1 = data1.iloc[0:len(data1),[1,2]] #Extract the column of the COPCOR SP we are going to check its impact
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_1, Y, test_size =0.25,random_state=1)
X_train2 = ss.fit_transform(X_train2)
X_test2 = ss.transform(X_test2)
# Statistical Analysis of the reduced model with Statsmodels
X_reduced = sm.add_constant(X_train2) # add a constant to the model
est_reduced = sm.OLS(Y_train2, X_reduced).fit()
print(est_reduced.summary()) # print the results
# Fitting a Linear Model for the reduced model with Scikit-Learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm1 = LinearRegression() #create an lm object of LinearRegression Class
lm1.fit(X_train2, Y_train2)
print(lm1.intercept_)
print(lm1.coef_)
mse_test1 = mean_squared_error(Y_test2, lm1.predict(X_test2))
print(math.sqrt(mse_test1))
#Cross Validation and Training again the model
from sklearn.model_selection import KFold
from sklearn import model_selection
kf = KFold(n_splits=6, random_state=1)
for train_index, test_index in kf.split(X_train2):
print("Train:", train_index, "Validation:",test_index)
X_train1, X_test1 = X[train_index], X[test_index]
Y_train1, Y_test1 = Y[train_index], Y[test_index]
results = -1 * model_selection.cross_val_score(lm1, X_train1, Y_train1,scoring='neg_mean_squared_error', cv=kf)
print(np.sqrt(results))
#RMSE values interpretation
print(math.sqrt(mse_test1))
print(math.sqrt(results.mean()))
#Good model built no overfitting or underfitting (Barely Same for test and training :Goal of Cross validation but low prediction accuracy = Value is big
import seaborn
Corr=X_train2.corr(method='pearson')
mask=np.zeros_like(Corr)
mask[np.triu_indices_from(mask)]=True
seaborn.heatmap(Corr,cmap='RdYlGn_r',vmax=1.0,vmin=-1.0,mask=mask, linewidths=2.5)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()
enter code here
你知道如何解决这个问题吗?
我猜问题在于:
Corr=X_train2.corr(method='pearson')
.corr 是一个 pandas 数据框方法,但 X_train2 在那个阶段是一个 numpy 数组。如果将 dataframe/series 传递给 StandardScaler,则会返回一个 numpy 数组。尝试将上面的替换为:
Corr=pd.DataFrame(X_train2).corr(method='pearson')
或使用 numpy.corrcoef 或 numpy.correlate 各自的形式。