
How to improve this model with hyperparameter tuning

I am building a machine learning model for the Titanic dataset using the KNN algorithm. I have split the data into training, validation, and test sets (70%, 15%, and 15% respectively). I used GridSearchCV and RandomizedSearchCV to find the best hyperparameters, but the accuracy on the test set is still 56%. Is there any way to improve the accuracy of a model that uses KNN?
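For reference, a minimal sketch of the 70/15/15 split described above, using sklearn's train_test_split instead of the row-slicing helper in the notebook below (the Titanic_full.csv filename and the Survived target column are taken from that code; the random_state is arbitrary):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('Titanic_full.csv')  # path assumed from the notebook below
y = df['Survived']
X = df.drop(columns=['Survived'])

# Carve off 70% for training, then split the remaining 30% in half,
# giving 15% validation and 15% test; stratify to preserve class balance.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rest, y_rest, test_size=0.50, random_state=42, stratify=y_rest)
print(X_train.shape, X_valid.shape, X_test.shape)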




#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
from pandas.api.types import is_string_dtype,is_numeric_dtype
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,matthews_corrcoef,f1_score
from matplotlib import pyplot as plt
import seaborn as sns
import joblib

get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:


get_ipython().run_line_magic('pwd', '')


# In[3]:


my_path = '/Users/kiran/DataScience'



# In[5]:


my_df = pd.read_csv(f'{my_path}/Titanic_full.csv')


# In[6]:


print(my_df.shape)


# In[7]:


my_df.head(3)



# In[9]:


#Split the data with a helper function

def mydf_splitter(my_df,num_rows):
    return my_df[:num_rows].copy(),my_df[num_rows:]

mydf_train_valid,mydf_test = mydf_splitter(my_df,1100)

print(mydf_train_valid.shape,mydf_test.shape)


# In[10]:


print(len(mydf_train_valid))


# In[11]:


print(mydf_train_valid.isnull().sum())


# In[12]:


#Plot the fraction of missing values per feature
null_vals = mydf_train_valid.isnull().sum()/len(mydf_train_valid)
null_vals = pd.DataFrame(null_vals)
null_vals.reset_index(inplace=True)
null_vals.columns = ["Feature","Fraction missing"]
plt.figure(figsize=(8,6))
plt.xticks(rotation = 45)
sns.barplot(x="Feature",y="Fraction missing",data=null_vals)


# In[13]:


mydf_train_valid_2 = mydf_train_valid.drop("Cabin",axis = 1)
print(mydf_train_valid_2.shape)
mydf_train_valid_2.head(2)


# In[14]:


mydf_train_valid_2.info()


# In[15]:


def str_to_cat(my_df):
    for p,q in my_df.items():
        if is_string_dtype(q):
            my_df[p] = q.astype('category').cat.as_ordered()
    return my_df


# In[16]:


mydf_train_valid_3 = str_to_cat(mydf_train_valid_2)
mydf_train_valid_3.info()


# In[17]:


#Check category mapping of Embarked and Sex columns

print(mydf_train_valid_3.Embarked.cat.categories)
print(mydf_train_valid_3.Sex.cat.categories)


# In[18]:


#Helper functions: encode categorical columns as numeric codes and impute
#missing numeric values, recording the fill values in null_table so the
#same fills can be reapplied to the test set
def mydf_to_nums(my_df, feature, null_status):
    if not is_numeric_dtype(feature):
        my_df[null_status] = feature.cat.codes + 1

def mydf_imputer(my_df, feature, null_status, null_table):
    if is_numeric_dtype(feature):
        if pd.isnull(feature).sum() or (null_status in null_table):
            my_df[null_status+'_na'] = pd.isnull(feature)
            filler = null_table[null_status] if null_status in null_table else feature.median()
            my_df[null_status] = feature.fillna(filler)
            null_table[null_status] = filler
    return null_table

def mydf_preprocessor(my_df, null_table):
    if null_table is None:
        null_table = dict()
    for p, q in my_df.items():
        null_table = mydf_imputer(my_df, q, p, null_table)
    for p, q in my_df.items():
        mydf_to_nums(my_df, q, p)
    my_df = pd.get_dummies(my_df, dummy_na=True)
    return [my_df, null_table]


# In[19]:


mydf_train_valid_4,my_table = mydf_preprocessor(mydf_train_valid_3,null_table = None)


# In[20]:


mydf_train_valid_4.head(3)


# In[21]:


my_table


# In[22]:


Y = mydf_train_valid_4["Survived"]
X = mydf_train_valid_4.drop(["Survived"],axis = 1)

print(X.shape,Y.shape)


# In[23]:


#Scale only the continuous features, so first separate the continuous
#and categorical columns
X_cat = X[['PassengerId', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Embarked', 'Age_na', 'Fare_na']]
X_con = X.drop(X_cat.columns,axis = 1)
print(X_cat.shape,X_con.shape)


# In[24]:


#Scale the data
scaler = preprocessing.StandardScaler().fit(X_con)
X_con_sc = pd.DataFrame(scaler.transform(X_con))
X_con_sc.columns = ["Age","Fare"]
print(X_con_sc.shape)
X_con_sc.head(2)


# In[25]:


#Concatenate the scaled continuous and categorical columns
#(the fitted scaler is kept so the test data can be transformed the same way)
df_list = [X_cat,X_con_sc]
X_full = pd.concat(df_list,axis = 1)
print(X_full.shape)
X_full.head(2)


# In[26]:


X_train,X_valid = mydf_splitter(X_full,900)
Y_train,Y_valid = mydf_splitter(Y,900)

print(X_train.shape,X_valid.shape,Y_train.shape,Y_valid.shape)


# In[27]:


#Build the classifier model
my_knn_model = KNeighborsClassifier(n_neighbors=5,weights='uniform')
my_knn_model.fit(X_train,Y_train)

#Predict on the validation set
Y_pred = my_knn_model.predict(X_valid)


# In[68]:


#Plot confusion matrix

my_knn_cmatrix = confusion_matrix(Y_valid,Y_pred)

my_knn_df = pd.DataFrame(my_knn_cmatrix)
plt.figure(figsize=(8,8))
sns.heatmap(my_knn_df,xticklabels=["Unlucky","Survived"],
           yticklabels=["Unlucky","Survived"],annot=True)


# In[69]:


print(accuracy_score(Y_valid,Y_pred),
      matthews_corrcoef(Y_valid,Y_pred),f1_score(Y_valid,Y_pred))


# In[70]:


import imblearn
print(imblearn.__version__)
from imblearn.over_sampling import SMOTE
from collections import Counter


# In[71]:


oversample = SMOTE()
X_train, Y_train = oversample.fit_resample(X_train, Y_train)
counter = Counter(Y_train)
print(counter)


# In[78]:


#Build the classifier model
my_knn_model = KNeighborsClassifier(n_neighbors=5,weights='uniform')
my_knn_model.fit(X_train,Y_train)

#Predict on the validation set
Y_pred = my_knn_model.predict(X_valid)


# In[79]:


#Plot confusion matrix

my_knn_cmatrix = confusion_matrix(Y_valid,Y_pred)

my_knn_df = pd.DataFrame(my_knn_cmatrix)
plt.figure(figsize=(8,8))
sns.heatmap(my_knn_df,xticklabels=["Unlucky","Survived"],
           yticklabels=["Unlucky","Survived"],annot=True)


# In[82]:




print(accuracy_score(Y_valid,Y_pred),
      matthews_corrcoef(Y_valid,Y_pred),f1_score(Y_valid,Y_pred))



# # Got an accuracy score of 54%

# # Hyperparameter Tuning

# # Using RandomizedSearchCV

# In[84]:


from sklearn.model_selection import RandomizedSearchCV
import numpy as np

#Search space for the KNN hyperparameters
k_range = np.arange(1, 31)
weights = ["uniform", "distance"]
p = [1, 2]
metric = ['minkowski', 'euclidean', 'manhattan']
leaf_size = list(range(1, 50))
param_dist = dict(n_neighbors=k_range, weights=weights, p=p, metric=metric, leaf_size=leaf_size)
knn = KNeighborsClassifier()
randomized = RandomizedSearchCV(knn, param_dist, scoring="accuracy", cv=10, n_iter=10)


# In[85]:


randomized.fit(X_train,Y_train)


# In[86]:


randomized.best_estimator_


# In[87]:


randomized.cv_results_


# In[88]:


randomized.best_params_


# In[89]:


#Best cross-validated accuracy found by RandomizedSearchCV
randomized.best_score_


# #  Building the model with tuned parameters

# In[90]:


#Build the classifier with the parameters found by RandomizedSearchCV
my_knn_model = KNeighborsClassifier(algorithm='auto', leaf_size=47, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=1,
                     weights='uniform')
my_knn_model.fit(X_train,Y_train)

#Predict on the validation set
Y_pred = my_knn_model.predict(X_valid)


# In[91]:


#Plot confusion matrix

my_knn_cmatrix = confusion_matrix(Y_valid,Y_pred)

my_knn_df = pd.DataFrame(my_knn_cmatrix)
plt.figure(figsize=(8,8))
sns.heatmap(my_knn_df,xticklabels=["Unlucky","Survived"],
           yticklabels=["Unlucky","Survived"],annot=True)


# In[93]:


print(accuracy_score(Y_valid,Y_pred),
      matthews_corrcoef(Y_valid,Y_pred),f1_score(Y_valid,Y_pred))



# In[94]:



num_neighs = list()
accuracy_list = list()


#Sweep K from 1 to 29, keeping the other tuned parameters fixed
for neighbor in range(1,30):
    my_knn_model = KNeighborsClassifier(n_neighbors=neighbor,weights='uniform',p=1,metric='manhattan',leaf_size = 47)
    my_knn_model.fit(X_train,Y_train)
    Y_pred = my_knn_model.predict(X_valid)
    accuracy = accuracy_score(Y_valid,Y_pred)
    num_neighs.append(neighbor)
    accuracy_list.append(accuracy)


# In[95]:


eval_df = pd.DataFrame({"Num of neighbors":num_neighs,"Valid accuracy Score":accuracy_list})
eval_df


# In[96]:


sns.set_style("whitegrid")
sns.pairplot(eval_df,x_vars="Num of neighbors",
            y_vars = "Valid accuracy Score",plot_kws={'s':60},height=4.0)


# In[97]:


#Retrain the final model on the full 1100 training+validation rows with K=3

knn_model_fin = KNeighborsClassifier(n_neighbors=3,weights='uniform',p=1,metric='manhattan',leaf_size = 47)
knn_model_fin.fit(X_full,Y)

#!mkdir knn_model
knn_model_name1 = f'{my_path}/knn_model/knn_model_final1.sav'
joblib.dump(knn_model_fin,knn_model_name1)


# In[98]:


knn_model_loaded = joblib.load(knn_model_name1)


# In[99]:


#Evaluating test set accuracy with the trained model
mydf_test1 = mydf_test.drop("Cabin",axis = 1)
print(mydf_test1.shape)
mydf_test1.head(3)


# In[100]:


mydf_test2 = str_to_cat(mydf_test1)
mydf_test2.Sex.cat.categories


# In[101]:


#Check for the Embarked column
mydf_test2.Embarked.cat.categories


# In[102]:


mydf_test3,my_table1 = mydf_preprocessor(mydf_test2,
                                        null_table = my_table)
print(mydf_test3.shape)
mydf_test3.head(3)


# In[103]:


my_table1


# In[104]:


Y_t = mydf_test3["Survived"]
X_t = mydf_test3.drop(["Survived"],axis = 1)

print(X_t.shape,Y_t.shape)


# In[105]:


X_cat_t = X_t[['PassengerId', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Embarked', 'Age_na', 'Fare_na']]
X_con_t = X_t.drop(X_cat_t.columns,axis = 1)
print(X_cat_t.shape,X_con_t.shape)


# In[106]:


X_con_sct = pd.DataFrame(scaler.transform(X_con_t))
X_con_sct.columns = ["Age","Fare"]
print(X_con_sct.shape)
X_con_sct.head(2)


# In[107]:


print(X_cat_t.shape,X_con_sct.shape)


# In[108]:


X_cat_t.head()


# In[109]:


#Reset the index before merging so the rows align with the scaled frame
X_cat_t.reset_index(inplace = True,drop = True)


# In[110]:


X_cat_t.head(2)


# In[111]:


#Merge the two sets of columns
df_list_I = [X_cat_t,X_con_sct]
X_test_I = pd.concat(df_list_I,axis = 1)
print(X_test_I.shape)
X_test_I.head(3)


# In[112]:


#Now we are ready to test it out. Let's load the saved model first.
kNN_loaded = joblib.load(f'{my_path}/knn_model/knn_model_final1.sav')


# In[113]:


#Testing...
Y_test_pred = kNN_loaded.predict(X_test_I)


# In[114]:


print(accuracy_score(Y_t,Y_test_pred),
      matthews_corrcoef(Y_t,Y_test_pred),f1_score(Y_t,Y_test_pred))



This reads more like a data science question, so you might consider moving it to Data Science Stack Exchange.

That said, I would encourage you to do more feature engineering, or to select a subset of features that may matter more than the others, since it looks like you are weighting everything equally. Hypothetically, if one or more variables have no correlation with the target, then using KNN with those variables weighted as heavily as the rest can lead to a suboptimal model. Since you are already using sklearn, SelectKBest may be a useful starting point; see the sketch below.
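A minimal sketch of that starting point, assuming the preprocessed X_train / Y_train / X_valid frames built in your notebook (k=5 is an arbitrary choice that you would tune):

from sklearn.feature_selection import SelectKBest, f_classif

# Score each feature against the target with the ANOVA F-test and keep
# only the k highest-scoring features; fit on the training data only.
selector = SelectKBest(score_func=f_classif, k=5)
X_train_sel = selector.fit_transform(X_train, Y_train)
X_valid_sel = selector.transform(X_valid)  # reuse the selector fitted on train

# Inspect which features were kept
print(X_train.columns[selector.get_support()])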

56% accuracy is quite low, so I would not bother tuning hyperparameters until the model performs better (though since you have already implemented it, feel free to leave it in, as it does no harm). In practice, hyperparameter tuning usually gives at most a small boost to a model's performance.
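If you do keep the tuning, one pattern worth trying is to wrap scaling, feature selection, and KNN in a single Pipeline, so that the scaler and selector are re-fit inside every cross-validation fold and nothing leaks between folds. A sketch under the same assumptions about X_train / Y_train; the parameter ranges are illustrative, not recommendations:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Each step is re-fit per CV fold; GridSearchCV tunes the selector's k
# together with the KNN hyperparameters.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('select', SelectKBest(score_func=f_classif)),
    ('knn', KNeighborsClassifier()),
])
param_grid = {
    'select__k': [3, 5, 8],
    'knn__n_neighbors': list(range(1, 31, 2)),
    'knn__weights': ['uniform', 'distance'],
}
search = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=10)
search.fit(X_train, Y_train)
print(search.best_params_, search.best_score_)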