模型的特征数量必须与输入相匹配。模型 n_features 为 16，输入 n_features 为 1

Question

我正在使用 Kaggle 的中风（stroke）数据集。在用 RandomForestClassifier 建模之后，我又使用了 RandomizedSearchCV 进行超参数搜索。我不明白为什么报错信息里显示 n_features 为 16，这让我很困惑。我是数据科学的新手，所以甚至不知道自己哪里做错了。

# Load the stroke dataset from a local CSV and display it.
import pandas as pd
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
print(df)

# Drop every row that contains a missing value (modifies df in place).
df.dropna(inplace=True)

# NOTE: the bare expressions below only show a result in an interactive
# session; when this file is run as a script their values are discarded.
df.isnull().sum()

df.corr()

# One-hot encode the categorical columns; drop_first=True drops the first
# level of each encoded column.
final_dataset=pd.get_dummies(df,drop_first=True)

print(final_dataset)

import seaborn as sns
import matplotlib.pyplot as plt

# Draw an annotated correlation heatmap over all columns of final_dataset.
corrmat= final_dataset.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20)) 
g=sns.heatmap(final_dataset[top_corr_features].corr(),annot=True,cmap="RdYlGn")

final_dataset.columns

# Feature matrix: 16 explicitly listed columns. This is the "Model n_features
# is 16" figure reported in the error message.
X = final_dataset[['age', 'hypertension', 'heart_disease', 'avg_glucose_level',
       'bmi','gender_Male', 'gender_Other', 'ever_married_Yes',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes']]

# Target: the double brackets select a one-column DataFrame rather than a
# Series, which is why it is flattened with .values.ravel() before fitting.
y = final_dataset[['stroke']]

# Hold out 20% of the rows for testing. No random_state is given, so the
# split differs between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
y_train.shape
y_test.shape

# Base estimator handed to the randomized search below.
from sklearn.ensemble import  RandomForestClassifier
rf = RandomForestClassifier()

"""Hyperparameters"""

import numpy as np
# Candidate values: 100, 200, ..., 1200 trees.
n_estimators = [int(x) for x in np.linspace(100,1200,12)]
max_features = ["auto", "sqrt"]
# Candidate values: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5,30,6)]
min_samples_split = [2,5,10,15,100]
min_samples_leaf = [1,2,5,10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

# Randomized search: 10 sampled parameter settings, each evaluated with
# 5-fold cross-validation.
# NOTE(review): 'neg_mean_squared_error' is a regression metric; confirm it is
# really the intended score for a classifier.
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

rf_random.fit(X_train,y_train.values.ravel())

predictions = rf_random.predict(X_test)



# NOTE(review): this is the line that raises the ValueError in the traceback.
# score() takes (X, y) -- the traceback shows it calling self.predict(X) on
# its first argument -- but here the one-column y_test is passed as X, hence
# "input n_features is 1". Also, the object fitted above is rf_random, not rf.
print(rf.score(y_test,predictions))

我遇到的错误

Traceback (most recent call last):
  File "d:/object_detection/untitled3.py", line 82, in <module>
    print(rf.score(y_test,predictions))
  File "C:\Users\Amit\anaconda3\lib\site-packages\sklearn\base.py", line 499, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "C:\Users\Amit\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 629, in predict
    proba = self.predict_proba(X)
  File "C:\Users\Amit\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 673, in predict_proba
    X = self._validate_X_predict(X)
  File "C:\Users\Amit\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 421, in _validate_X_predict
    return self.estimators_[0]._validate_X_predict(X, check_input=True)
  File "C:\Users\Amit\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 396, in _validate_X_predict
    raise ValueError("Number of features of the model must "
ValueError: Number of features of the model must match the input. Model n_features is 16 and input n_features is 1

Answer 1

    # Corrected end-to-end script: load the stroke data, one-hot encode it,
    # tune a RandomForestClassifier with RandomizedSearchCV, then score a
    # model on the held-out test set.
    #
    # The original error came from calling score(y_test, predictions):
    # score() expects (X, y), i.e. the feature matrix first and the true
    # labels second, and it must be called on a fitted estimator.
    import pandas as pd
    df = pd.read_csv("healthcare-dataset-stroke-data.csv")
    print(df)

    # Drop every row that contains a missing value (modifies df in place).
    df.dropna(inplace=True)

    # Results are printed explicitly: a bare expression shows nothing when
    # the file is run as a script.
    print(df.isnull().sum())

    # Restrict to numeric columns first; recent pandas versions raise when
    # corr() meets the string columns still present in df at this point.
    print(df.select_dtypes(include="number").corr())

    # One-hot encode the categorical columns; drop_first=True drops the
    # first level of each encoded column.
    final_dataset = pd.get_dummies(df, drop_first=True)

    print(final_dataset)

    import seaborn as sns
    import matplotlib.pyplot as plt

    # Annotated correlation heatmap over all columns of final_dataset.
    corrmat = final_dataset.corr()
    top_corr_features = corrmat.index
    plt.figure(figsize=(20, 20))
    g = sns.heatmap(final_dataset[top_corr_features].corr(), annot=True, cmap="RdYlGn")

    print(final_dataset.columns)

    # Features are every column except the target; the target is selected
    # as a 1-D Series (single brackets), so no ravel() is needed later.
    X = final_dataset.drop("stroke", axis=1)
    y = final_dataset['stroke']

    # Hold out 20% of the rows for testing.
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    print(y_train.shape)
    print(y_test.shape)

    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier()

    # Hyperparameter candidates
    import numpy as np
    # 100, 200, ..., 1200 trees.
    n_estimators = [int(x) for x in np.linspace(100, 1200, 12)]
    # "auto" was an alias of "sqrt" for classifiers and has been removed
    # from scikit-learn, so it is not offered as a candidate here.
    max_features = ["sqrt", "log2"]
    # 5, 10, ..., 30.
    max_depth = [int(x) for x in np.linspace(5, 30, 6)]
    min_samples_split = [2, 5, 10, 15, 100]
    min_samples_leaf = [1, 2, 5, 10]

    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf}

    print(random_grid)

    # Randomized search: 10 sampled settings, 5-fold cross-validation each.
    # Accuracy is used for scoring because this is a classification task
    # (a mean-squared-error score is meant for regression).
    from sklearn.model_selection import RandomizedSearchCV
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                   scoring='accuracy', n_iter=10, cv=5,
                                   verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)

    # Inspect the best parameters found; they can seed a narrower
    # GridSearchCV afterwards.
    print(rf_random.best_params_)

    # Build a new model with the chosen parameters. The values below come
    # from one particular search run; rf_random.best_estimator_ gives the
    # already-refitted best model without copying values by hand.
    rf2 = RandomForestClassifier(n_estimators=900, min_samples_split=5,
                                 min_samples_leaf=5, max_features="sqrt",
                                 max_depth=10)
    rf2.fit(X_train, y_train)

    # score() takes the features first and the true labels second.
    print(rf2.score(X_test, y_test))

输出

0.9663951120162932

模型的特征数量必须与输入相匹配。模型 n_features 为 16，输入 n_features 为 1

Number of features of the model must match the input. Model n_features is 16 and input n_features is 1

python

python-3.x

dataframe

scikit-learn

random-forest