比较 ML 中不同分类模型的准确性

Question

我正在做一个项目来测试我在分类模型方面的技能，但似乎我犯了一个错误，我不确定如何解决它。以下是我的代码和错误：

要求：实现不同的算法（如决策树、逻辑回归和支持向量机），看看哪种算法的准确率更高。比较各算法的结果，并理解模型的行为。

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import metrics

def main():
    """Run the whole comparison: load, split, score each classifier, report."""
    features, labels = loadData()
    split = trainTest(features, labels)

    # Score the models in the order the report function expects them.
    logistic_score = logisticRegressionModel(*split)
    tree_score = decisionTreeModel(*split)
    svm_score = svmModel(*split)

    printComparisonResults(logistic_score, tree_score, svm_score)

def loadData():
    """Read the credit-card CSV and return the features and the labels.

    Returns:
        tuple: ``(x, y)`` where ``x`` is a DataFrame with the columns
        ``Time``, ``V1`` .. ``V28`` and ``Amount``, and ``y`` is the
        ``Class`` column as a Series.
    """
    # Raw string: '\T', '\R' and '\c' are not valid escape sequences, so the
    # plain literal only kept its backslashes by accident and triggers a
    # warning on current Python versions.
    df = pd.read_csv(r'D:\Tutorials\Resources\creditcard.csv')

    # Generate the V-columns instead of typing them out: the hand-written
    # list skipped 'V4' and repeated 'V5' in the slot where 'V15' belongs.
    feature_columns = ['Time'] + [f'V{i}' for i in range(1, 29)] + ['Amount']
    x = df[feature_columns]
    y = df['Class']

    return x, y

def trainTest(x,y):
    """Split features and labels into train/test parts.

    A quarter of the rows is held out for testing and the shuffle is seeded
    with 0. Returns ``(x_train, x_test, y_train, y_test)``.
    """
    split_options = {'test_size': 0.25, 'random_state': 0}
    train_x, test_x, train_y, test_y = train_test_split(x, y, **split_options)
    return train_x, test_x, train_y, test_y

# LogisticRegression model
def logisticRegressionModel(x_train, x_test, y_train, y_test):
    """Fit a default LogisticRegression and return its accuracy on the test split."""
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    # Accuracy of the predictions against the held-out labels.
    return metrics.accuracy_score(y_test, predictions)

def decisionTreeModel(x_train, x_test, y_train, y_test):
    """Fit a default DecisionTreeClassifier and return its accuracy on the test split."""
    fitted_tree = DecisionTreeClassifier().fit(x_train, y_train)
    predictions = fitted_tree.predict(x_test)

    # Accuracy of the predictions against the held-out labels.
    score = metrics.accuracy_score(y_test, predictions)
    return score

def svmModel(x_train, x_test, y_train, y_test):
    """Fit a linear-kernel SVC on the training split and score it.

    Returns the value of ``metrics.accuracy_score(y_test, y_pred)``.
    """
    svm_model = svm.SVC(kernel='linear')
    svm_model = svm_model.fit(x_train, y_train)
    # NOTE(review): predict() is handed y_test (the labels) here, whereas the
    # other two model helpers pass x_test. This looks like the call behind the
    # "Expected 2D array, got 1D array" ValueError quoted in the question;
    # the snippet is left as posted so it still reproduces that error.
    y_pred = svm_model.predict(y_test)

    #evaluate model
    acc = metrics.accuracy_score(y_test, y_pred)

    return acc

def printComparisonResults(logisticAccuracy,dectreeAccuracy,svmAccuracy):
    """Print which of the three models reached the highest accuracy.

    Args:
        logisticAccuracy: accuracy of the logistic-regression model.
        dectreeAccuracy: accuracy of the decision-tree model.
        svmAccuracy: accuracy of the SVM model.

    A model is reported as the winner only when its accuracy is strictly
    greater than both of the others; otherwise a tie message is printed.
    """
    print("Printing Results")

    # Label texts are kept exactly as they were so existing output is stable.
    lr = "Logistic Regression"
    dt = "Decision Tree"
    sv = "SVM is Accurate"

    # `a > (b and c)` evaluates `b and c` first, which yields just one of the
    # two operands, so only one rival was ever compared. Compare against the
    # better of the two rivals instead.
    if logisticAccuracy > max(dectreeAccuracy, svmAccuracy):
        print(lr, "is accurate than", dt, "and", sv)
    elif dectreeAccuracy > max(logisticAccuracy, svmAccuracy):
        print(dt, "is accurate than", lr, "and", sv)
    elif svmAccuracy > max(logisticAccuracy, dectreeAccuracy):
        print(sv, "is accurate than", lr, "and", dt)
    else:
        # No strict winner: previously this case was reported as an SVM win.
        print("No single model is the most accurate (tie for the best score)")

    print("Done")

# Run the comparison only when the file is executed as a script. The guard
# needs the dunder names; a bare `name` is undefined and raises NameError.
if __name__ == '__main__':
    main()

我遇到的错误：

ValueError: Expected 2D array, got 1D array instead:
array=[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

我尝试过把数组重塑为 x = np.array(df[[...]]).reshape((1,-1))，但仍然出现错误：

raise ValueError("Found input variables with inconsistent numbers of"

ValueError: Found input variables with inconsistent numbers of samples: [1, 2470]

Answer 1

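欢迎来到 SO！您需要考虑几项更改：最关键的一处是 svmModel 中应调用 svm_model.predict(x_test)，而不是 predict(y_test)——predict 需要二维的特征矩阵，而 y_test 是一维的标签数组，这正是报错的原因。此外，入口判断应写成 if __name__ == '__main__':，拆分数据时可以加上 stratify=y。完整代码如下：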

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import metrics
import sys

def main():
    """Load the data, split it, score the three classifiers and print the comparison."""
    features, labels = loadData()
    train_x, test_x, train_y, test_y = trainTest(features, labels)

    # Every model helper receives the same split and returns one accuracy.
    logistic_score = logisticRegressionModel(train_x, test_x, train_y, test_y)
    tree_score = decisionTreeModel(train_x, test_x, train_y, test_y)
    svm_score = svmModel(train_x, test_x, train_y, test_y)

    printComparisonResults(logistic_score, tree_score, svm_score)

def loadData():
    """Read ``creditcard.csv`` and return the features and labels as arrays.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the 2-D array of the ``Time``,
        ``V1`` .. ``V28`` and ``Amount`` columns, and ``y`` is the ``Class``
        column reshaped to a single-column 2-D array.
    """
    # Raw string: '\c' is not a valid escape sequence, so the plain literal
    # only kept its backslash by accident and warns on current Pythons.
    df = pd.read_csv(r'.\creditcard.csv')

    # Generate the V-columns instead of typing them out: the hand-written
    # list skipped 'V4' and repeated 'V5' in the slot where 'V15' belongs.
    feature_columns = ['Time'] + [f'V{i}' for i in range(1, 29)] + ['Amount']
    x = df[feature_columns].values

    # NOTE(review): the labels stay an (n, 1) column so callers receive the
    # same shape as before; scikit-learn estimators document y as 1-D, so
    # confirm whether a flat array is what is actually wanted here.
    y = df['Class'].values.reshape(-1, 1)
    return x, y

def trainTest(x,y):
    """Split features and labels into train/test parts, stratified on the labels.

    A quarter of the rows is held out for testing and the shuffle is seeded
    with 0. Returns ``(x_train, x_test, y_train, y_test)``.
    """
    parts = train_test_split(
        x,
        y,
        test_size=0.25,
        random_state=0,
        stratify=y,
    )
    # Hand the four pieces back as a tuple.
    return tuple(parts)

# LogisticRegression model
def logisticRegressionModel(x_train, x_test, y_train, y_test):
    """Fit a default LogisticRegression and return its accuracy on the test split."""
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    # Accuracy of the predictions against the held-out labels.
    return metrics.accuracy_score(y_test, predictions)
def decisionTreeModel(x_train, x_test, y_train, y_test):
    """Fit a default DecisionTreeClassifier and return its accuracy on the test split."""
    fitted_tree = DecisionTreeClassifier().fit(x_train, y_train)
    predictions = fitted_tree.predict(x_test)

    # Accuracy of the predictions against the held-out labels.
    score = metrics.accuracy_score(y_test, predictions)
    return score
def svmModel(x_train, x_test, y_train, y_test):
    """Fit a linear-kernel SVC and return its accuracy on the test split."""
    classifier = svm.SVC(kernel='linear')
    classifier.fit(x_train, y_train)

    # Predict from the test features, then compare with the test labels.
    predictions = classifier.predict(x_test)
    return metrics.accuracy_score(y_test, predictions)
def printComparisonResults(logisticAccuracy,dectreeAccuracy,svmAccuracy):
    """Print which of the three models reached the highest accuracy.

    Args:
        logisticAccuracy: accuracy of the logistic-regression model.
        dectreeAccuracy: accuracy of the decision-tree model.
        svmAccuracy: accuracy of the SVM model.

    A model is reported as the winner only when its accuracy is strictly
    greater than both of the others; otherwise a tie message is printed.
    """
    print("Printing Results")

    # Label texts are kept exactly as they were so existing output is stable.
    lr = "Logistic Regression"
    dt = "Decision Tree"
    sv = "SVM is Accurate"

    # `a > (b and c)` evaluates `b and c` first, which yields just one of the
    # two operands, so only one rival was ever compared. Compare against the
    # better of the two rivals instead.
    best_rival_of_lr = max(dectreeAccuracy, svmAccuracy)
    best_rival_of_dt = max(logisticAccuracy, svmAccuracy)
    best_rival_of_sv = max(logisticAccuracy, dectreeAccuracy)

    if logisticAccuracy > best_rival_of_lr:
        print(lr, "is accurate than", dt, "and", sv)
    elif dectreeAccuracy > best_rival_of_dt:
        print(dt, "is accurate than", lr, "and", sv)
    elif svmAccuracy > best_rival_of_sv:
        print(sv, "is accurate than", lr, "and", dt)
    else:
        # No strict winner: previously this case was reported as an SVM win.
        print("No single model is the most accurate (tie for the best score)")

    print("Done")
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()

输出：

Printing Results
Logistic Regression is accurate than Decision Tree and SVM is Accurate
Done

这样您的实现就可以正常运行了 :)。我使用的是原帖 “here” 链接所指向的数据集。

比较 ML 中不同分类模型的准确性

Comparing Accuracy of different classification models in ML

python

svm

pandas

logistic-regression