Accuracy of Decision Tree Classifier
I have a decision tree classifier that predicts the value of the last column of my dataset, which is either 'made' or 'missed'. I have run the program several times, and the accuracy is always 100%. I expected it to be around 95-100%. Any idea why? Here is a snippet of the dataset (the full dataset has over 74,000 rows):
A,L,-5,8,3,475,11.8,1,1.6,6.1,2,2.7,made
A,L,-39,10,2,30,18.5,6,5.4,24.3,3,3.1,missed
A,L,-20,8,3,327,6.2,0,1.8,2.3,2,0,missed
A,W,16,5,1,504,11.7,0,1,18,2,7.3,missed
A,L,-5,3,2,547,19.9,0,1.2,23.9,3,7.5,made
H,W,14,4,2,600,17.6,0,0.5,5.5,2,3.8,made
H,L,-8,6,3,692,23,1,1.9,4.4,2,4.1,made
H,L,-10,11,3,171,14.4,0,0.9,25.2,3,5.8,missed
Here is the classifier code:
from math import log
import operator

import pandas as pd  # needed for read_csv below


def load_csv(filename):
    headers = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock",
               "dribbles", "touch_time", "shot_dist", "pts_type", "close_def_dist", "target"]
    df = pd.read_csv(filename, header=None, names=headers, na_values="?")
    # flatten the frame, then re-chunk it into rows of 13 values each
    obj_df = list(df.values.flatten())
    i = 0
    new_list = []
    while i < len(obj_df):
        new_list.append(obj_df[i:i + 13])
        i += 13
    labels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock",
              "dribbles", "touch_time", "shot_dist", "pts_type", "close_def_dist"]
    return new_list, labels
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the unique class labels and their occurrences
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column holds the class labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all the features
        featList = [example[i] for example in dataSet]  # all values of this feature
        uniqueVals = set(featList)  # the set of unique values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain, i.e. reduction in entropy
        if infoGain > bestInfoGain:  # keep the best gain seen so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # returns the index of the best feature
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1:  # stop splitting when there are no more features in dataSet
        return majorityCnt(classList)
    # split on the feature with the highest information gain
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    # build the tree recursively
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy the labels so recursive calls don't clobber them
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree)[0]  # the feature this node splits on
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):  # internal node: keep descending
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:  # leaf node: the stored class label
        classLabel = valueOfFeat
    return classLabel
def storeTree(inputTree, filename):
    import pickle
    with open(filename, 'wb') as fw:  # pickle requires a binary-mode file
        pickle.dump(inputTree, fw)

def grabTree(filename):
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
# collect data
myDat, labels = load_csv('data/basketball.train.csv')

# build a tree
mytree = createTree(myDat, labels)

# run the test -- note that this classifies the same rows the tree was built from
feature_labels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock",
                  "dribbles", "touch_time", "shot_dist", "pts_type", "close_def_dist"]
predictions = []
for row in myDat:
    prediction = classify(mytree, feature_labels, row[:12])
    predictions.append(prediction)
actual = [row[-1] for row in myDat]
accuracy = accuracy_metric(actual, predictions)
print(accuracy)
It looks like you are not splitting your dataset into separate training and test sets. The result is that your classifier overfits the dataset: the tree keeps splitting until each node is pure (and since the continuous features are matched on exact values, it can almost always get there), so it essentially memorizes the training rows. Evaluating on those same rows then reports 100% by construction, and the tree may not handle samples from outside the dataset nearly as well.
Try randomly selecting (say) 75% of the data for training, then test the accuracy on the remaining 25%. For example, replace the last part of your code with:
import random
dataset, labels = load_csv('data/basketball.train.csv')
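random.seed(0)  # optional addition: fix the seed so the shuffle (and the split) is reproducible across runs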
random.shuffle(dataset)

# hold out the last 25% of the shuffled rows for testing
split_index = int(len(dataset) * 0.75)
train_dataset = dataset[:split_index]
test_dataset = dataset[split_index:]

mytree = createTree(train_dataset, labels)

feature_labels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock",
                  "dribbles", "touch_time", "shot_dist", "pts_type", "close_def_dist"]
predictions = []
for row in test_dataset:
    prediction = classify(mytree, feature_labels, row[:12])
    predictions.append(prediction)
actual = [row[-1] for row in test_dataset]
accuracy = accuracy_metric(actual, predictions)
print(accuracy)
(Note: untested)
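One further caveat: because this tree splits every feature, including the continuous ones, on exact values, classify() raises a KeyError whenever a test row reaches a node with a feature value that never occurred there during training, which becomes likely once the test rows are held out. A minimal sketch of a guard, assuming a fixed fallback label is acceptable (classify_safe and its default parameter are hypothetical names, not part of the original code):

def classify_safe(inputTree, featLabels, testVec, default='missed'):
    # fall back to a default label when the tree has no branch for a
    # feature value it never saw during training (common here, since
    # continuous columns are split on exact values)
    try:
        return classify(inputTree, featLabels, testVec)
    except KeyError:
        return default

Calling classify_safe in place of classify in the test loop keeps the evaluation running over all held-out rows instead of crashing on the first unseen value.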