Accuracy of Decision Tree Classifier
I have a decision tree classifier that predicts the value of the last column of my dataset, which is either 'made' or 'missed'. I have run the program several times, and the accuracy is always 100%. I expected it to be around 95-100%. Any idea why? Here is a snippet of the dataset (the full dataset has over 74,000 rows):
A,L,-5,8,3,475,11.8,1,1.6,6.1,2,2.7,made
A,L,-39,10,2,30,18.5,6,5.4,24.3,3,3.1,missed
A,L,-20,8,3,327,6.2,0,1.8,2.3,2,0,missed
A,W,16,5,1,504,11.7,0,1,18,2,7.3,missed
A,L,-5,3,2,547,19.9,0,1.2,23.9,3,7.5,made
H,W,14,4,2,600,17.6,0,0.5,5.5,2,3.8,made
H,L,-8,6,3,692,23,1,1.9,4.4,2,4.1,made
H,L,-10,11,3,171,14.4,0,0.9,25.2,3,5.8,missed
Here is the classifier code:
from math import log
import operator

import pandas as pd  # needed for read_csv below


def load_csv(filename):
    headers = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock",
               "dribbles", "touch_time", "shot_dist", "pts_type", "close_def_dist", "target"]
    df = pd.read_csv(filename, header=None, names=headers, na_values="?")
    # flatten the frame, then re-chunk it into rows of 13 values each
    obj_df = list(df.values.flatten())
    i = 0
    new_list = []
    while i < len(obj_df):
        new_list.append(obj_df[i:i + 13])
        i += 13
    labels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock",
              "dribbles", "touch_time", "shot_dist", "pts_type", "close_def_dist"]
    return new_list, labels
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the unique class labels and their occurrences
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column holds the class labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all the features
        featList = [example[i] for example in dataSet]  # all values of this feature
        uniqueVals = set(featList)  # the set of unique values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain, i.e. reduction in entropy
        if infoGain > bestInfoGain:  # keep the best gain seen so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # returns the index of the best feature
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1:  # stop splitting when there are no more features in dataSet
        return majorityCnt(classList)
    # split on the feature with the highest information gain
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    # build the tree recursively
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy the labels so recursive calls don't clobber them
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree)[0]  # the feature this node splits on
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):  # internal node: keep descending
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:  # leaf node: the stored class label
        classLabel = valueOfFeat
    return classLabel
def storeTree(inputTree, filename):
    import pickle
    with open(filename, 'wb') as fw:  # pickle requires a binary-mode file
        pickle.dump(inputTree, fw)

def grabTree(filename):
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
# collect data
myDat, labels = load_csv('data/basketball.train.csv')

# build a tree
mytree = createTree(myDat, labels)

# run the test -- note that this classifies the same rows the tree was built from
feature_labels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock",
                  "dribbles", "touch_time", "shot_dist", "pts_type", "close_def_dist"]
predictions = []
for row in myDat:
    prediction = classify(mytree, feature_labels, row[:12])
    predictions.append(prediction)
actual = [row[-1] for row in myDat]
accuracy = accuracy_metric(actual, predictions)
print(accuracy)
It looks like you are not splitting your dataset into separate training and test sets. The result is that your classifier overfits the dataset: the tree keeps splitting until each node is pure (and since the continuous features are matched on exact values, it can almost always get there), so it essentially memorizes the training rows. Evaluating on those same rows then reports 100% by construction, and the tree may not handle samples from outside the dataset nearly as well.
Try randomly selecting (say) 75% of the data for training, then test the accuracy on the remaining 25%. For example, replace the last part of your code with:
import random
dataset, labels = load_csv('data/basketball.train.csv')
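random.seed(0)  # optional addition: fix the seed so the shuffle (and the split) is reproducible across runs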
random.shuffle(dataset)

# hold out the last 25% of the shuffled rows for testing
split_index = int(len(dataset) * 0.75)
train_dataset = dataset[:split_index]
test_dataset = dataset[split_index:]

mytree = createTree(train_dataset, labels)

feature_labels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock",
                  "dribbles", "touch_time", "shot_dist", "pts_type", "close_def_dist"]
predictions = []
for row in test_dataset:
    prediction = classify(mytree, feature_labels, row[:12])
    predictions.append(prediction)
actual = [row[-1] for row in test_dataset]
accuracy = accuracy_metric(actual, predictions)
print(accuracy)
(Note: untested)
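One further caveat: because this tree splits every feature, including the continuous ones, on exact values, classify() raises a KeyError whenever a test row reaches a node with a feature value that never occurred there during training, which becomes likely once the test rows are held out. A minimal sketch of a guard, assuming a fixed fallback label is acceptable (classify_safe and its default parameter are hypothetical names, not part of the original code):

def classify_safe(inputTree, featLabels, testVec, default='missed'):
    # fall back to a default label when the tree has no branch for a
    # feature value it never saw during training (common here, since
    # continuous columns are split on exact values)
    try:
        return classify(inputTree, featLabels, testVec)
    except KeyError:
        return default

Calling classify_safe in place of classify in the test loop keeps the evaluation running over all held-out rows instead of crashing on the first unseen value.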