在 k 折交叉验证中训练集和测试集每次都在变化,导致朴素贝叶斯分类器的准确率不固定
Train and test sets change on every run of k-fold cross-validation, so the accuracy of the naive Bayes classifier changes
我正在尝试使用来自这里的朴素贝叶斯分类器代码。
我对数据集使用 5 折交叉验证。问题是,每次执行时各折的测试集和训练集都在变化,因此每次执行得到的准确率也在变化。但我需要一个固定的准确率结果。我正在尝试使用一些示例数据集获得结果。
我的 Jupyter 代码在这里:
import numpy as np
from random import randrange
import csv
import math
import codecs
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as computed by ``np.mean``."""
    average = np.mean(numbers)
    return average
def stdev(numbers):
    """Return the standard deviation of ``numbers`` as computed by ``np.std``.

    ``np.std`` is called with its defaults, so this is the population
    standard deviation (``ddof=0``), not the sample one.
    """
    spread = np.std(numbers)
    return spread
def cross_validation_split(dataset, n_folds, seed=None):
    """Split ``dataset`` into ``n_folds`` randomly drawn folds of equal size.

    Rows are drawn without replacement, so no row lands in two folds. Each
    fold holds ``len(dataset) // n_folds`` rows; rows left over when the
    length is not a multiple of ``n_folds`` are not assigned to any fold.

    Args:
        dataset: Sequence of rows to partition. It is copied, not modified.
        n_folds: Number of folds to build.
        seed: Optional seed. When given, rows are drawn from a private
            ``random.Random(seed)``, so repeated calls return the same folds
            and the module-level generator is left untouched. When ``None``
            the module-level ``randrange`` is used, as before.

    Returns:
        A list of ``n_folds`` lists of rows.

    Raises:
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    if seed is None:
        pick = randrange
    else:
        from random import Random
        pick = Random(seed).randrange
    remaining = list(dataset)
    fold_size = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        # Popping removes the drawn row, so it cannot be drawn again.
        folds.append(
            [remaining.pop(pick(len(remaining))) for _ in range(fold_size)]
        )
    return folds
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Raises ``ZeroDivisionError`` when ``actual`` is empty.
    """
    hits = sum(
        1 for idx, label in enumerate(actual) if label == predicted[idx]
    )
    return hits / float(len(actual)) * 100.0
def evaluate_algorithm(dataset, algorithm, n_folds):
    """Score ``algorithm`` on every fold of a cross-validation split.

    Each fold is used once as the test set while the remaining folds are
    concatenated into the training set. The label (last column) of every
    test row is replaced by ``None`` before the row is handed to
    ``algorithm``.

    Args:
        dataset: Rows whose last column is the class label.
        algorithm: Callable taking ``(train_set, test_set)`` and returning
            one prediction per test row.
        n_folds: Number of folds to split ``dataset`` into.

    Returns:
        A ``(scores, predicted)`` tuple: the accuracy percentage of each
        fold, and the predictions of the last fold only.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        others = list(folds)
        others.remove(held_out)
        train_set = [row for other in others for row in other]
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None  # hide the label from the algorithm
            test_set.append(masked)
        predicted = algorithm(train_set, test_set)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores, predicted
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label (last column of a row).

    Returns a dict mapping each label to the list of rows carrying it, with
    rows kept in their original order.
    """
    groups = {}
    for row in dataset:
        groups.setdefault(row[-1], []).append(row)
    return groups
def model(dataset):
    """Summarise every feature column of ``dataset`` as ``(mean, stdev)``.

    The last column of each row is the class label. It is skipped rather
    than summarised and discarded, so labels do not have to be numeric and
    an empty dataset yields an empty list.

    Args:
        dataset: Rows of feature values followed by a class label.

    Returns:
        A list with one ``(mean, stdev)`` tuple per feature column.
    """
    columns = list(zip(*dataset))
    return [(mean(column), stdev(column)) for column in columns[:-1]]
def model_by_class(dataset):
    """Build the per-feature ``(mean, stdev)`` summaries for each class.

    Rows are grouped with ``separate_by_class`` and each group is summarised
    with ``model``. Returns a dict mapping class label to that summary.
    """
    return {
        label: model(rows)
        for label, rows in separate_by_class(dataset).items()
    }
def calculate_pdf(x, mean, stdev):
    """Return the Gaussian density of ``x`` for the given ``mean``/``stdev``.

    A zero ``stdev`` is handled as a special case: the result is 1.0 when
    ``x`` equals ``mean`` and 0.0 otherwise.
    """
    if stdev == 0.0:
        return 1.0 if x == mean else 0.0
    variance = math.pow(stdev, 2)
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * variance)))
    return 1 / (math.sqrt(2 * math.pi) * stdev) * exponent
def calculate_class_probabilities(models, input):
    """Return the combined feature likelihood of ``input`` for every class.

    For each class the Gaussian densities of the individual features are
    multiplied together, starting from 1; no class prior is applied.

    Args:
        models: Dict mapping class label to a list of ``(mean, stdev)``
            tuples, one per feature.
        input: Row whose leading values are the features, indexed in the
            same order as the tuples in ``models``.

    Returns:
        A dict mapping class label to the product of densities.
    """
    probabilities = {}
    for label, feature_models in models.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_models):
            likelihood *= calculate_pdf(input[position], mu, sigma)
        probabilities[label] = likelihood
    return probabilities
def predict(models, inputVector):
    """Return the class label with the highest probability for ``inputVector``.

    On ties the label encountered first wins; ``None`` is returned when
    ``models`` contains no classes.
    """
    probabilities = calculate_class_probabilities(models, inputVector)
    best_label, best_prob = None, -1
    for label, prob in probabilities.items():
        # Keep the current best unless nothing is chosen yet or this is
        # strictly better.
        if best_label is not None and not prob > best_prob:
            continue
        best_label, best_prob = label, prob
    return best_label
def getPredictions(models, testSet):
    """Return the predicted class label for every row of ``testSet``."""
    return [predict(models, row) for row in testSet]
def naive_bayes(train, test):
    """Fit the per-class summaries on ``train`` and predict labels for ``test``.

    Returns the list of predicted labels, one per row of ``test``.
    """
    return getPredictions(model_by_class(train), test)
# Load and prepare the sample data: two features followed by the class label.
dataset = [
    [1, 20, 1],
    [2, 21, 0],
    [3, 22, 1],
    [4, 22, 0],
    [5, 20, 0],
    [6, 20, 1],
    [7, 21, 0],
    [8, 22, 1],
    [9, 22, 0],
    [10, 20, 1],
]
n_folds = 5
print("---------- Gaussian Naive Bayes ---------------")
# evaluate_algorithm returns (scores, predicted). Only the per-fold scores
# can be formatted with a single %s and averaged; using the whole tuple
# raises TypeError in both print statements below.
accuracy_naive, _predicted = evaluate_algorithm(dataset, naive_bayes, n_folds)
print("Naive Bayes Classification")
print('Accuracy in each fold: %s' % accuracy_naive)
print('Average Accuracy: %f' % (sum(accuracy_naive) / len(accuracy_naive)))
我尝试用示例数据进行测试。我认为问题出在这里:
def cross_validation_split(dataset, n_folds, seed=None):
    """Split ``dataset`` into ``n_folds`` randomly drawn folds of equal size.

    Rows are drawn without replacement, so no row lands in two folds. Each
    fold holds ``len(dataset) // n_folds`` rows; rows left over when the
    length is not a multiple of ``n_folds`` are not assigned to any fold.

    Args:
        dataset: Sequence of rows to partition. It is copied, not modified.
        n_folds: Number of folds to build.
        seed: Optional seed. When given, rows are drawn from a private
            ``random.Random(seed)``, so repeated calls return the same folds
            and the module-level generator is left untouched. When ``None``
            the module-level ``randrange`` is used, as before.

    Returns:
        A list of ``n_folds`` lists of rows.

    Raises:
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    if seed is None:
        pick = randrange
    else:
        from random import Random
        pick = Random(seed).randrange
    remaining = list(dataset)
    fold_size = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        # Popping removes the drawn row, so it cannot be drawn again.
        folds.append(
            [remaining.pop(pick(len(remaining))) for _ in range(fold_size)]
        )
    return folds
# Try the split on a small sample dataset.
dataset = [
    [1, 20, 1],
    [2, 21, 0],
    [3, 22, 1],
    [4, 22, 0],
    [5, 20, 0],
    [6, 20, 1],
    [7, 21, 0],
    [8, 22, 1],
    [9, 22, 0],
    [10, 20, 1],
]
nfold = 5
dataset_split = cross_validation_split(dataset, nfold)
dataset_split  # bare expression so a notebook cell displays the folds
提前致谢
在调用 randrange 之前先设置随机种子,这样每次执行都会得到相同的拆分。
所以,你可以这样修改代码,
import random
def cross_validation_split(dataset, n_folds):
    """Split ``dataset`` into ``n_folds`` folds, identically on every call.

    Rows are drawn without replacement from a private generator seeded with
    0, so the folds are reproducible and the module-level ``random`` state
    is not reset as a side effect. Each fold holds
    ``len(dataset) // n_folds`` rows; leftover rows are not assigned to any
    fold.

    Args:
        dataset: Sequence of rows to partition. It is copied, not modified.
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` lists of rows.

    Raises:
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    # Random(0) yields the same sequence as random.seed(0) followed by
    # module-level calls, without touching the shared generator.
    rng = random.Random(0)
    remaining = list(dataset)
    fold_size = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        # Popping removes the drawn row, so it cannot be drawn again.
        folds.append(
            [remaining.pop(rng.randrange(len(remaining))) for _ in range(fold_size)]
        )
    return folds
# Try the split on a small sample dataset.
dataset = [
    [1, 20, 1],
    [2, 21, 0],
    [3, 22, 1],
    [4, 22, 0],
    [5, 20, 0],
    [6, 20, 1],
    [7, 21, 0],
    [8, 22, 1],
    [9, 22, 0],
    [10, 20, 1],
]
nfold = 5
dataset_split = cross_validation_split(dataset, nfold)
dataset_split  # bare expression so a notebook cell displays the folds
@Amesh Jayaweera 提供的答案是正确的,但我想告诉你,sklearn 中有一个预定义函数,如下所示。
from sklearn.model_selection import StratifiedKFold

# The class is spelled StratifiedKFold (capital F). random_state only has an
# effect when shuffle=True; scikit-learn raises ValueError if random_state is
# set while shuffle is left at its default of False.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
这种方式更好:实现更优雅,而且还有对各折进行分层的额外好处。此外,random_state 就是随机种子(只有在 shuffle=True 时才起作用)。你可以在线查阅它的实现。
我正在尝试使用来自这里的朴素贝叶斯分类器代码。 我对数据集使用 5 折交叉验证。问题是,每次执行时各折的测试集和训练集都在变化,因此每次执行得到的准确率也在变化。但我需要一个固定的准确率结果。我正在尝试使用一些示例数据集获得结果。 我的 Jupyter 代码在这里:
import numpy as np
from random import randrange
import csv
import math
import codecs
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as computed by ``np.mean``."""
    average = np.mean(numbers)
    return average
def stdev(numbers):
    """Return the standard deviation of ``numbers`` as computed by ``np.std``.

    ``np.std`` is called with its defaults, so this is the population
    standard deviation (``ddof=0``), not the sample one.
    """
    spread = np.std(numbers)
    return spread
def cross_validation_split(dataset, n_folds, seed=None):
    """Split ``dataset`` into ``n_folds`` randomly drawn folds of equal size.

    Rows are drawn without replacement, so no row lands in two folds. Each
    fold holds ``len(dataset) // n_folds`` rows; rows left over when the
    length is not a multiple of ``n_folds`` are not assigned to any fold.

    Args:
        dataset: Sequence of rows to partition. It is copied, not modified.
        n_folds: Number of folds to build.
        seed: Optional seed. When given, rows are drawn from a private
            ``random.Random(seed)``, so repeated calls return the same folds
            and the module-level generator is left untouched. When ``None``
            the module-level ``randrange`` is used, as before.

    Returns:
        A list of ``n_folds`` lists of rows.

    Raises:
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    if seed is None:
        pick = randrange
    else:
        from random import Random
        pick = Random(seed).randrange
    remaining = list(dataset)
    fold_size = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        # Popping removes the drawn row, so it cannot be drawn again.
        folds.append(
            [remaining.pop(pick(len(remaining))) for _ in range(fold_size)]
        )
    return folds
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Raises ``ZeroDivisionError`` when ``actual`` is empty.
    """
    hits = sum(
        1 for idx, label in enumerate(actual) if label == predicted[idx]
    )
    return hits / float(len(actual)) * 100.0
def evaluate_algorithm(dataset, algorithm, n_folds):
    """Score ``algorithm`` on every fold of a cross-validation split.

    Each fold is used once as the test set while the remaining folds are
    concatenated into the training set. The label (last column) of every
    test row is replaced by ``None`` before the row is handed to
    ``algorithm``.

    Args:
        dataset: Rows whose last column is the class label.
        algorithm: Callable taking ``(train_set, test_set)`` and returning
            one prediction per test row.
        n_folds: Number of folds to split ``dataset`` into.

    Returns:
        A ``(scores, predicted)`` tuple: the accuracy percentage of each
        fold, and the predictions of the last fold only.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        others = list(folds)
        others.remove(held_out)
        train_set = [row for other in others for row in other]
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None  # hide the label from the algorithm
            test_set.append(masked)
        predicted = algorithm(train_set, test_set)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores, predicted
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label (last column of a row).

    Returns a dict mapping each label to the list of rows carrying it, with
    rows kept in their original order.
    """
    groups = {}
    for row in dataset:
        groups.setdefault(row[-1], []).append(row)
    return groups
def model(dataset):
    """Summarise every feature column of ``dataset`` as ``(mean, stdev)``.

    The last column of each row is the class label. It is skipped rather
    than summarised and discarded, so labels do not have to be numeric and
    an empty dataset yields an empty list.

    Args:
        dataset: Rows of feature values followed by a class label.

    Returns:
        A list with one ``(mean, stdev)`` tuple per feature column.
    """
    columns = list(zip(*dataset))
    return [(mean(column), stdev(column)) for column in columns[:-1]]
def model_by_class(dataset):
    """Build the per-feature ``(mean, stdev)`` summaries for each class.

    Rows are grouped with ``separate_by_class`` and each group is summarised
    with ``model``. Returns a dict mapping class label to that summary.
    """
    return {
        label: model(rows)
        for label, rows in separate_by_class(dataset).items()
    }
def calculate_pdf(x, mean, stdev):
    """Return the Gaussian density of ``x`` for the given ``mean``/``stdev``.

    A zero ``stdev`` is handled as a special case: the result is 1.0 when
    ``x`` equals ``mean`` and 0.0 otherwise.
    """
    if stdev == 0.0:
        return 1.0 if x == mean else 0.0
    variance = math.pow(stdev, 2)
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * variance)))
    return 1 / (math.sqrt(2 * math.pi) * stdev) * exponent
def calculate_class_probabilities(models, input):
    """Return the combined feature likelihood of ``input`` for every class.

    For each class the Gaussian densities of the individual features are
    multiplied together, starting from 1; no class prior is applied.

    Args:
        models: Dict mapping class label to a list of ``(mean, stdev)``
            tuples, one per feature.
        input: Row whose leading values are the features, indexed in the
            same order as the tuples in ``models``.

    Returns:
        A dict mapping class label to the product of densities.
    """
    probabilities = {}
    for label, feature_models in models.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_models):
            likelihood *= calculate_pdf(input[position], mu, sigma)
        probabilities[label] = likelihood
    return probabilities
def predict(models, inputVector):
    """Return the class label with the highest probability for ``inputVector``.

    On ties the label encountered first wins; ``None`` is returned when
    ``models`` contains no classes.
    """
    probabilities = calculate_class_probabilities(models, inputVector)
    best_label, best_prob = None, -1
    for label, prob in probabilities.items():
        # Keep the current best unless nothing is chosen yet or this is
        # strictly better.
        if best_label is not None and not prob > best_prob:
            continue
        best_label, best_prob = label, prob
    return best_label
def getPredictions(models, testSet):
    """Return the predicted class label for every row of ``testSet``."""
    return [predict(models, row) for row in testSet]
def naive_bayes(train, test):
    """Fit the per-class summaries on ``train`` and predict labels for ``test``.

    Returns the list of predicted labels, one per row of ``test``.
    """
    return getPredictions(model_by_class(train), test)
# Load and prepare the sample data: two features followed by the class label.
dataset = [
    [1, 20, 1],
    [2, 21, 0],
    [3, 22, 1],
    [4, 22, 0],
    [5, 20, 0],
    [6, 20, 1],
    [7, 21, 0],
    [8, 22, 1],
    [9, 22, 0],
    [10, 20, 1],
]
n_folds = 5
print("---------- Gaussian Naive Bayes ---------------")
# evaluate_algorithm returns (scores, predicted). Only the per-fold scores
# can be formatted with a single %s and averaged; using the whole tuple
# raises TypeError in both print statements below.
accuracy_naive, _predicted = evaluate_algorithm(dataset, naive_bayes, n_folds)
print("Naive Bayes Classification")
print('Accuracy in each fold: %s' % accuracy_naive)
print('Average Accuracy: %f' % (sum(accuracy_naive) / len(accuracy_naive)))
我尝试用示例数据进行测试。我认为问题出在这里:
def cross_validation_split(dataset, n_folds, seed=None):
    """Split ``dataset`` into ``n_folds`` randomly drawn folds of equal size.

    Rows are drawn without replacement, so no row lands in two folds. Each
    fold holds ``len(dataset) // n_folds`` rows; rows left over when the
    length is not a multiple of ``n_folds`` are not assigned to any fold.

    Args:
        dataset: Sequence of rows to partition. It is copied, not modified.
        n_folds: Number of folds to build.
        seed: Optional seed. When given, rows are drawn from a private
            ``random.Random(seed)``, so repeated calls return the same folds
            and the module-level generator is left untouched. When ``None``
            the module-level ``randrange`` is used, as before.

    Returns:
        A list of ``n_folds`` lists of rows.

    Raises:
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    if seed is None:
        pick = randrange
    else:
        from random import Random
        pick = Random(seed).randrange
    remaining = list(dataset)
    fold_size = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        # Popping removes the drawn row, so it cannot be drawn again.
        folds.append(
            [remaining.pop(pick(len(remaining))) for _ in range(fold_size)]
        )
    return folds
# Try the split on a small sample dataset.
dataset = [
    [1, 20, 1],
    [2, 21, 0],
    [3, 22, 1],
    [4, 22, 0],
    [5, 20, 0],
    [6, 20, 1],
    [7, 21, 0],
    [8, 22, 1],
    [9, 22, 0],
    [10, 20, 1],
]
nfold = 5
dataset_split = cross_validation_split(dataset, nfold)
dataset_split  # bare expression so a notebook cell displays the folds
提前致谢
在调用 randrange 之前先设置随机种子,这样每次执行都会得到相同的拆分。
所以,你可以这样修改代码,
import random
def cross_validation_split(dataset, n_folds):
    """Split ``dataset`` into ``n_folds`` folds, identically on every call.

    Rows are drawn without replacement from a private generator seeded with
    0, so the folds are reproducible and the module-level ``random`` state
    is not reset as a side effect. Each fold holds
    ``len(dataset) // n_folds`` rows; leftover rows are not assigned to any
    fold.

    Args:
        dataset: Sequence of rows to partition. It is copied, not modified.
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` lists of rows.

    Raises:
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    # Random(0) yields the same sequence as random.seed(0) followed by
    # module-level calls, without touching the shared generator.
    rng = random.Random(0)
    remaining = list(dataset)
    fold_size = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        # Popping removes the drawn row, so it cannot be drawn again.
        folds.append(
            [remaining.pop(rng.randrange(len(remaining))) for _ in range(fold_size)]
        )
    return folds
# Try the split on a small sample dataset.
dataset = [
    [1, 20, 1],
    [2, 21, 0],
    [3, 22, 1],
    [4, 22, 0],
    [5, 20, 0],
    [6, 20, 1],
    [7, 21, 0],
    [8, 22, 1],
    [9, 22, 0],
    [10, 20, 1],
]
nfold = 5
dataset_split = cross_validation_split(dataset, nfold)
dataset_split  # bare expression so a notebook cell displays the folds
@Amesh Jayaweera 提供的答案是正确的,但我想告诉你,sklearn 中有一个预定义函数,如下所示。
from sklearn.model_selection import StratifiedKFold

# The class is spelled StratifiedKFold (capital F). random_state only has an
# effect when shuffle=True; scikit-learn raises ValueError if random_state is
# set while shuffle is left at its default of False.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
这种方式更好:实现更优雅,而且还有对各折进行分层的额外好处。此外,random_state 就是随机种子(只有在 shuffle=True 时才起作用)。你可以在线查阅它的实现。