如何将 for 循环的结果传入 sklearn 的 t 检验方法
How can I pass for-loop results into sklearn's t-test method?
我有一个 for 循环，用于遍历机器学习算法列表，我想对这些机器学习模型的结果执行 t 检验。
import pandas
import numpy
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_curve, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from scipy.stats import ttest_ind, ttest_ind_from_stats
from scipy.special import stdtr
from sklearn.svm import SVC
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import warnings
# Load the NSL-KDD dataset.
data_set = "NSL-KDD/KDDTest+.arff"
print("%s %s" % ("Loading: ", data_set))
with warnings.catch_warnings():
    # Silence any warnings emitted while the file is parsed.
    warnings.simplefilter("ignore")
    # NOTE(review): 'class' is listed at index 39, before
    # 'dst_host_srv_rerror_rate' (index 40), yet the label column below is
    # taken by position from index 40 -- confirm the order against the file.
    names = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'su_attempted',
        'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
        'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
        'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
        'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'class', 'dst_host_srv_rerror_rate',
    ]
    # NOTE(review): the path ends in .arff but is read as plain CSV; verify
    # that the file carries no ARFF header lines.
    dataset = pandas.read_csv(data_set, names=names)

# Label-encode every object-typed column so that all values become numeric.
# `type(object)` evaluates to `type`, not `object`, so compare with `object`.
for column in dataset.columns:
    if dataset[column].dtype == object:
        le = LabelEncoder()
        dataset[column] = le.fit_transform(dataset[column])

# Split data into X and y -- these values must be changed to 8 for the
# diabetes dataset and 4 for the iris dataset.
array = dataset.values
X = array[:, 0:40]
Y = array[:, 40]

# Split out a validation dataset.
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric.
num_folds = 7
num_instances = len(X_train)
scoring = 'accuracy'

# Algorithms to compare, as (display name, estimator) pairs.
models = [
    ('KNN', KNeighborsClassifier()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('Random Forest', RandomForestClassifier()),
    ('LR', LogisticRegression()),
]

# Evaluate each model in turn; the per-fold scores are collected in `results`
# (with the matching model name in `names`) so they can be compared afterwards.
results = []
names = []
# The fold definition does not depend on the model, so build it once.
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
for name, model in models:
    cv_results = cross_validation.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Multiply by 100 to show percentages.
    msg = "%s: %f (%f)" % (name, cv_results.mean()*100, cv_results.std()*100)
    print(msg)
    print(cv_results * 100)
    # A t-test needs two *different* score arrays. Passing cv_results as both
    # samples (as in the disabled lines below) compares a model with itself.
    #t, p = ttest_ind(cv_results, cv_results, equal_var=False)
    #print("T_Test: T Value = %g P Value = %g" % (t, p))
上面是我的 for 循环，它遍历 models 算法列表，产生的结果如下所示：
KNN: 90.027688 (0.269979)
[ 90.0464756 90.12393493 89.5817196 90.08136381 90.35257652
89.69391709 90.31383185]
LDA: 83.646949 (0.187389)
[ 83.46243222 83.50116189 83.61735089 84.07593956 83.64974816
83.57225881 83.64974816]
CART: 92.440512 (0.438941)
[ 92.79628195 92.52517428 91.94422928 93.02595893 92.59976753
91.66989539 92.52227819]
NB: 29.662505 (1.702733)
[ 28.11773819 30.51897754 28.89233153 28.51607904 33.39790779
29.67841922 28.51607904]
Random Forest: 92.318760 (0.355462)
[ 92.52517428 92.64136328 91.78931061 92.32855482 92.71600155
91.78612941 92.44478884]
我的问题是：如何将 cv_results 作为参数 a、b 传入 t 检验？我尝试过直接传入 cv_results，但每次迭代得到的结果都相同。
你得到了相同的 t 统计量，因为在每次循环中你检验的是两个相同的数组，即 a 和 b 都是 cv_results。
如果你想在模型之间进行 t 检验，只需将所有模型的 cv_results 保存到一个列表中，然后遍历该列表，计算两两之间的 t 检验。
例如:
# Run Welch's t-test (equal_var=False) on every unordered pair of distinct
# models, using the per-fold scores stored in `results` and the matching
# display names stored in `names`.
# NOTE(review): every model is scored on the same folds, so a paired test
# (scipy.stats.ttest_rel) may be more appropriate -- confirm before relying
# on these p-values.
for i in range(len(results) - 1):
    # Start at i + 1 so that a model is never tested against itself.
    for j in range(i + 1, len(results)):
        t, p = ttest_ind(results[i], results[j], equal_var=False)
        print("T_Test between {} & {}: T Value = {}, P Value = {}".format(names[i], names[j], t, p))
我有一个 for 循环，用于遍历机器学习算法列表，我想对这些机器学习模型的结果执行 t 检验。
import pandas
import numpy
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_curve, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from scipy.stats import ttest_ind, ttest_ind_from_stats
from scipy.special import stdtr
from sklearn.svm import SVC
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import warnings
# Load the NSL-KDD dataset.
data_set = "NSL-KDD/KDDTest+.arff"
print("%s %s" % ("Loading: ", data_set))
with warnings.catch_warnings():
    # Silence any warnings emitted while the file is parsed.
    warnings.simplefilter("ignore")
    # NOTE(review): 'class' is listed at index 39, before
    # 'dst_host_srv_rerror_rate' (index 40), yet the label column below is
    # taken by position from index 40 -- confirm the order against the file.
    names = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'su_attempted',
        'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
        'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
        'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
        'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'class', 'dst_host_srv_rerror_rate',
    ]
    # NOTE(review): the path ends in .arff but is read as plain CSV; verify
    # that the file carries no ARFF header lines.
    dataset = pandas.read_csv(data_set, names=names)

# Label-encode every object-typed column so that all values become numeric.
# `type(object)` evaluates to `type`, not `object`, so compare with `object`.
for column in dataset.columns:
    if dataset[column].dtype == object:
        le = LabelEncoder()
        dataset[column] = le.fit_transform(dataset[column])

# Split data into X and y -- these values must be changed to 8 for the
# diabetes dataset and 4 for the iris dataset.
array = dataset.values
X = array[:, 0:40]
Y = array[:, 40]

# Split out a validation dataset.
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric.
num_folds = 7
num_instances = len(X_train)
scoring = 'accuracy'

# Algorithms to compare, as (display name, estimator) pairs.
models = [
    ('KNN', KNeighborsClassifier()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('Random Forest', RandomForestClassifier()),
    ('LR', LogisticRegression()),
]

# Evaluate each model in turn; the per-fold scores are collected in `results`
# (with the matching model name in `names`) so they can be compared afterwards.
results = []
names = []
# The fold definition does not depend on the model, so build it once.
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
for name, model in models:
    cv_results = cross_validation.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Multiply by 100 to show percentages.
    msg = "%s: %f (%f)" % (name, cv_results.mean()*100, cv_results.std()*100)
    print(msg)
    print(cv_results * 100)
    # A t-test needs two *different* score arrays. Passing cv_results as both
    # samples (as in the disabled lines below) compares a model with itself.
    #t, p = ttest_ind(cv_results, cv_results, equal_var=False)
    #print("T_Test: T Value = %g P Value = %g" % (t, p))
上面是我的 for 循环，它遍历 models 算法列表，产生的结果如下所示：
KNN: 90.027688 (0.269979)
[ 90.0464756 90.12393493 89.5817196 90.08136381 90.35257652
89.69391709 90.31383185]
LDA: 83.646949 (0.187389)
[ 83.46243222 83.50116189 83.61735089 84.07593956 83.64974816
83.57225881 83.64974816]
CART: 92.440512 (0.438941)
[ 92.79628195 92.52517428 91.94422928 93.02595893 92.59976753
91.66989539 92.52227819]
NB: 29.662505 (1.702733)
[ 28.11773819 30.51897754 28.89233153 28.51607904 33.39790779
29.67841922 28.51607904]
Random Forest: 92.318760 (0.355462)
[ 92.52517428 92.64136328 91.78931061 92.32855482 92.71600155
91.78612941 92.44478884]
我的问题是：如何将 cv_results 作为参数 a、b 传入 t 检验？我尝试过直接传入 cv_results，但每次迭代得到的结果都相同。
你得到了相同的 t 统计量，因为在每次循环中你检验的是两个相同的数组，即 a 和 b 都是 cv_results。如果你想在模型之间进行 t 检验，只需将所有模型的 cv_results 保存到一个列表中，然后遍历该列表，计算两两之间的 t 检验。
例如:
# Run Welch's t-test (equal_var=False) on every unordered pair of distinct
# models, using the per-fold scores stored in `results` and the matching
# display names stored in `names`.
# NOTE(review): every model is scored on the same folds, so a paired test
# (scipy.stats.ttest_rel) may be more appropriate -- confirm before relying
# on these p-values.
for i in range(len(results) - 1):
    # Start at i + 1 so that a model is never tested against itself.
    for j in range(i + 1, len(results)):
        t, p = ttest_ind(results[i], results[j], equal_var=False)
        print("T_Test between {} & {}: T Value = {}, P Value = {}".format(names[i], names[j], t, p))