How to split data into a balanced training set and test set in sklearn
I am using sklearn for a multiclass classification task. I need to split all the data into a train_set and a test_set, and I want to take the same number of samples from each class at random. At the moment I am using this function:
X_train, X_test, y_train, y_test = train_test_split(Data, Target, test_size=0.3, random_state=0)
But it gives unbalanced datasets! Any suggestions?
You can use StratifiedShuffleSplit to create datasets featuring the same percentage of classes as the original one:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.array([[1, 3], [3, 7], [2, 4], [4, 8]])
y = np.array([0, 1, 0, 1])

# one stratified split, holding out half of the data for testing
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for train_idx, test_idx in stratSplit.split(X, y):
    X_train = X[train_idx]
    y_train = y[train_idx]

print(X_train)
# e.g. [[3 7]
#       [2 4]]
print(y_train)
# e.g. [1 0] -- one sample from each class
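For completeness, a small continuation sketch (reusing X, y and stratSplit from above): the complementary indices from the same split give you the test half as well:

for train_idx, test_idx in stratSplit.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

print(y_test)
# e.g. [0 1] -- stratification keeps one sample of each class here too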
Even though Christian's suggestion is correct, technically train_test_split should give you stratified results by using the stratify parameter. So you could do:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Data, Target, test_size=0.3, random_state=0, stratify=Target)
The catch here is that this parameter has only been available since version 0.17 of sklearn. From the documentation on the stratify parameter:
stratify : array-like or None (default is None)
If not None, data is split in a stratified fashion, using this as the labels array.
New in version 0.17: stratify splitting
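As a quick sanity check (a minimal sketch with made-up data; the 80/20 class ratio is illustrative), you can confirm that the stratified split preserves the class proportions in both halves:

import numpy as np
from sklearn.model_selection import train_test_split

# toy data: 80 samples of class 0 and 20 of class 1
Data = np.random.rand(100, 2)
Target = np.array([0] * 80 + [1] * 20)

X_train, X_test, y_train, y_test = train_test_split(
    Data, Target, test_size=0.3, random_state=0, stratify=Target)

# both splits keep the original 80/20 class ratio
print(np.bincount(y_train))  # [56 14]
print(np.bincount(y_test))   # [24  6]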
Stratification will not help if the classes are imbalanced but you want the split itself to be balanced. There does not seem to be a built-in way to do balanced sampling in sklearn, but it is easy using basic numpy; for example, a function like this might help you:
import numpy as np

def split_balanced(data, target, test_size=0.2):
    classes = np.unique(target)
    # test_size can be given as a fraction of the input data size or as a number of samples
    if test_size < 1:
        n_test = int(np.round(len(target) * test_size))
    else:
        n_test = int(test_size)
    n_train = max(0, len(target) - n_test)
    n_train_per_class = max(1, int(np.floor(n_train / len(classes))))
    n_test_per_class = max(1, int(np.floor(n_test / len(classes))))

    ixs = []
    for cl in classes:
        if (n_train_per_class + n_test_per_class) > np.sum(target == cl):
            # if data has too few samples for this class, do upsampling;
            # split the data into training and testing before sampling so data points
            # won't be shared among training and test data
            splitix = int(np.ceil(n_train_per_class / (n_train_per_class + n_test_per_class)
                                  * np.sum(target == cl)))
            ixs.append(np.r_[np.random.choice(np.nonzero(target == cl)[0][:splitix], n_train_per_class),
                             np.random.choice(np.nonzero(target == cl)[0][splitix:], n_test_per_class)])
        else:
            ixs.append(np.random.choice(np.nonzero(target == cl)[0],
                                        n_train_per_class + n_test_per_class,
                                        replace=False))

    # take the same number of samples from all classes
    ix_train = np.concatenate([x[:n_train_per_class] for x in ixs])
    ix_test = np.concatenate([x[n_train_per_class:(n_train_per_class + n_test_per_class)] for x in ixs])
    X_train = data[ix_train, :]
    X_test = data[ix_test, :]
    y_train = target[ix_train]
    y_test = target[ix_test]

    return X_train, X_test, y_train, y_test
Note that if you use this and you sample more points per class than there are in the input data, those will be upsampled (sampled with replacement). As a result, some data points will appear multiple times, which may have an effect on accuracy measures etc. And if some class has only one data point, there will be an error. You can easily check the number of points per class, for example with np.unique(target, return_counts=True).
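For instance, a usage sketch of split_balanced on made-up imbalanced data (names and sizes are illustrative) shows the per-class balance, with the minority class upsampled as described:

import numpy as np

# toy imbalanced data: 90 samples of class 0, only 10 of class 1
data = np.random.rand(100, 3)
target = np.array([0] * 90 + [1] * 10)
print(np.unique(target, return_counts=True))   # (array([0, 1]), array([90, 10]))

X_train, X_test, y_train, y_test = split_balanced(data, target, test_size=0.2)

# each class contributes the same number of samples to both splits;
# class 1 had to be sampled with replacement to reach 40 training points
print(np.unique(y_train, return_counts=True))  # (array([0, 1]), array([40, 40]))
print(np.unique(y_test, return_counts=True))   # (array([0, 1]), array([10, 10]))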
Here is the implementation I use to get train/test data indexes:
import random
import numpy as np

def get_safe_balanced_split(target, trainSize=0.8, getTestIndexes=True, shuffle=False, seed=None):
    classes, counts = np.unique(target, return_counts=True)
    nPerClass = float(len(target)) * float(trainSize) / float(len(classes))
    if nPerClass > np.min(counts):
        print("Insufficient data to produce a balanced training data split.")
        print("Classes found %s" % classes)
        print("Classes count %s" % counts)
        ts = float(trainSize * np.min(counts) * len(classes)) / float(len(target))
        print("trainSize is reset from %s to %s" % (trainSize, ts))
        trainSize = ts
        nPerClass = float(len(target)) * float(trainSize) / float(len(classes))
    # get number of samples per class
    nPerClass = int(nPerClass)
    print("Data splitting on %i classes and returning %i per class" % (len(classes), nPerClass))
    # get train indexes
    if seed is not None:
        np.random.seed(seed)
    trainIndexes = []
    for c in classes:
        cIdxs = np.where(target == c)[0]
        cIdxs = np.random.choice(cIdxs, nPerClass, replace=False)
        trainIndexes.extend(cIdxs)
    # get test indexes
    testIndexes = None
    if getTestIndexes:
        testIndexes = list(set(range(len(target))) - set(trainIndexes))
    # shuffle in place (random.shuffle returns None, so don't reassign)
    if shuffle:
        random.shuffle(trainIndexes)
        if testIndexes is not None:
            random.shuffle(testIndexes)
    # return indexes
    return trainIndexes, testIndexes
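A usage sketch (illustrative data): the function returns index lists, which you then use to slice your arrays. Here 40 balanced training samples per class is infeasible, so trainSize is reset as described above:

import numpy as np

# toy data: 25 samples of class 0, 75 of class 1
data = np.random.rand(100, 4)
target = np.array([0] * 25 + [1] * 75)

trainIndexes, testIndexes = get_safe_balanced_split(target, trainSize=0.8, seed=42)
X_train, y_train = data[trainIndexes], target[trainIndexes]
X_test, y_test = data[testIndexes], target[testIndexes]

# trainSize was reset from 0.8 to 0.4, giving 20 samples per class
print(np.unique(y_train, return_counts=True))  # (array([0, 1]), array([20, 20]))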
Another approach is to over- or under-sample from your stratified test/train split. The imbalanced-learn library is quite handy for this; it is especially useful if you are doing online learning and want to guarantee balanced train data within your pipeline:
from imblearn.pipeline import Pipeline as ImbalancePipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC

model = ImbalancePipeline(steps=[
    ('data_balancer', RandomOverSampler()),
    ('classifier', SVC()),
])
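A brief usage note (a sketch, with X_train etc. coming from any of the splits above): in an imblearn Pipeline the resampling step runs only during fit, so the test data you score on keeps its original class balance:

# oversampling happens inside fit, on the training data only
model.fit(X_train, y_train)

# predict/score apply no resampling, so test metrics reflect the true class distribution
print(model.score(X_test, y_test))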