train_test_split 使用二维标签作为分层数组时出现异常
train_test_split exception with 2D labels as stratify array
我正在尝试使用 train_test_split 函数,并提供一个标签数组用于分层;该数组是二维的,其元素只取 0 或 1(即 [0,0]、[0,1]、[1,0] 和 [1,1] 是四种可能的标签)。出于代码兼容性原因,我无法重命名标签(例如,重命名为 1、2、3、4)。
以下是使用 train_test_split 的函数代码:
def preprocess_csv_deceptive_opinion(path, prediction, dataset_recreations):
    """
    Load the deceptive-opinion csv, encode the requested label(s) as 0/1 and
    build stratified train/validation splits for several training sizes.

    :param path: path of the original csv file
    :param prediction: which label(s) to predict: 'deceptive', 'polarity' or
        'multi' (both); any other value ends the program through sys.exit
    :param dataset_recreations: it is the number of random seeds used; one
        split is made per seed (0 .. dataset_recreations - 1) and training size
    :return: the list [read_dataset, num_labels, training_data, validation_data],
        where training_data and validation_data are dicts indexed as
        [training size][seed]
    """
    read_dataset = pd.read_csv(path) # Dataset columns: deceptive,hotel,polarity,source,text
    # print('average sentence length: ', df.text.str.split().str.len().mean())
    # print('stdev sentence length: ', df.text.str.split().str.len().std())
    read_dataset = read_dataset.drop(['hotel', 'source'], axis=1)
    # Encode the kept label column(s) as integers and drop the unused one.
    if prediction == 'multi':
        # 00 = truthful positive, 01 = truthful negative, 10 = deceptive positive, 11 = deceptive negative
        read_dataset['deceptive'] = (read_dataset['deceptive'] == 'deceptive').astype(int)
        read_dataset['polarity'] = (read_dataset['polarity'] == 'negative').astype(int)
    elif prediction == 'deceptive':
        read_dataset = read_dataset.drop(['polarity'], axis=1) # 0 = truthful, 1 = deceptive
        read_dataset['deceptive'] = (read_dataset['deceptive'] == 'deceptive').astype(int)
    elif prediction == 'polarity':
        read_dataset = read_dataset.drop(['deceptive'], axis=1)
        read_dataset['polarity'] = (read_dataset['polarity'] == 'negative').astype(int) # 0 = positive, 1 = negative
    else:
        sys.exit('Label not valid!')
    # Label columns are selected by position, so this relies on the label
    # column(s) coming before 'text' in the frame.
    cols = read_dataset.columns
    if prediction == 'multi':
        label_cols = list(cols[:2])
    elif prediction == 'deceptive' or prediction == 'polarity':
        label_cols = list(cols[:1])
    else:
        # Not reached for invalid values: the first if/elif chain already exits.
        sys.exit('"label_cols" variable not initialised!')
    num_labels = len(label_cols)
    print('Label columns: ', label_cols)
    # list() over the 2-D array from to_numpy() stores one numpy.ndarray per row.
    read_dataset['one_hot_labels'] = list(read_dataset[label_cols].to_numpy())
    # Disabled alternative: map each label pair to a scalar class 0-3.
    # def get_scalar(x):
    #     if (x == [0, 0]).all():
    #         return 0 # truthful positive
    #     elif (x == [0, 1]).all():
    #         return 1 # truthful negative
    #     elif (x == [1, 0]).all():
    #         return 2 # deceptive positive
    #     elif (x == [1, 1]).all():
    #         return 3 # deceptive negative
    # if prediction == 'multi':
    #     read_dataset['one_hot_labels'] = list(map(get_scalar, read_dataset['one_hot_labels']))
    # NOTE(review): for 'deceptive' / 'polarity' one of these two columns was
    # already dropped above, so this drop presumably raises KeyError with the
    # pandas default errors='raise' - verify.
    read_dataset = read_dataset.drop(['deceptive', 'polarity'], axis=1)
    # read_dataset = read_dataset.drop(['deceptive', 'polarity'], axis=1)
    # Resulting dataset: x | text | one_hot_labels where:
    # x is deceptive or polarity or [deceptive, polarity] where get_scalar function is applied to the latest alternative
    # one_hot_labels is [deceptive] or [polarity] or [deceptive, polarity]
    training_sizes = [0.5, 0.6, 0.7, 0.8, 0.9]
    training_data = {}
    validation_data = {}
    for size in training_sizes:
        training_data[size] = {}
        validation_data[size] = {}
        for i in range(dataset_recreations):
            print(type(read_dataset.one_hot_labels))
            # This is the call quoted in the traceback below: the stratify
            # argument is built from a column whose cells are numpy.ndarray
            # objects (see the 'one_hot_labels' assignment above).
            training, validation = train_test_split(read_dataset, train_size=size, shuffle=True, random_state=i, stratify=read_dataset['one_hot_labels'].to_numpy())
            training_data[size][i] = training
            validation_data[size][i] = validation
    return [read_dataset, num_labels, training_data, validation_data]
但我收到以下信息:
File "/home/[...]/main.py", line 1409, in preprocess_csv_deceptive_opinion
training, validation = train_test_split(read_dataset, train_size=size, shuffle=True, random_state=i, > stratify=read_dataset['one_hot_labels'].to_numpy())
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/model_selection/_split.py", line 2197, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/model_selection/_split.py", line 1793, in split
y = check_array(y, ensure_2d=False, dtype=None)
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/utils/validation.py", line 664, in check_array
allow_nan=force_all_finite == 'allow-nan')
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/utils/validation.py", line 110, in _assert_all_finite
if _object_dtype_isnan(X).any():
AttributeError: 'bool' object has no attribute 'any'
所用的数据集可在此处(here)获取。
问题是什么,我该如何解决?
这是由类型冲突以及 pandas 内部处理列表的方式造成的。下面的案例 3 重现了该错误,并说明了如何修复。
案例 1:train_test_split 可以处理用于分层的二维标签:
from sklearn.model_selection import train_test_split
import numpy as np
# Feature matrix: two rows of 16 values, transposed.
X = np.array([[-0.328,-0.192,0.260,0.408,0.356,0.472,-0.328,-0.184,-0.440,-0.312,0.348,0.432,0.464,0.532,-0.428,-0.292],[0.323,0.503,0.591,0.499,-0.313,-0.229,-0.213,-0.361,0.451,0.595,0.723,0.675,-0.445,-0.361,-0.317,-0.449]]).T
# 2-D label array; every row is one of [0,0], [0,1], [1,0], [1,1].
y = np.array([[0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1], [0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1]])
# The 2-D array itself is passed as `stratify`.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
print(X_train)
print(y_train)
# Printed output (abridged):
# [[ 0.408 0.499]
# ...
# [ 0.464 -0.445]]
# [[0 1]
# ...
# [1 0]]
案例 2:使用 pandas 数据框的类似示例:
from sklearn.model_selection import train_test_split
import pandas as pd
# Same values as a DataFrame; each cell of 'y' is a plain Python list.
read_data = pd.DataFrame({
    'x0': [-0.328,-0.192,0.260,0.408,0.356,0.472,-0.328,-0.184,-0.440,-0.312,0.348,0.432,0.464,0.532,-0.428,-0.292],
    'x1': [0.323,0.503,0.591,0.499,-0.313,-0.229,-0.213,-0.361,0.451,0.595,0.723,0.675,-0.445,-0.361,-0.317,-0.449],
    'y': [[0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1], [0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1]],
})
# The Series of lists is passed directly as `stratify`.
train, test = train_test_split(read_data, train_size=0.8, stratify=read_data['y'], random_state=0)
print(train)
print(test)
# Printed output (abridged):
# x0 x1 y
# 3 0.408 0.499 [0, 1]
# 1 -0.192 0.503 [0, 0]
# ...
# 12 0.464 -0.445 [1, 0]
# x0 x1 y
# 5 0.472 -0.229 [1, 0]
# 15 -0.292 -0.449 [1, 1]
# 11 0.432 0.675 [0, 1]
# 0 -0.328 0.323 [0, 0]
案例 3:(问题出现的地方)
下面是重现问题中 AttributeError 的最小示例:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
read_data = pd.DataFrame({
    'x0': [-0.328,-0.192,0.260,0.408,0.356,0.472,-0.328,-0.184,-0.440,-0.312,0.348,0.432,0.464,0.532,-0.428,-0.292],
    'x1': [0.323,0.503,0.591,0.499,-0.313,-0.229,-0.213,-0.361,0.451,0.595,0.723,0.675,-0.445,-0.361,-0.317,-0.449],
    'y': [[0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1], [0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1]],
})
# list() over a 2-D np.array yields its rows, so each cell of 'y_new' is a
# numpy.ndarray rather than a Python list.
read_data['y_new'] = list(np.array([value for value in read_data['y'].values]))
read_data = read_data.drop(['y'], axis=1)
# Stratifying on the column of ndarray cells is the call that reproduces the
# AttributeError from the question.
train, test = train_test_split(read_data, train_size=0.8, stratify=read_data['y_new'], random_state=0)
此处,y_new 列中的每个值都是 numpy.ndarray:
>>> type(read_data['y_new'][0])
<class 'numpy.ndarray'>
如何解决:
最直接的方法是确保 y_new 列中的值是列表,例如将该行改为:
read_data['y_new'] = [value for value in read_data['y'].values]
对于问题中给出的代码,这相当于把相应的行改为:
# .tolist() turns the selected label columns into nested Python lists, so every
# cell of 'one_hot_labels' is a plain list instead of a numpy.ndarray.
first_two_cols = list(read_dataset.columns[:2])
read_dataset['one_hot_labels'] = read_dataset[first_two_cols].values.tolist()
read_dataset = read_dataset.drop(['deceptive', 'polarity'], axis=1)
我正在尝试使用 train_test_split 函数,并提供一个标签数组用于分层;该数组是二维的,其元素只取 0 或 1(即 [0,0]、[0,1]、[1,0] 和 [1,1] 是四种可能的标签)。出于代码兼容性原因,我无法重命名标签(例如,重命名为 1、2、3、4)。
以下是使用 train_test_split 的函数代码:
def preprocess_csv_deceptive_opinion(path, prediction, dataset_recreations):
    """
    Load the deceptive-opinion csv, encode the requested label(s) as 0/1 and
    build stratified train/validation splits for several training sizes.

    :param path: path of the original csv file
    :param prediction: which label(s) to predict: 'deceptive', 'polarity' or
        'multi' (both); any other value ends the program through sys.exit
    :param dataset_recreations: it is the number of random seeds used; one
        split is made per seed (0 .. dataset_recreations - 1) and training size
    :return: the list [read_dataset, num_labels, training_data, validation_data],
        where training_data and validation_data are dicts indexed as
        [training size][seed]
    """
    read_dataset = pd.read_csv(path) # Dataset columns: deceptive,hotel,polarity,source,text
    # print('average sentence length: ', df.text.str.split().str.len().mean())
    # print('stdev sentence length: ', df.text.str.split().str.len().std())
    read_dataset = read_dataset.drop(['hotel', 'source'], axis=1)
    # Encode the kept label column(s) as integers and drop the unused one.
    if prediction == 'multi':
        # 00 = truthful positive, 01 = truthful negative, 10 = deceptive positive, 11 = deceptive negative
        read_dataset['deceptive'] = (read_dataset['deceptive'] == 'deceptive').astype(int)
        read_dataset['polarity'] = (read_dataset['polarity'] == 'negative').astype(int)
    elif prediction == 'deceptive':
        read_dataset = read_dataset.drop(['polarity'], axis=1) # 0 = truthful, 1 = deceptive
        read_dataset['deceptive'] = (read_dataset['deceptive'] == 'deceptive').astype(int)
    elif prediction == 'polarity':
        read_dataset = read_dataset.drop(['deceptive'], axis=1)
        read_dataset['polarity'] = (read_dataset['polarity'] == 'negative').astype(int) # 0 = positive, 1 = negative
    else:
        sys.exit('Label not valid!')
    # Label columns are selected by position, so this relies on the label
    # column(s) coming before 'text' in the frame.
    cols = read_dataset.columns
    if prediction == 'multi':
        label_cols = list(cols[:2])
    elif prediction == 'deceptive' or prediction == 'polarity':
        label_cols = list(cols[:1])
    else:
        # Not reached for invalid values: the first if/elif chain already exits.
        sys.exit('"label_cols" variable not initialised!')
    num_labels = len(label_cols)
    print('Label columns: ', label_cols)
    # list() over the 2-D array from to_numpy() stores one numpy.ndarray per row.
    read_dataset['one_hot_labels'] = list(read_dataset[label_cols].to_numpy())
    # Disabled alternative: map each label pair to a scalar class 0-3.
    # def get_scalar(x):
    #     if (x == [0, 0]).all():
    #         return 0 # truthful positive
    #     elif (x == [0, 1]).all():
    #         return 1 # truthful negative
    #     elif (x == [1, 0]).all():
    #         return 2 # deceptive positive
    #     elif (x == [1, 1]).all():
    #         return 3 # deceptive negative
    # if prediction == 'multi':
    #     read_dataset['one_hot_labels'] = list(map(get_scalar, read_dataset['one_hot_labels']))
    # NOTE(review): for 'deceptive' / 'polarity' one of these two columns was
    # already dropped above, so this drop presumably raises KeyError with the
    # pandas default errors='raise' - verify.
    read_dataset = read_dataset.drop(['deceptive', 'polarity'], axis=1)
    # read_dataset = read_dataset.drop(['deceptive', 'polarity'], axis=1)
    # Resulting dataset: x | text | one_hot_labels where:
    # x is deceptive or polarity or [deceptive, polarity] where get_scalar function is applied to the latest alternative
    # one_hot_labels is [deceptive] or [polarity] or [deceptive, polarity]
    training_sizes = [0.5, 0.6, 0.7, 0.8, 0.9]
    training_data = {}
    validation_data = {}
    for size in training_sizes:
        training_data[size] = {}
        validation_data[size] = {}
        for i in range(dataset_recreations):
            print(type(read_dataset.one_hot_labels))
            # This is the call quoted in the traceback below: the stratify
            # argument is built from a column whose cells are numpy.ndarray
            # objects (see the 'one_hot_labels' assignment above).
            training, validation = train_test_split(read_dataset, train_size=size, shuffle=True, random_state=i, stratify=read_dataset['one_hot_labels'].to_numpy())
            training_data[size][i] = training
            validation_data[size][i] = validation
    return [read_dataset, num_labels, training_data, validation_data]
但我收到以下信息:
File "/home/[...]/main.py", line 1409, in preprocess_csv_deceptive_opinion
training, validation = train_test_split(read_dataset, train_size=size, shuffle=True, random_state=i, > stratify=read_dataset['one_hot_labels'].to_numpy())
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/model_selection/_split.py", line 2197, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/model_selection/_split.py", line 1793, in split
y = check_array(y, ensure_2d=False, dtype=None)
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/utils/validation.py", line 664, in check_array
allow_nan=force_all_finite == 'allow-nan')
File "/home/[...]/.conda/envs/RC_37/lib/python3.7/site-packages/sklearn/utils/validation.py", line 110, in _assert_all_finite
if _object_dtype_isnan(X).any():
AttributeError: 'bool' object has no attribute 'any'
所用的数据集可在此处(here)获取。
问题是什么,我该如何解决?
这是由类型冲突以及 pandas 内部处理列表的方式造成的。下面的案例 3 重现了该错误,并说明了如何修复。
案例 1:train_test_split 可以处理用于分层的二维标签:
from sklearn.model_selection import train_test_split
import numpy as np
# Feature matrix: two rows of 16 values, transposed.
X = np.array([[-0.328,-0.192,0.260,0.408,0.356,0.472,-0.328,-0.184,-0.440,-0.312,0.348,0.432,0.464,0.532,-0.428,-0.292],[0.323,0.503,0.591,0.499,-0.313,-0.229,-0.213,-0.361,0.451,0.595,0.723,0.675,-0.445,-0.361,-0.317,-0.449]]).T
# 2-D label array; every row is one of [0,0], [0,1], [1,0], [1,1].
y = np.array([[0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1], [0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1]])
# The 2-D array itself is passed as `stratify`.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
print(X_train)
print(y_train)
# Printed output (abridged):
# [[ 0.408 0.499]
# ...
# [ 0.464 -0.445]]
# [[0 1]
# ...
# [1 0]]
案例 2:使用 pandas 数据框的类似示例:
from sklearn.model_selection import train_test_split
import pandas as pd
# Same values as a DataFrame; each cell of 'y' is a plain Python list.
read_data = pd.DataFrame({
    'x0': [-0.328,-0.192,0.260,0.408,0.356,0.472,-0.328,-0.184,-0.440,-0.312,0.348,0.432,0.464,0.532,-0.428,-0.292],
    'x1': [0.323,0.503,0.591,0.499,-0.313,-0.229,-0.213,-0.361,0.451,0.595,0.723,0.675,-0.445,-0.361,-0.317,-0.449],
    'y': [[0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1], [0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1]],
})
# The Series of lists is passed directly as `stratify`.
train, test = train_test_split(read_data, train_size=0.8, stratify=read_data['y'], random_state=0)
print(train)
print(test)
# Printed output (abridged):
# x0 x1 y
# 3 0.408 0.499 [0, 1]
# 1 -0.192 0.503 [0, 0]
# ...
# 12 0.464 -0.445 [1, 0]
# x0 x1 y
# 5 0.472 -0.229 [1, 0]
# 15 -0.292 -0.449 [1, 1]
# 11 0.432 0.675 [0, 1]
# 0 -0.328 0.323 [0, 0]
案例 3:(问题出现的地方)
下面是重现问题中 AttributeError 的最小示例:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
read_data = pd.DataFrame({
    'x0': [-0.328,-0.192,0.260,0.408,0.356,0.472,-0.328,-0.184,-0.440,-0.312,0.348,0.432,0.464,0.532,-0.428,-0.292],
    'x1': [0.323,0.503,0.591,0.499,-0.313,-0.229,-0.213,-0.361,0.451,0.595,0.723,0.675,-0.445,-0.361,-0.317,-0.449],
    'y': [[0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1], [0,0], [0,0], [0,1], [0,1], [1,0], [1,0], [1,1], [1,1]],
})
# list() over a 2-D np.array yields its rows, so each cell of 'y_new' is a
# numpy.ndarray rather than a Python list.
read_data['y_new'] = list(np.array([value for value in read_data['y'].values]))
read_data = read_data.drop(['y'], axis=1)
# Stratifying on the column of ndarray cells is the call that reproduces the
# AttributeError from the question.
train, test = train_test_split(read_data, train_size=0.8, stratify=read_data['y_new'], random_state=0)
此处,y_new 列中的每个值都是 numpy.ndarray:
>>> type(read_data['y_new'][0])
<class 'numpy.ndarray'>
如何解决:
最直接的方法是确保 y_new 列中的值是列表,例如将该行改为:
read_data['y_new'] = [value for value in read_data['y'].values]
对于问题中给出的代码,这相当于把相应的行改为:
# .tolist() turns the selected label columns into nested Python lists, so every
# cell of 'one_hot_labels' is a plain list instead of a numpy.ndarray.
first_two_cols = list(read_dataset.columns[:2])
read_dataset['one_hot_labels'] = read_dataset[first_two_cols].values.tolist()
read_dataset = read_dataset.drop(['deceptive', 'polarity'], axis=1)