pandas 数据框中多标签分类数据集的迭代拆分
Iterative split of multilabel classification dataset in pandas dataframe
我有一个数据集,其中包含一个字符串类型的文本列,以及多个取值为 1 或 0(表示样本是否属于该类别)的标签列。我想使用 skmultilearn 对这些数据进行分层拆分,使各标签在拆分后的子集中分布均衡,但出现了以下错误:
KeyError: 'key of type tuple not found and not a MultiIndex'
这是我的代码:
import pandas as pd
from skmultilearn.model_selection import iterative_train_test_split
# Load the full dataset; once "text" is popped below, y holds only the
# remaining (0/1 label) columns.
y = pd.read_csv("dataset.csv")
# pop() removes the "text" column from y in place and returns it.
x = y.pop("text")
# x and y are still pandas objects here; this is the call reported to raise
# the KeyError quoted above.
# NOTE(review): iterative_train_test_split presumably expects array inputs
# (e.g. x.to_numpy().reshape(-1, 1) and y.to_numpy()) — confirm against the
# scikit-multilearn documentation.
x_train, x_test, y_train, y_test = iterative_train_test_split(x, y, test_size=0.1)
下面的做法对我有效(训练集/验证集/测试集按 98/1/1 的比例划分):
import os
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
def _stratified_split(x, y, test_size, random_state=42):
    """Split ``x``/``y`` once using multilabel-stratified shuffling.

    Args:
        x: ndarray of samples, indexable by an integer index array.
        y: ndarray of label columns with one row per sample in ``x``.
        test_size: Fraction of rows assigned to the second part.
        random_state: Seed passed to the splitter.

    Returns:
        Tuple ``(x_first, x_second, y_first, y_second)``.
    """
    # Only one split is ever consumed, so request exactly one instead of
    # generating several and keeping whichever happened to come last.
    msss = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    first_index, second_index = next(msss.split(x, y))
    return x[first_index], x[second_index], y[first_index], y[second_index]


def _to_frame(x, y, tag_names):
    """Rebuild a DataFrame from label rows ``y`` with ``x`` as first column."""
    frame = pd.DataFrame(data=y, columns=tag_names)
    # The text column is re-inserted under the name "snippet".
    frame.insert(0, "snippet", x)
    return frame


def main():
    """Split ``dataset.csv`` into train/validation/test parts (98/1/1).

    The CSV is expected to contain a ``"text"`` column; every other column is
    treated as a label column. 2% of the rows are held out first, then that
    hold-out is halved into test and validation sets.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of DataFrames, each with the
        text in a leading ``"snippet"`` column followed by the label columns.
    """
    # load dataset
    y = pd.read_csv("dataset.csv")
    x = y.pop("text")

    # save tag names to reuse them later for creating pandas DataFrames
    tag_names = y.columns

    # Data has to be in ndarray format
    y = y.to_numpy()
    x = x.to_numpy()

    # split to train / test (98% / 2%)
    x_train, x_test_temp, y_train, y_test_temp = _stratified_split(
        x, y, test_size=0.02
    )

    # make some memory space: the full arrays are no longer needed
    del x
    del y

    # split the 2% hold-out evenly into test / validation
    x_test, x_val, y_test, y_val = _stratified_split(
        x_test_temp, y_test_temp, test_size=0.5
    )

    df_train = _to_frame(x_train, y_train, tag_names)
    df_val = _to_frame(x_val, y_val, tag_names)
    df_test = _to_frame(x_test, y_test, tag_names)

    # Hand the frames back so callers can actually use (or save) them.
    return df_train, df_val, df_test


if __name__ == "__main__":
    main()
我有一个数据集,其中包含一个字符串类型的文本列,以及多个取值为 1 或 0(表示样本是否属于该类别)的标签列。我想使用 skmultilearn 对这些数据进行分层拆分,使各标签在拆分后的子集中分布均衡,但出现了以下错误:
KeyError: 'key of type tuple not found and not a MultiIndex'
这是我的代码:
import pandas as pd
from skmultilearn.model_selection import iterative_train_test_split
# Load the full dataset; once "text" is popped below, y holds only the
# remaining (0/1 label) columns.
y = pd.read_csv("dataset.csv")
# pop() removes the "text" column from y in place and returns it.
x = y.pop("text")
# x and y are still pandas objects here; this is the call reported to raise
# the KeyError quoted above.
# NOTE(review): iterative_train_test_split presumably expects array inputs
# (e.g. x.to_numpy().reshape(-1, 1) and y.to_numpy()) — confirm against the
# scikit-multilearn documentation.
x_train, x_test, y_train, y_test = iterative_train_test_split(x, y, test_size=0.1)
下面的做法对我有效(训练集/验证集/测试集按 98/1/1 的比例划分):
import os
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
def _stratified_split(x, y, test_size, random_state=42):
    """Split ``x``/``y`` once using multilabel-stratified shuffling.

    Args:
        x: ndarray of samples, indexable by an integer index array.
        y: ndarray of label columns with one row per sample in ``x``.
        test_size: Fraction of rows assigned to the second part.
        random_state: Seed passed to the splitter.

    Returns:
        Tuple ``(x_first, x_second, y_first, y_second)``.
    """
    # Only one split is ever consumed, so request exactly one instead of
    # generating several and keeping whichever happened to come last.
    msss = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    first_index, second_index = next(msss.split(x, y))
    return x[first_index], x[second_index], y[first_index], y[second_index]


def _to_frame(x, y, tag_names):
    """Rebuild a DataFrame from label rows ``y`` with ``x`` as first column."""
    frame = pd.DataFrame(data=y, columns=tag_names)
    # The text column is re-inserted under the name "snippet".
    frame.insert(0, "snippet", x)
    return frame


def main():
    """Split ``dataset.csv`` into train/validation/test parts (98/1/1).

    The CSV is expected to contain a ``"text"`` column; every other column is
    treated as a label column. 2% of the rows are held out first, then that
    hold-out is halved into test and validation sets.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of DataFrames, each with the
        text in a leading ``"snippet"`` column followed by the label columns.
    """
    # load dataset
    y = pd.read_csv("dataset.csv")
    x = y.pop("text")

    # save tag names to reuse them later for creating pandas DataFrames
    tag_names = y.columns

    # Data has to be in ndarray format
    y = y.to_numpy()
    x = x.to_numpy()

    # split to train / test (98% / 2%)
    x_train, x_test_temp, y_train, y_test_temp = _stratified_split(
        x, y, test_size=0.02
    )

    # make some memory space: the full arrays are no longer needed
    del x
    del y

    # split the 2% hold-out evenly into test / validation
    x_test, x_val, y_test, y_val = _stratified_split(
        x_test_temp, y_test_temp, test_size=0.5
    )

    df_train = _to_frame(x_train, y_train, tag_names)
    df_val = _to_frame(x_val, y_val, tag_names)
    df_test = _to_frame(x_test, y_test, tag_names)

    # Hand the frames back so callers can actually use (or save) them.
    return df_train, df_val, df_test


if __name__ == "__main__":
    main()