Scikit Learn fit()：用序列拟合设置数组元素

Question

我正在尝试在数据帧上调用 scikit learn fit 函数，其中每列的元素都是 numpy 数组。但是，我收到错误“使用序列设置数组元素”，这大概是因为我试图在数组数据帧而不是标量值上调用 fit。我该如何解决这个问题？非常感谢您的帮助。

这是我的代码。您可以在此处找到我正在使用的数据：https://competitions.codalab.org/competitions/21163

training_data = pd.read_csv('/train.tsv', sep='\t')
testing_data = pd.read_csv('/dev.tsv', sep='\t')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True,max_length=1024)
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

# These are used to map the data to their appropriate column on each pass
pomt_train_x = pd.DataFrame(columns=["claim", "reason", "category", "speaker", "checker", "tags", "claim entities", "article title"])
feature_dict = {1: "claim", 4: "reason", 5: "category", 6: "speaker", 7: "checker", 8: "tags", 9: "claim entities", 10: "article title"}

# Sort the data appropriately.
for i, data in enumerate(training_data[training_data.columns].to_numpy()):
    if 'pomt' in data[0]:
        appended_data = {}
        for j, sentence in enumerate(data):
            if j in feature_dict:
                inputs = tokenizer(str(sentence), return_tensors="pt", max_length=512, pad_to_max_length=True).to(device)
                outputs = model(**inputs)
                appended_data[feature_dict[j]] = outputs.last_hidden_state[:,0][0].cpu().detach().numpy()
        pomt_train_x = pomt_train_x.append(appended_data, ignore_index=True)
        print(f"{i + 1} out of {training_data.index.stop} from training")

count = 0
# append testing data to training data
for i, data in enumerate(testing_data[testing_data.columns].to_numpy()):
    if 'pomt' in data[0]:
        appended_data = {}
        for j, sentence in enumerate(data):
            if j in feature_dict:
                inputs = tokenizer(str(sentence), return_tensors="pt", max_length=512, pad_to_max_length=True).to(device)
                outputs = model(**inputs)
                appended_data[feature_dict[j]] = outputs.last_hidden_state[:,0][0].cpu().detach().numpy()
        pomt_train_x = pomt_train_x.append(appended_data, ignore_index=True)
        print(f"{i + 1} out of {testing_data.index.stop} from testing")
        count += 1

# Map the possible labels to an emotion
positive_set = set(['half-true', 'correct attribution!', 'correct', 'determination: barely true', 'factscan score: true',
                'correct attribution', 'mostly true', 'mostly-correct', 'truth!', 'partially true', 'half true',
                'mostly truth!', 'determination: true', 'true messages', 'authorship confirmed!', 'verdict: true',
                'mostly_true', 'determination: mostly true', 'confirmed authorship!', 'conclusion: accurate', 'accurate',
                'true', 'partly true', 'fact', 'full flop', 'in-the-green', 'verified'])
negative_set = set({'fake news', 'verdict: false', '3 pinnochios', 'fiction!', 'bogus warning', 'we rate this claim false',
                'determination: false', 'disputed!', 'false', 'fiction', 'a lot of baloney', '2 pinnochios', 'some baloney',
                'mostly_false', 'cherry picks', 'miscaptioned', 'misleading!', 'misleading recommendations', 'mostly fiction!',
                'mostly false', 'a little baloney', 'fiction! & satire!', 'conclusion: false', 'rating: false',
                'determination: misleading', 'promise broken', '4 pinnochios', 'misleading', 'promise kept',
                'misattributed', 'fake', 'previously truth! now resolved!','incorrect attribution!', 'incorrect',
                'spins the facts', 'determination: a stretch', 'factscan score: misleading', 'pants on fire!',
                'factscan score: false', 'exaggerates', 'outdated', 'facebook scams', 'unsupported', 'opinion!',
                'verdict: unsubstantiated', 'scam', 'virus!', 'no flip', 'scam!', 'unverified', 'distorts the facts', 'outdated!'
                'understated', 'no evidence', 'unproven!', 'inaccurate attribution!', 'statirical reports', 'unproven', 'exaggerated', 
                'determination: huckster propaganda', 'grass roots movement!', 'commentary!', 'in-the-red', 'unsubstantiated messages',})
neutral_set = set({'truth! & fiction!', 'conclusion: unclear', '1', 'unobservable', 'needs context', 'truth! & disputed!', 'half flip',
               '0',  'in-between', '4', 'None', '2', 'none',  'investigation pending!','not the whole story', '10','in the works',
               'truth! & misleading!', '3',  'mixture', 'not yet rated', 'legend', 'stalled', 'truth! & unproven!', 'truth! & outdated!',
               'compromise'})

# Read in the labels for the appropriate data
pomt_train_y = pd.DataFrame(columns=["label"])

sign_to_append = 0

for i, data in enumerate(training_data[training_data.columns].to_numpy()):
    if 'pomt' in data[0]:
        if data[2] in positive_set:
            sign_to_append = 1
        elif data[2] in negative_set:
            sign_to_append = -1
        else:
            sign_to_append = 0
        pomt_train_y = pomt_train_y.append({'label':sign_to_append}, ignore_index=True)
        print(f"{i + 1} out of {training_data.index.stop} from training")

# append testing data to training data
for i, data in enumerate(testing_data[testing_data.columns].to_numpy()):
    if 'pomt' in data[0]:
        if data[2] in positive_set:
            sign_to_append = 1
        elif data[2] in negative_set:
            sign_to_append = -1
        else:
            sign_to_append = 0
        pomt_train_y = pomt_train_y.append({'label':sign_to_append}, ignore_index=True)
        print(f"{i + 1} out of {testing_data.index.stop} from testing")

pomt_X_train, pomt_X_test, pomt_Y_train, pomt_Y_test = train_test_split(pomt_train_x, pomt_train_y, test_size= (count / pomt_train_x.shape[0]), stratify=pomt_train_y)
pomt_Y_train = pomt_Y_train.astype("int")
pomt_Y_test = pomt_Y_test.astype("int")

# One Vs. One Multiclass Classification
clf = OneVsOneClassifier(SVC(C = 1, verbose=True))

# Fit to Training Data
clf.fit(pomt_X_train, pomt_Y_train)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
<ipython-input-22-3314e23093e3> in <module>()
      1 # Fit to Training Data
----> 2 clf.fit(pomt_X_train.squeeze(), pomt_Y_train)
      3 
      4 # Training data accuracy
      5 X_train_prediction = clf.predict(pomt_X_train)

4 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in __array__(self, dtype)
   1991 
   1992     def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-> 1993         return np.asarray(self._values, dtype=dtype)
   1994 
   1995     def __array_wrap__(

ValueError: setting an array element with a sequence.

Answer 1

我想出了自己该怎么做。我基本上只是在数据框中创建了一列来反映列表的每个元素，而不是每个列表本身。这有点不直观，但它确实有效。

Scikit Learn fit()：用序列拟合设置数组元素

Scikit Learn fit(): Setting an array element with a sequence fit

python

numpy

pandas

tensorflow

bert-language-model