Keras error - ValueError: could not convert string to float

Keras error - ValueError: could not convert string to float

我已经查看过处理类似问题的提问。但是,它们无法解决我的具体问题。因此,对于解决我面临的以下问题的任何建议,我将不胜感激。

我正在尝试为文本分类问题实现 RNN 模型。我的 triple.csv 文件中包含句子 (triple) 和类别标签 [0, 1] (truth)。

triple.csv 文件的示例

triple,truth
sportsteam hawks teamplaysincity city atlanta,1
stadiumoreventvenue hondacenter stadiumlocatedincity city anaheim,1
sportsteam ducks teamplaysincity city anaheim,1
sportsteam n1985chicagobears teamplaysincity city chicago,1
...

我正在尝试使用 RNN 及其 word2vec 嵌入来训练句子(三元组)。但是,我不断收到以下错误。

ValueError: could not convert string to float: 'sportsleague nfl leaguestadiums stadiumoreventvenue heinzfield'

我的主要代码

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import gensim
import pandas as pd
import os
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec, KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from termcolor import colored
from keras.utils import to_categorical

nltk.download('stopwords')

# Load the labelled triples: the 'triple' column holds the sentence text.
df = pd.read_csv('data/triple.csv')
lines = df['triple'].values.tolist()

# Loop invariants: build the punctuation-stripping table and the stop-word
# set once instead of rebuilding both for every sentence.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

# Tokenize every sentence into a cleaned list of words.
triple_lines = []
for line in lines:
    tokens = [w.lower() for w in word_tokenize(line)]
    stripped = [w.translate(table) for w in tokens]
    # Keep only purely alphabetic tokens that are not English stop words.
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    triple_lines.append(words)

# Report how many tokenized sentences are available for training.
num_triples = len(triple_lines)
print(colored(num_triples, 'green'))

# Train word2vec embeddings on the tokenized sentences; min_count=1 keeps
# every token, including those that occur only once.
EMBEDDING_DIM = 100
model = Word2Vec(
    sentences=triple_lines,
    size=EMBEDDING_DIM,
    window=5,
    workers=4,
    min_count=1,
)
words = list(model.wv.vocab)
vocab_report = 'Vocabulary size: %d' % len(words)
print(colored(vocab_report, 'green'))

# Export the learned vectors as plain text so they can be reloaded later.
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

# Read the exported vectors back into a {token: vector} lookup table.
embedding_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join('', 'embedding_word2vec.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        # Parse the coefficients as numbers up front rather than keeping
        # an array of strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(triple_lines)
sequences = tokenizer_obj.texts_to_sequences(triple_lines)

word_index = tokenizer_obj.word_index
print(colored('Found %s unique tokens.'% len(word_index),'magenta'))

# Pad (or truncate) every sequence to a fixed length.
max_length = 9

triple_pad = pad_sequences(sequences, maxlen=max_length)
# The targets are the class labels in the 'truth' column. Reading the
# 'triple' column here instead feeds the raw sentences to model.fit() as
# labels and raises "ValueError: could not convert string to float".
truth = df['truth'].values
print('Shape of triple tensor: ', triple_pad.shape)
print('Shape of truth tensor: ', truth.shape)

# Map the loaded word2vec vectors onto the tokenizer's word_index vocabulary
# to build the weight matrix for the Embedding layer.

# One extra row beyond the vocabulary size; presumably this covers index 0,
# which the Keras Tokenizer does not assign to any word -- confirm against
# the Tokenizer documentation.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid rows are 0..num_words-1, so an index equal to num_words must be
    # skipped as well (the previous `>` test let it through).
    if i >= num_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Words missing from embedding_index keep their all-zero row.

print(colored(num_words,'cyan'))

# Define Model: frozen pre-trained embeddings -> GRU -> sigmoid output for
# the binary label.
model = Sequential()
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)

model.add(embedding_layer)
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary() prints the table itself and returns None, so wrapping it in
# print(colored(...)) emitted an uncolored summary followed by a colored
# "None". Passing print_fn colors each summary line instead.
model.summary(print_fn=lambda summary_line: print(colored(summary_line, 'cyan')))

# Split the data into a training set and a validation set.
VALIDATION_SPLIT = 0.2

# Shuffle samples and labels with the same permutation so they stay aligned.
indices = np.arange(triple_pad.shape[0])
np.random.shuffle(indices)
triple_pad = triple_pad[indices]
truth = truth[indices]
num_validation_samples = int(VALIDATION_SPLIT * triple_pad.shape[0])
# Slice with an explicit boundary: negative slicing breaks when
# num_validation_samples is 0 ([:-0] is empty and [-0:] is everything),
# which would leave the training set empty on very small datasets.
split_at = triple_pad.shape[0] - num_validation_samples

X_train_pad = triple_pad[:split_at]
y_train = truth[:split_at]
X_test_pad = triple_pad[split_at:]
y_test = truth[split_at:]

print('Shape of X_train_pad tensor: ',X_train_pad.shape)
print('Shape of y_train tensor: ',y_train.shape)
print('Shape of X_test_pad tensor: ',X_test_pad.shape)
print('Shape of y_test tensor: ',y_test.shape)

print(colored('Training...','green'))
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

任何有关如何解决此问题的帮助将不胜感激。

我在将字符串 y_train 传递给 model.fit() 时遇到了这个错误。

我没有将布尔真值定义为目标类别值,而是将三元组定义为目标类别,这样就把字符串传递到了 model.fit() 中,如下所示。

truth = df['triple'].values

因此,只需按如下方式修改上面的行即可解决此问题。

truth = df['truth'].values

我竟然漏掉了这么一个琐碎的细节。我真傻!