如何在 CNN 中用预训练词嵌入替换 Keras 嵌入层

How to replace the Keras embedding layer with pre-trained word embeddings in a CNN

我目前正在研究如何将 CNN 用于文本分类,并在 Stack Overflow 上找到了一些使用 Keras 嵌入层的代码。

我已经运行了使用 Keras 嵌入层的代码,现在想测试换成预训练嵌入后会有什么效果。我已经通过 gensim 的 API 下载了 word2vec 模型,但不知道接下来该如何修改代码。

我的问题是如何用 word2vec 模型或 Glove 等预训练嵌入替换 keras 嵌入层?

这是代码

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard

# Load the IMDB reviews; top_words is handed to load_data as num_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Run both splits through pad_sequences with the same maxlen so every
# review reaching the model has the length the Embedding layer is told about
max_review_length = 1600
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)

# Network definition, in order:
#   Embedding (no pre-trained weights supplied)
#   -> 3 x Convolution1D (64, 32, 16 filters; kernel size 3; 'same' padding)
#   -> Flatten -> Dropout(0.2) -> Dense(180, sigmoid) -> Dropout(0.2)
#   -> Dense(1, sigmoid) as the binary output
embedding_vecor_length = 300
model = Sequential([
    Embedding(top_words, embedding_vecor_length, input_length=max_review_length),
    Convolution1D(64, 3, padding='same'),
    Convolution1D(32, 3, padding='same'),
    Convolution1D(16, 3, padding='same'),
    Flatten(),
    Dropout(0.2),
    Dense(180, activation='sigmoid'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Compile for binary classification, tracking accuracy
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# TensorBoard callback writing its logs (including the graph) to ./logs
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)

# Train for 3 epochs in batches of 64
model.fit(
    X_train,
    y_train,
    epochs=3,
    callbacks=[tensorBoardCallback],
    batch_size=64,
)

# Evaluate on the test split; scores[1] is the 'accuracy' metric requested above
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

下面的代码会读取包含预训练权重的文本文件,将每个单词及其对应的向量存入字典,然后根据分词器的词汇表把这些向量映射到一个新的嵌入矩阵中。

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from tensorflow import keras
import itertools
import numpy as np


# Load the IMDB reviews; top_words is handed to load_data as num_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# word -> index mapping for the same dataset. Fetched through the same
# `imdb` module that loaded the reviews instead of tensorflow.keras, so the
# data and its vocabulary always come from one Keras installation.
word_index = imdb.get_word_index()

embedding_vecor_length = 300  # same as the embeds to be loaded below
embeddings_dictionary = dict()

# Read the GloVe text file into {token: vector}.
# The file is opened in binary mode, so every token (dictionary key) is a
# bytes object; lookups must therefore encode the word first.
# The with-block guarantees the handle is closed even if a line fails to parse.
with open('./embeds/glove.6B.300d.txt', 'rb') as glove_file:
    for line in glove_file:
        records = line.split()  # separates each line on whitespace
        if not records:
            # blank line: nothing to parse, and records[0] would raise
            continue
        word = records[0]  # the first element is the word
        vector_dimensions = np.asarray(
            records[1:], dtype='float32')  # the rest are the weights
        # storing in dictionary
        embeddings_dictionary[word] = vector_dimensions

# Build the (top_words x embedding_vecor_length) weight matrix for the
# Embedding layer: row i holds the pre-trained vector of the word whose id in
# the training data is i. Words without a pre-trained vector keep a zero row.
#
# imdb.get_word_index() returns raw frequency ranks, but imdb.load_data()
# shifts every word id by its `index_from` argument (default 3) to reserve
# the low ids for padding / start-of-sequence / out-of-vocabulary. The same
# shift has to be applied here, otherwise every vector ends up three rows
# away from the id the model actually sees.
imdb_index_from = 3  # default `index_from` of imdb.load_data()

embeddings_matrix = np.zeros((top_words, embedding_vecor_length))
for word, index in word_index.items():
    row = index + imdb_index_from
    if row >= top_words:
        # load_data(num_words=top_words) only keeps shifted ids below
        # top_words, so this word never appears in the data
        continue
    # the GloVe dictionary is keyed by bytes (file read in binary mode)
    embedding_vector = embeddings_dictionary.get(word.encode('utf-8'))
    if embedding_vector is not None:
        embeddings_matrix[row] = embedding_vector

# Run both splits through pad_sequences with the same maxlen so every
# review reaching the model has the length the Embedding layer is told about
max_review_length = 1600
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)


# Network definition, in order:
#   Embedding seeded with embeddings_matrix (trainable=True)
#   -> 3 x Convolution1D (64, 32, 16 filters; kernel size 3; 'same' padding)
#   -> Flatten -> Dropout(0.2) -> Dense(180, sigmoid) -> Dropout(0.2)
#   -> Dense(1, sigmoid) as the binary output
model = Sequential([
    Embedding(
        top_words,
        embedding_vecor_length,
        input_length=max_review_length,
        name="embeddinglayer",
        weights=[embeddings_matrix],
        trainable=True,
    ),
    Convolution1D(64, 3, padding='same'),
    Convolution1D(32, 3, padding='same'),
    Convolution1D(16, 3, padding='same'),
    Flatten(),
    Dropout(0.2),
    Dense(180, activation='sigmoid'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Compile for binary classification, tracking accuracy
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# TensorBoard callback writing its logs (including the graph) to ./logs
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)

# Train for 3 epochs in batches of 64
model.fit(
    X_train,
    y_train,
    epochs=3,
    callbacks=[tensorBoardCallback],
    batch_size=64,
)

# Evaluate on the test split; scores[1] is the 'accuracy' metric requested above
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))