Text Classification with word2vec
I am working on text classification and I plan to use word2vec word embeddings. I have trained the word2vec model with the gensim module.
I have tried several options, but I get the error message "'xyz' is not in vocabulary" and I cannot find my mistake.
Text processing
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    text = text.translate(string.punctuation)
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in text]
    text = " ".join(lemmatized_words)
    return text
data['text'] = data['text'].map(lambda x: clean_text(x))
Please help me resolve the problem.
Defining the corpus
def build_corpus(data):
    """Creates a list of lists containing words from each sentence"""
    corpus = []
    for col in ['text']:
        for sentence in data[col].iteritems():
            word_list = sentence[1].split(" ")
            corpus.append(word_list)
    return corpus
corpus = build_corpus(data)
Word2vec model
from gensim.models import word2vec
model = word2vec.Word2Vec(corpus, size=100, window=20, min_count=20, workers=12, sg=1)
words = list(model.wv.vocab)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

tokenizer = Tokenizer()
X = data.text
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X = pad_sequences(sequences, maxlen=10000)

embedding_vector_size = 100
vocab_size = len(words)
embedding_matrix = np.zeros((vocab_size, embedding_vector_size))
for index, word in enumerate(words):
    embedding_vector = model.wv[word]
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
Now I am using the embeddings I created in a downstream classification task.
Classification model
labels = data['Priority']
There are two priority classes that I want to classify.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42)
I am using the following network for classification:
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense

model3 = Sequential()
model3.add(Embedding(input_dim=vocab_size, output_dim=embedding_vector_size, input_length=max_len, weights=[embedding_matrix]))
model3.add(SpatialDropout1D(0.7))
model3.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model3.add(Dense(2, activation='softmax'))
model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model3.summary())
I get an error here:
'ValueError: "input_length" is 10000, but received input has shape (None, 3)'
Please help me sort it out. Thank you.
Not all of the words from corpus are retained in the word2vec model: with min_count=20, any word that occurs fewer than 20 times is dropped from the vocabulary, and looking such a word up later raises "'xyz' is not in vocabulary".

Replace:

vocab_size = len(tokenizer.word_index) + 1

with:

vocab_size = len(words)

and replace:

for word, i in tokenizer.word_index.items():

with:

for i, word in enumerate(words):

This ensures that your embedding matrix only contains words that are in the model.
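With both replacements applied, building the embedding matrix would look roughly like the sketch below. It reuses model, words, and embedding_vector_size from your own code, nothing new is introduced; since words is taken from the model's own vocabulary, the lookup can no longer fail:

import numpy as np

# Iterate only over the words the word2vec model actually kept
# (anything below min_count=20 was dropped during training).
vocab_size = len(words)
embedding_matrix = np.zeros((vocab_size, embedding_vector_size))
for i, word in enumerate(words):
    # Safe lookup: every entry of `words` is in the model's vocabulary.
    embedding_matrix[i] = model.wv[word]

Keep in mind that the row order here follows the word2vec vocabulary rather than tokenizer.word_index, so the indices you feed into the Embedding layer must be produced with the same mapping.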