在 Flask 应用中反序列化（unpickle）来自 Jupyter Notebook 的实例

Unpickle instance from Jupyter Notebook in Flask App

我为 word2vec 矢量化创建了一个 class,它运行良好。 但是当我创建一个模型 pickle 文件并在 Flask 应用程序中使用该 pickle 文件时,我收到如下错误:

AttributeError: module '__main__' has no attribute 'GensimWord2VecVectorizer'

我正在 Google Colab 上创建模型。

Jupyter Notebook 中的代码:

# Word2Vec Model
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec

class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer wrapping gensim's Word2Vec.

    ``fit`` trains a Word2Vec model on tokenized documents; ``transform``
    maps each document to the mean of its in-vocabulary word vectors
    (shape ``(n_documents, size)``).

    NOTE(review): the parameter names (``size``, ``iter``, ...) and the
    ``self.model_.wv.vocab`` lookup match the gensim < 4.0 API; gensim
    >= 4.0 renamed several of these (``size`` -> ``vector_size``,
    ``iter`` -> ``epochs``) — confirm the installed gensim version.
    """

    def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        # Every constructor argument is stored under the *same* attribute
        # name: sklearn's BaseEstimator.get_params/set_params introspect
        # the __init__ signature, so these names must not be changed.
        self.size = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.iter = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def fit(self, X, y=None):
        """Train a Word2Vec model on X (an iterable of token lists).

        The fitted model is stored as ``self.model_`` (trailing
        underscore = sklearn convention for fitted attributes).
        ``y`` is accepted but ignored, as required by the sklearn API.
        """
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        """Return one mean word-vector per document in X."""
        X_embeddings = np.array([self._get_embedding(words) for words in X])
        return X_embeddings

    def _get_embedding(self, words):
        """Average the embeddings of the words known to the model.

        Out-of-vocabulary words are skipped; a document with no known
        words maps to an all-zeros vector of length ``self.size``.
        """
        valid_words = [word for word in words if word in self.model_.wv.vocab]
        if valid_words:
            embedding = np.zeros((len(valid_words), self.size), dtype=np.float32)
            for idx, word in enumerate(valid_words):
                embedding[idx] = self.model_.wv[word]

            return np.mean(embedding, axis=0)
        else:
            return np.zeros(self.size)

# column transformer
from sklearn.compose import ColumnTransformer

# Apply the vectorizer to the 'STATUS' text column only; every other
# input column is discarded (remainder='drop').
ct = ColumnTransformer([
    ('step1', GensimWord2VecVectorizer(), 'STATUS')
], remainder='drop')

# Create Model
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle
import numpy as np
import dill
import torch
# ##########
# SVC - support vector classifier
# ##########
# defining parameter range
# Grid-search space for the SVC; probability=True enables predict_proba
# on the refit best estimator.
hyperparameters = {'C': [0.1, 1],
                   'gamma': [1, 0.1],
                   'kernel': ['rbf'],
                   'probability': [True]}
# Pipeline: vectorize the 'STATUS' column, then grid-search an SVC.
model_sv = Pipeline([
    ('column_transformers', ct),
    ('model', GridSearchCV(SVC(), hyperparameters,
                           refit=True, verbose=3)),
])
# NOTE(review): X_train / y_train are defined elsewhere in the notebook.
model_sv_cEXT = model_sv.fit(X_train, y_train['cEXT'])
# Save the trained cEXT - SVM Model.
import joblib
joblib.dump(model_sv_cEXT, 'model_Word2Vec_sv_cEXT.pkl')

Flask 应用程序中的代码:

# Word2Vec
# Raises AttributeError: the pickle records the class as
# __main__.GensimWord2VecVectorizer (the notebook's module), which does
# not exist in the Flask process.
model_EXT_WV_SV = joblib.load('utility/model/MachineLearning/SVM/model_Word2Vec_sv_cEXT.pkl')

我试图将相同的 class 复制到我的 Flask 文件中,但它也不起作用。

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec

class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """Copy of the notebook's transformer, re-declared in the Flask app.

    NOTE(review): redefining the class here is not sufficient — the
    pickle still looks it up as ``__main__.GensimWord2VecVectorizer``,
    which is not this module when the app runs under Flask.
    """

    def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        # Arguments are stored under identical attribute names, as
        # required by sklearn's get_params/set_params introspection.
        self.size = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.iter = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def fit(self, X, y=None):
        """Train Word2Vec on X (iterable of token lists); store as self.model_."""
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        """Return one mean word-vector per document in X."""
        X_embeddings = np.array([self._get_embedding(words) for words in X])
        return X_embeddings

    def _get_embedding(self, words):
        """Average embeddings of in-vocabulary words; zeros if none match.

        NOTE(review): ``self.model_.wv.vocab`` is the gensim < 4.0 API.
        """
        valid_words = [word for word in words if word in self.model_.wv.vocab]
        if valid_words:
            embedding = np.zeros((len(valid_words), self.size), dtype=np.float32)
            for idx, word in enumerate(valid_words):
                embedding[idx] = self.model_.wv[word]

            return np.mean(embedding, axis=0)
        else:
            return np.zeros(self.size)

# Word2Vec
# Still fails despite the class being redefined above: pickle resolves
# the stored path __main__.GensimWord2VecVectorizer, and under Flask
# __main__ is not this module.
model_EXT_WV_SV = joblib.load('utility/model/MachineLearning/SVM/model_Word2Vec_sv_cEXT.pkl')

GitHub代码:https://github.com/Juned-Ansari/test

Pickle 文件：https://github.com/Juned-Ansari/test/blob/main/model_Word2Vec_sv_cEXT.pkl

Flask 网络应用程序:https://github.com/Juned-Ansari/test/tree/main/WebApp

在您的 Flask Web 应用 python 文件中导入 GensimWord2VecVectorizer

来自 https://docs.python.org/3/library/pickle.html:

pickle can save and restore class instances transparently, however the class definition must be importable and live in the same module as when the object was stored.

The following types can be pickled:

  • ...
  • classes that are defined at the top level of a module
  • instances of such classes ...

考虑您的目录结构:

├── WebApp/
│  └── app.py
└── Untitled.ipynb

假设您在 WebApp/ 目录下执行 flask run，那么 app 就是一个顶层（top-level）模块。

首先,将 class GensimWord2VecVectorizer 移动到 WebApp/app.py 的顶层。

接下来,在您的 Jupyter Notebook 中,导入 GensimWord2VecVectorizer 并欺骗 pickle 认为它来自 top-level app 模块:

from WebApp.app import GensimWord2VecVectorizer
# Rewrite the class's recorded module so pickle serializes it as
# 'app.GensimWord2VecVectorizer' instead of 'WebApp.app....', matching
# how the Flask app will import it at load time.
GensimWord2VecVectorizer.__module__ = 'app'

import sys
# Alias 'app' in sys.modules so pickle can also resolve the spoofed
# module name in the current (notebook) process.
sys.modules['app'] = sys.modules['WebApp.app']

那么你应该可以 dump 和 load pickle 文件。

Google Colab

如果导入本地模块很麻烦,请改为:

# Record the class as belonging to a module named 'app'.
GensimWord2VecVectorizer.__module__ = 'app'

import sys
# type(sys) is the module type: fabricate an empty module object named
# 'app', register it in sys.modules, and attach the class to it so
# pickle can resolve app.GensimWord2VecVectorizer in this process.
app = sys.modules['app'] = type(sys)('app')
app.GensimWord2VecVectorizer = GensimWord2VecVectorizer

那么你应该可以 dump 和 load pickle 文件。