如何在 BERT 模型上应用剪枝?
How can I apply pruning on a BERT model?
我已经使用 ktrain(TensorFlow 包装器)训练了一个 BERT 模型来识别文本上的情绪。它有效,但它的推理速度非常慢。这使得我的模型不适合生产环境。我做了一些研究,似乎修剪会有所帮助。
TensorFlow 提供了一些修剪选项,例如 tf.contrib.model_pruning。问题是它不是一种未广泛使用的技术。有什么足够简单的示例可以帮助我理解如何使用它?
我在下面提供我的工作代码以供参考。
import pandas as pd
import numpy as np
import preprocessor as p
import emoji
import re
import ktrain
from ktrain import text
from unidecode import unidecode
import nltk
# Text preprocessing class
class TextPreprocessing:
def __init__(self):
p.set_options(p.OPT.MENTION, p.OPT.URL)
def _punctuation(self, val):
val = re.sub(r'[^\w\s]', ' ', val)
val = re.sub('_', ' ', val)
return val
def _whitespace(self, val):
return " ".join(val.split())
def _removenumbers(self, val):
val = re.sub('[0-9] + ', '', val)
return val
def _remove_unicode(self, text):
text = unidecode(text).encode("ascii")
text = str(text, "ascii")
return text
def _split_to_sentences(self, body_text):
sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", body_text)
return sentences
def _clean_text(self, val):
val = val.lower()
val = self._removenumbers(val)
val = p.clean(val)
val = ' '.join(self._punctuation(emoji.demojize(val)).split())
val = self._remove_unicode(val)
val = self._whitespace(val)
return val
def text_preprocessor(self, body_text):
body_text_df = pd.DataFrame({"body_text": body_text}, index=[1])
sentence_split_df = body_text_df.copy()
sentence_split_df["body_text"] = sentence_split_df["body_text"].apply(
self._split_to_sentences)
lst_col = "body_text"
sentence_split_df = pd.DataFrame(
{
col: np.repeat(
sentence_split_df[col].values, sentence_split_df[lst_col].str.len(
)
)
for col in sentence_split_df.columns.drop(lst_col)
}
).assign(**{lst_col: np.concatenate(sentence_split_df[lst_col].values)})[
sentence_split_df.columns
]
body_text_df["body_text"] = body_text_df["body_text"].apply(self._clean_text)
final_df = (
pd.concat([sentence_split_df, body_text_df])
.reset_index()
.drop(columns=["index"])
)
return final_df["body_text"]
# Instantiate data preprocessing object
text1 = TextPreprocessing()
# Import data
data_train = pd.read_csv('data_train_v5.csv', encoding='utf8', engine='python')
data_test = pd.read_csv('data_test_v5.csv', encoding='utf8', engine='python')
# Clean the data
data_train['Text'] = data_train['Text'].apply(text1._clean_text)
data_test['Text'] = data_test['Text'].apply(text1._clean_text)
X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()
y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()
data = data_train.append(data_test, ignore_index=True)
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']
encoding = {
'joy': 0,
'sadness': 1,
'fear': 2,
'anger': 3,
'neutral': 4
}
# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]
trn, val, preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
x_test=X_test, y_test=y_test,
class_names=class_names,
preprocess_mode='distilbert',
maxlen=350)
model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
predictor = ktrain.get_predictor(learner.model, preproc)
# Save the model on a file for later use
predictor.save("models/bert_model")
message = "This is a happy message"
# Cleaning - takes 5 ms to run
clean = text1._clean_text(message)
# Prediction - takes 325 ms to run
predictor.predict_proba(clean)
ktrain 中的 distilbert
模型是使用 Hugging Face transformers 创建的,这意味着您可以使用该库来修剪该模型。参见 this link for more information and the example script. You may need to convert the model to PyTorch before using the script (in addition to making some modifications to the script itself). The approach is based on the paper Are Sixteen Heads Really Better Than One?。
我已经使用 ktrain(TensorFlow 包装器)训练了一个 BERT 模型来识别文本上的情绪。它有效,但它的推理速度非常慢。这使得我的模型不适合生产环境。我做了一些研究,似乎修剪会有所帮助。
TensorFlow 提供了一些修剪选项,例如 tf.contrib.model_pruning。问题是它不是一种未广泛使用的技术。有什么足够简单的示例可以帮助我理解如何使用它?
我在下面提供我的工作代码以供参考。
import pandas as pd
import numpy as np
import preprocessor as p
import emoji
import re
import ktrain
from ktrain import text
from unidecode import unidecode
import nltk
# Text preprocessing class
class TextPreprocessing:
def __init__(self):
p.set_options(p.OPT.MENTION, p.OPT.URL)
def _punctuation(self, val):
val = re.sub(r'[^\w\s]', ' ', val)
val = re.sub('_', ' ', val)
return val
def _whitespace(self, val):
return " ".join(val.split())
def _removenumbers(self, val):
val = re.sub('[0-9] + ', '', val)
return val
def _remove_unicode(self, text):
text = unidecode(text).encode("ascii")
text = str(text, "ascii")
return text
def _split_to_sentences(self, body_text):
sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", body_text)
return sentences
def _clean_text(self, val):
val = val.lower()
val = self._removenumbers(val)
val = p.clean(val)
val = ' '.join(self._punctuation(emoji.demojize(val)).split())
val = self._remove_unicode(val)
val = self._whitespace(val)
return val
def text_preprocessor(self, body_text):
body_text_df = pd.DataFrame({"body_text": body_text}, index=[1])
sentence_split_df = body_text_df.copy()
sentence_split_df["body_text"] = sentence_split_df["body_text"].apply(
self._split_to_sentences)
lst_col = "body_text"
sentence_split_df = pd.DataFrame(
{
col: np.repeat(
sentence_split_df[col].values, sentence_split_df[lst_col].str.len(
)
)
for col in sentence_split_df.columns.drop(lst_col)
}
).assign(**{lst_col: np.concatenate(sentence_split_df[lst_col].values)})[
sentence_split_df.columns
]
body_text_df["body_text"] = body_text_df["body_text"].apply(self._clean_text)
final_df = (
pd.concat([sentence_split_df, body_text_df])
.reset_index()
.drop(columns=["index"])
)
return final_df["body_text"]
# Instantiate data preprocessing object
text1 = TextPreprocessing()
# Import data
data_train = pd.read_csv('data_train_v5.csv', encoding='utf8', engine='python')
data_test = pd.read_csv('data_test_v5.csv', encoding='utf8', engine='python')
# Clean the data
data_train['Text'] = data_train['Text'].apply(text1._clean_text)
data_test['Text'] = data_test['Text'].apply(text1._clean_text)
X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()
y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()
data = data_train.append(data_test, ignore_index=True)
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']
encoding = {
'joy': 0,
'sadness': 1,
'fear': 2,
'anger': 3,
'neutral': 4
}
# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]
trn, val, preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
x_test=X_test, y_test=y_test,
class_names=class_names,
preprocess_mode='distilbert',
maxlen=350)
model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
predictor = ktrain.get_predictor(learner.model, preproc)
# Save the model on a file for later use
predictor.save("models/bert_model")
message = "This is a happy message"
# Cleaning - takes 5 ms to run
clean = text1._clean_text(message)
# Prediction - takes 325 ms to run
predictor.predict_proba(clean)
ktrain 中的 distilbert
模型是使用 Hugging Face transformers 创建的,这意味着您可以使用该库来修剪该模型。参见 this link for more information and the example script. You may need to convert the model to PyTorch before using the script (in addition to making some modifications to the script itself). The approach is based on the paper Are Sixteen Heads Really Better Than One?。