Multiclass sequence classification with fastai and huggingface
I'm hoping to use DistilBERT via fastai and huggingface for a multiclass sequence classification problem. I found a useful tutorial that gives a good example of how to do this for binary classification. The code is below:
# !pip install torch==1.9.0
# !pip install torchtext==0.10
# !pip install transformers==4.7
# !pip install fastai==2.4
from fastai.text.all import *
from sklearn.model_selection import train_test_split
import pandas as pd
import glob
from transformers import AutoTokenizer, AutoModelForSequenceClassification
hf_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
hf_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
"""
train_df and valid_df look like this:
label text
4240 5 whoa interesting.
13 7 you could you could we just
4639 4 you set the goal,
28 1 because ive already agreed to that
66 8 oh hey freshman thats you gona need
"""
print(list(train_df.label.value_counts().index))
"""
[4, 1, 5, 6, 7, 0, 2, 3, 8]
"""
class HF_Dataset(torch.utils.data.Dataset):
def __init__(self, df, hf_tokenizer):
self.df = df
self.hf_tokenizer = hf_tokenizer
self.label_map = {
0:0,
1:0,
2:0,
3:0,
4:1,
5:1,
6:1,
7:1,
8:1
}
def __len__(self):
return len(self.df)
def decode(self, token_ids):
        return ' '.join([self.hf_tokenizer.decode(x) for x in token_ids])
def decode_to_original(self, token_ids):
return self.hf_tokenizer.decode(token_ids.squeeze())
def __getitem__(self, index):
label, text = self.df.iloc[index]
label = self.label_map[label]
label = torch.tensor(label)
tokenizer_output = self.hf_tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
tokenizer_output['input_ids'].squeeze_()
tokenizer_output['attention_mask'].squeeze_()
return tokenizer_output, label
train_dataset = HF_Dataset(train_df, hf_tokenizer)
valid_dataset = HF_Dataset(valid_df, hf_tokenizer)
train_dl = DataLoader(train_dataset, bs=16, shuffle=True)
valid_dl = DataLoader(valid_dataset, bs=16)
dls = DataLoaders(train_dl, valid_dl)
# hf_model(**batched_data)  # not runnable as-is: batched_data is not defined in this snippet
class HF_Model(nn.Module):
def __init__(self, hf_model):
super().__init__()
self.hf_model = hf_model
def forward(self, tokenizer_outputs):
model_output = self.hf_model(**tokenizer_outputs)
return model_output.logits
model = HF_Model(hf_model)
# Manually popping the model onto the gpu since the data is in a dictionary format
# (doesn't automatically place model + data on gpu otherwise)
learn = Learner(dls, model, loss_func=nn.CrossEntropyLoss(), metrics=[accuracy])
learn.fit_one_cycle(3, 1e-4)
This works fine. However, to make it work I mapped my multiclass labels down to 2 labels. I actually have 9 classes. I tried adjusting the label mapping scheme in the HF_Dataset() class to match my actual labels, like so:
class HF_Dataset(torch.utils.data.Dataset):
def __init__(self, df, hf_tokenizer):
self.df = df
self.hf_tokenizer = hf_tokenizer
self.label_map = {
0:0,
1:1,
2:2,
3:3,
4:4,
5:5,
6:6,
7:7,
8:8
}
def __len__(self):
return len(self.df)
def decode(self, token_ids):
        return ' '.join([self.hf_tokenizer.decode(x) for x in token_ids])
def decode_to_original(self, token_ids):
return self.hf_tokenizer.decode(token_ids.squeeze())
def __getitem__(self, index):
label, text = self.df.iloc[index]
label = self.label_map[label]
label = torch.tensor(label)
tokenizer_output = self.hf_tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
tokenizer_output['input_ids'].squeeze_()
tokenizer_output['attention_mask'].squeeze_()
return tokenizer_output, label
Every line works until learn.fit_one_cycle. Here is the full stack trace from that line:
0.00% [0/3 00:00<00:00]
epoch train_loss valid_loss accuracy time
0.00% [0/519 00:00<00:00]
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-21-0ec2ff9e12e1> in <module>
----> 1 learn.fit_one_cycle(3, 1e-4)
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
111 scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
112 'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113 self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
114
115 # Cell
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
219 self.opt.set_hypers(lr=self.lr if lr is None else lr)
220 self.n_epoch = n_epoch
--> 221 self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
222
223 def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
161
162 def _with_events(self, f, event_type, ex, final=noop):
--> 163 try: self(f'before_{event_type}'); f()
164 except ex: self(f'after_cancel_{event_type}')
165 self(f'after_{event_type}'); final()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_fit(self)
210 for epoch in range(self.n_epoch):
211 self.epoch=epoch
--> 212 self._with_events(self._do_epoch, 'epoch', CancelEpochException)
213
214 def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
161
162 def _with_events(self, f, event_type, ex, final=noop):
--> 163 try: self(f'before_{event_type}'); f()
164 except ex: self(f'after_cancel_{event_type}')
165 self(f'after_{event_type}'); final()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_epoch(self)
204
205 def _do_epoch(self):
--> 206 self._do_epoch_train()
207 self._do_epoch_validate()
208
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_epoch_train(self)
196 def _do_epoch_train(self):
197 self.dl = self.dls.train
--> 198 self._with_events(self.all_batches, 'train', CancelTrainException)
199
200 def _do_epoch_validate(self, ds_idx=1, dl=None):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
161
162 def _with_events(self, f, event_type, ex, final=noop):
--> 163 try: self(f'before_{event_type}'); f()
164 except ex: self(f'after_cancel_{event_type}')
165 self(f'after_{event_type}'); final()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in all_batches(self)
167 def all_batches(self):
168 self.n_iter = len(self.dl)
--> 169 for o in enumerate(self.dl): self.one_batch(*o)
170
171 def _do_one_batch(self):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in one_batch(self, i, b)
192 b = self._set_device(b)
193 self._split(b)
--> 194 self._with_events(self._do_one_batch, 'batch', CancelBatchException)
195
196 def _do_epoch_train(self):
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
161
162 def _with_events(self, f, event_type, ex, final=noop):
--> 163 try: self(f'before_{event_type}'); f()
164 except ex: self(f'after_cancel_{event_type}')
165 self(f'after_{event_type}'); final()
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_one_batch(self)
173 self('after_pred')
174 if len(self.yb):
--> 175 self.loss_grad = self.loss_func(self.pred, *self.yb)
176 self.loss = self.loss_grad.clone()
177 self('after_loss')
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/loss.py in forward(self, input, target)
1119 def forward(self, input: Tensor, target: Tensor) -> Tensor:
1120 return F.cross_entropy(input, target, weight=self.weight,
-> 1121 ignore_index=self.ignore_index, reduction=self.reduction)
1122
1123
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
2822 if size_average is not None or reduce is not None:
2823 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2824 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2825
2826
IndexError: Target 6 is out of bounds.
This seems like it should be a simple fix. Do I need to adjust something in the model architecture so that it accepts 9 labels? Or do I need to one-hot encode my labels? If so, is there a pre-built solution for doing that in the pipeline?
You need to define num_labels=9 when loading the model:
hf_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=9)
The default is 2, which works for the first (binary) use case but breaks as soon as a target index goes beyond 1, hence the IndexError: Target 6 is out of bounds.
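No one-hot encoding is needed: nn.CrossEntropyLoss takes raw logits plus integer class indices (0 to 8 here). As a quick sanity check, here is a minimal sketch (the sample sentence is just a placeholder) showing that the reconfigured head now returns 9 logits per example:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

hf_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
hf_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=9)

# Tokenize one placeholder sentence and run a forward pass without tracking gradients
inputs = hf_tokenizer("whoa interesting.", return_tensors="pt")
with torch.no_grad():
    logits = hf_model(**inputs).logits

print(logits.shape)  # torch.Size([1, 9]): one score per class, so targets 0-8 are now in bounds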
Note that the library explicitly tells you that the classifier head (which produces the .logits you are interested in) is randomly initialized:
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
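That warning is expected here: the freshly initialized head is exactly what learn.fit_one_cycle(3, 1e-4) goes on to train. Optionally, you can also pass id2label/label2id when loading so that predictions map back to readable class names; the names below are placeholders for whatever your 9 classes actually mean:
from transformers import AutoModelForSequenceClassification

# Placeholder class names; substitute the real meaning of your 9 labels
id2label = {i: f"class_{i}" for i in range(9)}
label2id = {name: i for i, name in id2label.items()}

hf_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=9,
    id2label=id2label,
    label2id=label2id,
)

print(hf_model.config.num_labels)  # 9
print(hf_model.config.id2label)    # {0: 'class_0', 1: 'class_1', ...}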