SimpleTransformers "max_seq_length" 参数导致 Kaggle 和 Google Colab 出现 CUDA 内存不足错误
SimpleTransformers "max_seq_length" argument results in CUDA out of memory error in Kaggle and Google Colab
While fine-tuning the CamemBERT-based sloBERTa Transformer model for a multiclass classification task with SimpleTransformers, I want to use the model argument "max_seq_length": 512, because previous work indicates it gives better results than 128, but including this argument triggers the error below. The error is identical in the Kaggle and Google Colab environments, and terminating the execution and rerunning does not help. It is triggered no matter how small the number of training epochs is, and the dataset contains only 600 instances (texts as strings, labels as integers). I have tried lowering max_seq_length to 509, 500 and 128, but the error persists.
The setup without this argument works fine and allows training for 90 epochs, so in general I have enough memory.
from simpletransformers.classification import ClassificationModel

# define hyperparameters
model_args = {
    "overwrite_output_dir": True,
    "num_train_epochs": 90,
    "labels_list": LABELS_NUM,
    "learning_rate": 1e-5,
    "train_batch_size": 32,
    "no_cache": True,
    "no_save": True,
    # "max_seq_length": 512,
    "save_steps": -1,
}

model = ClassificationModel(
    "camembert", "EMBEDDIA/sloberta",
    use_cuda=device,
    num_labels=NUM_LABELS,
    args=model_args,
)

model.train_model(train_df)
Here is the error:
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_34/2529369927.py in <module>
19 args = model_args)
20
---> 21 model.train_model(train_df)
/opt/conda/lib/python3.7/site-packages/simpletransformers/classification/classification_model.py in train_model(self, train_df, multi_label, output_dir, show_running_loss, args, eval_df, verbose, **kwargs)
610 eval_df=eval_df,
611 verbose=verbose,
--> 612 **kwargs,
613 )
614
/opt/conda/lib/python3.7/site-packages/simpletransformers/classification/classification_model.py in train(self, train_dataloader, output_dir, multi_label, show_running_loss, eval_df, test_df, verbose, **kwargs)
883 loss_fct=self.loss_fct,
884 num_labels=self.num_labels,
--> 885 args=self.args,
886 )
887 else:
/opt/conda/lib/python3.7/site-packages/simpletransformers/classification/classification_model.py in _calculate_loss(self, model, inputs, loss_fct, num_labels, args)
2256
2257 def _calculate_loss(self, model, inputs, loss_fct, num_labels, args):
-> 2258 outputs = model(**inputs)
2259 # model outputs are always tuple in pytorch-transformers (see doc)
2260 loss = outputs[0]
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/opt/conda/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
1210 output_attentions=output_attentions,
1211 output_hidden_states=output_hidden_states,
-> 1212 return_dict=return_dict,
1213 )
1214 sequence_output = outputs[0]
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/opt/conda/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
859 output_attentions=output_attentions,
860 output_hidden_states=output_hidden_states,
--> 861 return_dict=return_dict,
862 )
863 sequence_output = encoder_outputs[0]
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/opt/conda/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
531 encoder_attention_mask,
532 past_key_value,
--> 533 output_attentions,
534 )
535
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/opt/conda/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
415 head_mask,
416 output_attentions=output_attentions,
--> 417 past_key_value=self_attn_past_key_value,
418 )
419 attention_output = self_attention_outputs[0]
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/opt/conda/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
344 encoder_attention_mask,
345 past_key_value,
--> 346 output_attentions,
347 )
348 attention_output = self.output(self_outputs[0], hidden_states)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/opt/conda/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
273 attention_probs = attention_probs * head_mask
274
--> 275 context_layer = torch.matmul(attention_probs, value_layer)
276
277 context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 15.90 GiB total capacity; 15.04 GiB already allocated; 15.75 MiB free; 15.12 GiB reserved in total by PyTorch)
Other code, in case it helps; I have already tried everything related to PyTorch that I could find online. The full code is available at https://www.kaggle.com/tajakuz/0-sloberta-example-max-seq-length-error:
!conda install --yes pytorch>=1.6 cudatoolkit=11.0 -c pytorch
# install simpletransformers
!pip install -q transformers
!pip install --upgrade transformers
!pip install -q simpletransformers
# check installed version
!pip freeze | grep simpletransformers
!pip uninstall -q torch -y
!pip install -q torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
# pytorch libraries
import torch # the main pytorch library
import torch.nn as nn # the sub-library containing Softmax, Module and other useful functions
import torch.optim as optim # the sub-library containing the common optimizers (SGD, Adam, etc.)
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
#importing other necessary packages and ClassificationModel for bert
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
from scipy.special import softmax
Thank you very much for your help, it is greatly appreciated!
This happens because max_seq_length sets how long the input sequences fed to the model are. Longer sequences make every forward and backward pass allocate considerably more GPU memory (activations, and in particular the self-attention buffers, grow with sequence length), which can exceed the memory limits on these platforms.
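To get a feel for the scale, here is a rough back-of-envelope sketch, not a measurement. It assumes sloBERTa is a base-size (12-layer, 12-head) RoBERTa-style model and counts only the attention-probability tensors that feed the torch.matmul where your traceback fails; the remaining activations, gradients, and optimizer states come on top of this.

# Rough estimate of the attention-probability tensors alone, assuming a
# base-size model (12 layers, 12 attention heads) and fp32 activations.
# Real usage is much higher: other activations, gradients and Adam states
# also grow with sequence length.
batch_size = 32
num_layers = 12
num_heads = 12
bytes_per_float = 4

for seq_len in (128, 512):
    attn_bytes = (batch_size * num_layers * num_heads
                  * seq_len * seq_len * bytes_per_float)
    print(f"seq_len={seq_len}: ~{attn_bytes / 2**30:.2f} GiB for attention probabilities")

Under these assumptions, going from 128 to 512 tokens multiplies this one term by 16 (roughly 0.28 GiB versus 4.5 GiB), which is enough to tip over a 16 GiB GPU that is already close to full.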
Most of the time, the appropriate max_seq_length depends on the dataset, and setting it larger than needed simply wastes training time and memory. What you can do is find the maximum number of words (or, better, tokens) in any sample of your training set and use that as your max_seq_length, as in the sketch below.
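A minimal sketch of that check, assuming the text lives in a column named "text" of the train_df from the question (adjust the column name to your dataframe). It counts subword tokens with the sloBERTa tokenizer, which is what max_seq_length actually limits, rather than whitespace-separated words:

from transformers import AutoTokenizer

# Measure the longest training example in tokens and cap max_seq_length at
# that value (or at the model's 512-token limit). The "text" column name is
# an assumption about your dataframe.
tokenizer = AutoTokenizer.from_pretrained("EMBEDDIA/sloberta")

longest = max(len(tokenizer.encode(text)) for text in train_df["text"])
print("longest example in tokens:", longest)

model_args["max_seq_length"] = min(longest, 512)

Setting max_seq_length to the measured maximum (capped at the model's 512-token limit) keeps the memory cost no higher than your data actually requires.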