What is tokenizer.max_len doing in this class definition?
I am following Rostylav's tutorial, found here, but ran into an error I don't quite understand:
AttributeError                            Traceback (most recent call last)
<ipython-input-22-523c0d2a27d3> in <module>()
----> 1 main(trn_df, val_df)
<ipython-input-20-1f17c050b9e5> in main(df_trn, df_val)
59 # Training
60 if args.do_train:
---> 61 train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
62
63 global_step, tr_loss = train(args, train_dataset, model, tokenizer)
<ipython-input-18-3c4f1599e14e> in load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate)
40
41 def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
---> 42 return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)
43
44 def set_seed(args):
<ipython-input-18-3c4f1599e14e> in __init__(self, tokenizer, args, df, block_size)
8 def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
9
---> 10 block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
11
12 directory = args.cache_dir
AttributeError: 'GPT2TokenizerFast' object has no attribute 'max_len'
I think this is the class that causes the error, but I can't work out what tokenizer.max_len is supposed to do, so I can't try to fix it myself:
class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)

        directory = args.cache_dir

        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

# Cacheing and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)
Thanks for reading!
The attribute max_len was migrated to model_max_length. It represents the maximum number of tokens the model can handle, i.e. including special tokens (documentation).
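You can see the rename on the tokenizer itself (a minimal sketch; the plain "gpt2" checkpoint is my assumption here, the tutorial may load a different checkpoint):

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Old attribute, removed in recent transformers releases:
# tokenizer.max_len  -> AttributeError: 'GPT2TokenizerFast' object has no attribute 'max_len'

# Current attribute with the same meaning (1024 for GPT-2):
print(tokenizer.model_max_length)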
max_len_single_sentence, on the other hand, represents the maximum number of tokens a single sentence can have, i.e. without special tokens (documentation).
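So the failing line in ConversationDataset.__init__ only needs the new attribute name. A minimal sketch of the change (the getattr fallback for older transformers versions is my addition, not part of the tutorial):

# Works on current transformers versions:
block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

# If the code also has to run on older versions that still expose max_len:
max_len = getattr(tokenizer, "model_max_length", None) or tokenizer.max_len
block_size = block_size - (max_len - tokenizer.max_len_single_sentence)

For the stock GPT-2 tokenizer the two attributes are usually equal (no special tokens are added by default), so the subtraction leaves block_size at its original 512.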