"for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader: NameError: name 'data_loader' is not defined"
"for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader: NameError: name 'data_loader' is not defined"
I am trying to implement a question-answering model using the BERT transformer implemented by jugapuff.
Link to the code: https://github.com/jugapuff/BERT-for-bAbi-task
After executing the main.py file written below, I get this error: "for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader: NameError: name 'data_loader' is not defined"
from dataloader import bAbi_Dataset
import torch
import torch.nn as nn
from model import model
from pytorch_transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("GPU: " + str(torch.cuda.get_device_name(0)))

my_model = model()
my_model.to(device)

optimizer = AdamW(my_model.parameters())
criterion = nn.NLLLoss()

EPOCHS = 10

for epoch in range(1, EPOCHS + 1):
    my_model.train()
    train_loss = 0
    length = 0
    for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader:
        output = my_model(tokens_tensor.to(device), segments_tensors.to(device), att_mask.to(device), pos_id.to(device))
        loss = criterion(output, trg.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        length += 1
        train_loss += loss.item()
        if length % 10 == 0:
            print("\t\t{:3}/25000 : {}".format(length, train_loss / length))
    epoch_loss = train_loss / length
    print("##################")
    print("{} epoch Loss : {:.4f}".format(epoch, epoch_loss))
and dataloader.py is
import os
import torch
import torch.utils.data as data
from pytorch_transformers import BertTokenizer

def _parse(file, only_supporting=False):
    data, story = [], []
    for line in file:
        tid, text = line.rstrip('\n').split(' ', 1)
        if tid == '1':
            story = []
        if text.endswith('.'):
            story.append(text[:])
        else:
            query, answer, supporting = (x.strip() for x in text.split('\t'))
            if only_supporting:
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                substory = [x for x in story if x]
            data.append((substory, query[:-1], answer))
            story.append("")
    return data

def build_trg_dics(tenK=True, path="tasks_1-20_v1-2", train=True):
    if tenK:
        dirname = os.path.join(path, 'en-10k')
    else:
        dirname = os.path.join(path, 'en')
    for (dirpath, dirnames, filenames) in os.walk(dirname):
        filenames = filenames
    if train:
        filenames = [filename for filename in filenames if "train.txt" in filename]
    else:
        filenames = [filename for filename in filenames if "test.txt" in filename]
    temp = []
    for filename in filenames:
        f = open(os.path.join(dirname, filename), 'r')
        parsed = _parse(f)
        temp.extend([d[2] for d in parsed])
    temp = set(temp)
    trg_word2id = {word: i for i, word in enumerate(temp)}
    trg_id2word = {i: word for i, word in enumerate(temp)}
    return trg_word2id, trg_id2word

class bAbi_Dataset(data.Dataset):
    def __init__(self, trg_word2id, tenK=True, path="tasks_1-20_v1-2", train=True):
        # joint is Default
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        if tenK:
            dirname = os.path.join(path, 'en-10k')
        else:
            dirname = os.path.join(path, 'en')
        for (dirpath, dirnames, filenames) in os.walk(dirname):
            filenames = filenames
        if train:
            filenames = [filename for filename in filenames if "train.txt" in filename]
        else:
            filenames = [filename for filename in filenames if "test.txt" in filename]
        self.src = []
        self.trg = []
        for filename in filenames:
            f = open(os.path.join(dirname, filename), 'r')
            parsed = _parse(f)
            self.src.extend([d[:2] for d in parsed])
            self.trg.extend([trg_word2id[d[2]] for d in parsed])
        self.trg = torch.tensor(self.trg)

    def __getitem__(self, index):
        src_seq = self.src[index]
        trg = self.trg[index]
        src_seq, seg_seq, att_mask, pos_id = self.preprocess_sequence(src_seq)
        return src_seq, seg_seq, att_mask, pos_id, trg

    def __len__(self):
        return len(self.trg)

    def preprocess_sequence(self, seq):
        text = ["[CLS]"] + list(seq[0]) + ["[SEP]"] + [seq[1]] + ["[SEP]"]
        tokenized_text = self.tokenizer.tokenize(" ".join(text))
        indexed_text = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        where_is_sep = indexed_text.index(102) + 1
        segment_ids = [0] * (where_is_sep) + [1] * (len(indexed_text) - where_is_sep)
        attention_mask = [1] * len(indexed_text)
        pos_id = [i for i in range(len(indexed_text))]
        return torch.tensor(indexed_text), torch.tensor(segment_ids), torch.tensor(attention_mask), torch.tensor(pos_id)

def collate_fn(data):
    def merge(sequences):
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), 512).long()
        for i, seq in enumerate(sequences):
            end = lengths[i]
            if end <= 512:
                padded_seqs[i, :end] = seq[:end]
            else:
                padded_seqs[i] = seq[-512:]
        return padded_seqs

    def pos_merge(sequences):
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), 512).long()
        for i, seq in enumerate(sequences):
            padded_seqs[i] = torch.tensor([i for i in range(512)])
        return padded_seqs

    src_seqs, seg_seqs, att_mask, pos_id, trgs = zip(*data)
    src_seqs = merge(src_seqs)
    seg_seqs = merge(seg_seqs)
    att_mask = merge(att_mask)
    pos_id = pos_merge(pos_id)
    trgs = torch.tensor(trgs)
    return src_seqs, seg_seqs, att_mask, pos_id, trgs
The variable declaration for data_loader is missing in main.py, so I tried to build data_loader as
for tokens_tensor, segments_tensors, att_mask, pos_id, trg in dataloader.collate_fn(bAbi_Dataset):
using the collate_fn() function from dataloader.py, but it does not work. When I change it as above, it gives the following error:
Traceback (most recent call last):
  File "main.py", line 27, in <module>
  File "/content/BERT-for-bAbi-task/dataloader.py", line 133, in collate_fn
    src_seqs, seg_seqs, att_mask, pos_id, trgs = zip(*data)
  File "/usr/lib/python3.6/typing.py", line 682, in inner
    return func(*args, **kwds)
  File "/usr/lib/python3.6/typing.py", line 1107, in __getitem__
    params = tuple(_type_check(p, msg) for p in params)
  File "/usr/lib/python3.6/typing.py", line 1107, in <genexpr>
    params = tuple(_type_check(p, msg) for p in params)
  File "/usr/lib/python3.6/typing.py", line 374, in _type_check
    raise TypeError(msg + " Got %.100r." % (arg,))
TypeError: Parameters to generic types must be types. Got 0.
Can anyone help me fix this error?
Just a few pieces of advice:

collate_fn is not meant to be called with the dataset as its argument. It is a special callback function given to the data loader, used to collate batch elements into a batch.

Since bAbi_Dataset in /dataloader.py is defined as a torch.utils.data.Dataset, I would guess you are supposed to initialize it. It is defined here as:

def __init__(self, trg_word2id, tenK=True, path = "tasks_1-20_v1-2", train=True)

There is another function in /dataloader.py, build_trg_dics, which is used to create the parsed content from the files. You should take a look at both before setting the correct arguments for bAbi_Dataset, for example as sketched below.
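A minimal sketch of the initialization, assuming the bAbI data sits at the default path "tasks_1-20_v1-2" (adjust the arguments to your setup):

from dataloader import build_trg_dics, bAbi_Dataset

# Build the answer vocabulary from the training files first;
# bAbi_Dataset needs trg_word2id to map answer words to class ids.
trg_word2id, trg_id2word = build_trg_dics(tenK=True, path="tasks_1-20_v1-2", train=True)

# Initialize the dataset with that vocabulary.
dataset = bAbi_Dataset(trg_word2id, tenK=True, path="tasks_1-20_v1-2", train=True)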
Finally, once your dataset is initialized, you can attach a data loader to it using torch.utils.data.DataLoader. That would look something like:

data_loader = DataLoader(dataset, batch_size=16)

At this point, you may even want to plug in the collate function provided in /dataloader.py, as shown below.
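Continuing the sketch above (batch_size=16 and shuffle=True are assumptions, not values taken from the repository), this gives main.py the data_loader variable its training loop expects:

from torch.utils.data import DataLoader
from dataloader import collate_fn

# collate_fn pads (or truncates) every sequence in a batch to 512 tokens,
# so the model always receives fixed-size tensors.
data_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# The training loop in main.py can then iterate over it as written:
# for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader:
#     ...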
If you really don't know what you are doing, I suggest you start from a working repository and work your way up from there. Good luck!