Torchtext 0.7 shows Field is being deprecated. What is the alternative?
It looks like the previous paradigm of declaring Fields, Examples and using BucketIterator is deprecated and will be moved to legacy in 0.8. However, I can't seem to find an example of the new paradigm for custom datasets (as in, not the ones included in torch.datasets) that doesn't use Field. Can anyone point me at an up-to-date example?
Reference for the deprecation:
It took me a while to find the solution myself. For the pre-built datasets, the new paradigm is like so:
from torchtext.experimental.datasets import AG_NEWS
train, test = AG_NEWS(ngrams=3)
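As far as I can tell, the datasets built this way are already tokenized and numericalized: each item is a (label, token-id tensor) pair, and the vocabulary can be pulled from the dataset itself. A quick sketch of that (the get_vocab() call is what I found on the experimental TextClassificationDataset; double-check it against your torchtext version):
# inspect the pre-built experimental dataset (sketch; verify against your torchtext version)
vocab = train.get_vocab()        # vocabulary built from the training split
label, token_ids = train[0]      # each item is (label, LongTensor of token ids)
print(len(vocab), label, token_ids[:10])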
Or for a custom-built dataset:
from torch.utils.data import DataLoader
def collate_fn(batch):
    texts, labels = [], []
    for label, txt in batch:
        texts.append(txt)
        labels.append(label)
    return texts, labels
dataloader = DataLoader(train, batch_size=8, collate_fn=collate_fn)
for idx, (texts, labels) in enumerate(dataloader):
    print(idx, texts, labels)
I copied the examples from the Source.
Browsing through torchtext's GitHub repo, I stumbled over the README in the legacy directory, which is not documented in the official docs. The README links a GitHub issue that explains the rationale behind the change, as well as a migration guide.
If you just want to keep your existing code running with torchtext 0.9.0, where the deprecated classes have been moved to the legacy module, you have to adjust your imports:
# from torchtext.data import Field, TabularDataset
from torchtext.legacy.data import Field, TabularDataset
Alternatively, as the README suggests, you can import the whole torchtext.legacy module as torchtext:
import torchtext.legacy as torchtext
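With that alias in place, pre-0.9 code should keep working as-is. For example, a small sketch of the old Field/TabularDataset workflow (the file name train.tsv and its two columns are made up for illustration):
import torchtext.legacy as torchtext

TEXT = torchtext.data.Field(tokenize='basic_english', lower=True)
LABEL = torchtext.data.LabelField()

# assumes a tab-separated file with a text column and a label column
dataset = torchtext.data.TabularDataset(
    path='train.tsv', format='tsv',
    fields=[('text', TEXT), ('label', LABEL)])

TEXT.build_vocab(dataset)
LABEL.build_vocab(dataset)
train_iter = torchtext.data.BucketIterator(dataset, batch_size=8)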
There is a post about this. Instead of the deprecated Field and BucketIterator classes, it uses TextClassificationDataset along with a collator and other preprocessing. It reads a txt file, builds a dataset, and then a model. Inside the post there is a link to a complete working notebook. The post is at: https://mmg10.github.io/pytorch/2021/02/16/text_torch.html. However, you need the 'dev' (or nightly) build of PyTorch for it to work.
From the link above:
After tokenizing and building the vocabulary, you can build the dataset as follows:
def data_to_dataset(data, tokenizer, vocab):
    data = [(text, label) for (text, label) in data]
    text_transform = sequential_transforms(tokenizer.tokenize,
                                           vocab_func(vocab),
                                           totensor(dtype=torch.long))
    label_transform = sequential_transforms(lambda x: 1 if x == '1' else (0 if x == '0' else x),
                                            totensor(dtype=torch.long))
    transforms = (text_transform, label_transform)
    dataset = TextClassificationDataset(data, vocab, transforms)
    return dataset
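For context, here is a rough sketch of how the pieces data_to_dataset expects could be wired up: the helpers sequential_transforms, vocab_func and totensor live (as far as I can tell) in torchtext.experimental.functional, TextClassificationDataset in torchtext.experimental.datasets.text_classification, and the tokenizer just needs a .tokenize method. The toy data and the SimpleTokenizer wrapper below are my own illustration, not from the post:
import torch
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor
from torchtext.experimental.datasets.text_classification import TextClassificationDataset

class SimpleTokenizer:
    # data_to_dataset calls tokenizer.tokenize, so wrap torchtext's basic tokenizer
    def __init__(self):
        self._tok = get_tokenizer('basic_english')
    def tokenize(self, text):
        return self._tok(text)

# toy (text, label) pairs with labels given as '0'/'1' strings
data = [('this movie was great', '1'), ('terrible plot', '0')]

tokenizer = SimpleTokenizer()
counter = Counter()
for text, _ in data:
    counter.update(tokenizer.tokenize(text))
vocab = Vocab(counter, min_freq=1)

dataset = data_to_dataset(data, tokenizer, vocab)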
The collator (these are the methods of a collator class) is as follows:
def __init__(self, pad_idx):
    self.pad_idx = pad_idx

def collate(self, batch):
    text, labels = zip(*batch)
    labels = torch.LongTensor(labels)
    text = nn.utils.rnn.pad_sequence(text, padding_value=self.pad_idx, batch_first=True)
    return text, labels
Then, you can build a dataloader with the usual torch.utils.data.DataLoader, passing the collator via the collate_fn argument.
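Putting it together, a minimal sketch (the Collator class name and the '<pad>' lookup are my assumptions; the dataset variable is the one built above):
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class Collator:
    # hypothetical wrapper class around the two methods shown above
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def collate(self, batch):
        text, labels = zip(*batch)
        labels = torch.LongTensor(labels)
        text = nn.utils.rnn.pad_sequence(text, padding_value=self.pad_idx, batch_first=True)
        return text, labels

collator = Collator(pad_idx=vocab['<pad>'])   # assumes '<pad>' is in the vocab
loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collator.collate)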
Well, it looks like the pipeline could be like this:
import torchtext as TT
import torch
from collections import Counter
from torch.utils.data import DataLoader
from torchtext.vocab import Vocab
# read the data
with open('text_data.txt', 'r') as f:
    data = f.readlines()
with open('labels.txt', 'r') as f:
    labels = f.readlines()
tokenizer = TT.data.utils.get_tokenizer('spacy', 'en') # can remove 'spacy' and use a simple built-in tokenizer
train_iter = zip(labels, data)
counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line))
vocab = TT.vocab.Vocab(counter, min_freq=1)
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
# this is data-specific - adapt for your data
label_pipeline = lambda x: 1 if x == 'positive\n' else 0
class TextData(torch.utils.data.Dataset):
    '''
    very basic dataset for processing text data
    '''
    def __init__(self, labels, text):
        super(TextData, self).__init__()
        self.labels = labels
        self.text = text

    def __getitem__(self, index):
        return self.labels[index], self.text[index]

    def __len__(self):
        return len(self.labels)
def tokenize_batch(batch, max_len=200):
    '''
    collate function to use in the DataLoader
    takes a batch from the text dataset and produces a tensor batch, converting text and labels through the tokenizer and labeler
    the tokenizer is the global function text_pipeline
    the labeler is the global function label_pipeline
    max_len is a fixed length: if the text is shorter than max_len it is padded with ones (the pad index);
    if the text is longer than max_len it is truncated, keeping only the end of the string
    '''
    labels_list, text_list = [], []
    for _label, _text in batch:
        labels_list.append(label_pipeline(_label))
        text_holder = torch.ones(max_len, dtype=torch.int32)  # fixed-size tensor of length max_len
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int32)
        pos = min(max_len, len(processed_text))  # was hardcoded to 200; use max_len
        text_holder[-pos:] = processed_text[-pos:]
        text_list.append(text_holder.unsqueeze(dim=0))
    return torch.FloatTensor(labels_list), torch.cat(text_list, dim=0)
train_dataset = TextData(labels, data)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=False, collate_fn=tokenize_batch)
lbl, txt = next(iter(train_loader))  # iter(...).next() is Python 2 syntax