ReversibleField 在使用 spacy 自定义分词器时失败
ReversibleField fails when using spacy custom tokenizer
ReversibleField 在没有 spacy 的情况下运行良好
在 ReversibleField 中使用 tokenize=None 时,一切正常:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, ReversibleField
import spacy
# Both fields share one configuration; no custom tokenizer is passed.
field_options = {
    'tokenize': None,
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'lower': True,
    'batch_first': True,
}
SRC = ReversibleField(**field_options)
TRG = ReversibleField(**field_options)

# Load the Multi30k splits; '.de' is paired with SRC and '.en' with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

# Build each vocabulary from the training split only.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 3
device = 'cuda:2'

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# Take the first training batch and convert its `trg` entry back to text.
batch = next(iter(train_iterator))
TRG.reverse(batch.trg)
output>>>
['a group of kids playing with tires.',
'seven construction workers working on a building.',
'a man is performing with fire sticks before a crowd outside.']
使用 spacy 时 ReversibleField 失败
但是,当我改用 spacy 作为分词器时,reverse 返回的是一长串连在一起、没有空格的字符串,完全无法阅读。
# spaCy pipelines used by tokenize_de / tokenize_en, loaded via the 'de' / 'en' names.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')
def tokenize_de(text):
    """Split German ``text`` into token strings, returned in reverse order.

    The string is run through ``spacy_de.tokenizer`` and the ``text`` of
    every resulting token is collected into a list, last token first.
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    tokens.reverse()
    return tokens
def tokenize_en(text):
    """Split English ``text`` into a list of token strings.

    The string is run through ``spacy_en.tokenizer`` and the ``text`` of
    every resulting token is returned in order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]
# Options shared by both fields; only the tokenizer differs.
shared_options = {
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'lower': True,
    'batch_first': True,
}
SRC = ReversibleField(tokenize=tokenize_de, **shared_options)
TRG = ReversibleField(tokenize=tokenize_en, **shared_options)

# Load the Multi30k splits; '.de' is paired with SRC and '.en' with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

# Build each vocabulary from the training split only.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq=2)

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# Take the first training batch and convert its `trg` entry back to text.
batch = next(iter(train_iterator))
TRG.reverse(batch.trg)
output >>>
['agroupofkidsplayingwithtires.',
'sevenconstructionworkersworkingonabuilding.',
'amanisperformingwithfiresticksbeforeacrowdoutside.']
这里有什么问题?如何在使用 spacy 时将标记正确转换回字符串?
ReversibleField 的定义中存在一个明显的 bug:
# Excerpt of the ReversibleField class from torchtext/data/field.py.
# Indentation is not preserved here and `...` marks code left out of the excerpt.
class ReversibleField(Field):
def __init__(self, **kwargs):
warnings.warn('{} class will be retired in the 0.8.0 release and moved to torchtext.legacy. Please see 0.7.0 release notes for further information.'.format(self.__class__.__name__), UserWarning)
# NOTE: `is list` is an identity check against the builtin `list` type, so
# this branch is taken only when the caller passes tokenize=list itself.
if kwargs.get('tokenize') is list:
self.use_revtok = False
else:
# Every other tokenize value (None, a custom callable, ...) lands here.
self.use_revtok = True
...
def reverse(self, batch):
# revtok is imported lazily, and only when use_revtok is set.
if self.use_revtok:
try:
import revtok
except ImportError:
print("Please install revtok.")
raise
...
# With use_revtok set, every example is passed through revtok.detokenize
# and that result is returned.
if self.use_revtok:
return [revtok.detokenize(ex) for ex in batch]
你看,除非你把 list 作为 tokenize 关键字参数(kwarg)传入,否则 use_revtok 始终为 True,reverse 总是返回 revtok.detokenize 的结果,即使分词并不是由 revtok 完成的。
解决方法:
- 注释掉上面代码块中的最后两行(该类定义位于
/home/USER/anaconda3/envs/ENV_NAME/lib/python3.7/site-packages/torchtext-0.8.0a0+db31b5d-py3.7-linux-x86_64.egg/torchtext/data/field.py
,第 408-409 行)
- 修改你的分词器,使其输出中包含空白字符,如下面的代码块所示
这样就可以了。
证明:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, ReversibleField
import spacy
# Download the two models before running this snippet:
#   spacy download en_core_web_sm
#   spacy download de_core_news_sm
# English and German pipelines used by tokenize_en / tokenize_de.
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")
def tokenize_de(text):
    """Tokenize German ``text`` with ``nlp_de``, keeping whitespace entries.

    Returns a flat list in which every token contributes two consecutive
    items: its ``text`` followed by its ``whitespace_`` value.
    """
    pieces = []
    for tok in nlp_de(text):
        pieces.append(tok.text)
        pieces.append(tok.whitespace_)
    return pieces
def tokenize_en(text):
    """Tokenize English ``text`` with ``nlp_en``, keeping whitespace entries.

    Returns a flat list in which every token contributes two consecutive
    items: its ``text`` followed by its ``whitespace_`` value.
    """
    pieces = []
    for tok in nlp_en(text):
        pieces.append(tok.text)
        pieces.append(tok.whitespace_)
    return pieces
# Options shared by both fields; only the tokenizer differs.
shared_options = {
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'unk_token': '<unk>',
    'lower': True,
    'batch_first': True,
}
SRC = ReversibleField(tokenize=tokenize_de, **shared_options)
TRG = ReversibleField(tokenize=tokenize_en, **shared_options)

# Load the Multi30k splits; '.de' is paired with SRC and '.en' with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

# Build each vocabulary from the training split only.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq=3)

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=3,
    device="cuda:0",
)

# Take the first training batch and convert its `trg` entry back to text.
batch = next(iter(train_iterator))
TRG.reverse(batch.trg)
['asian people wearing helmet waiting to buy food.',
'a mother stands in a kitchen holding a small baby.',
'a person performing a <unk> bicycle jump over dirt ramps.']
ReversibleField 在没有 spacy 的情况下运行良好
在 ReversibleField 中使用 tokenize=None 时,一切正常:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, ReversibleField
import spacy
# Both fields share one configuration; no custom tokenizer is passed.
field_options = {
    'tokenize': None,
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'lower': True,
    'batch_first': True,
}
SRC = ReversibleField(**field_options)
TRG = ReversibleField(**field_options)

# Load the Multi30k splits; '.de' is paired with SRC and '.en' with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

# Build each vocabulary from the training split only.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 3
device = 'cuda:2'

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# Take the first training batch and convert its `trg` entry back to text.
batch = next(iter(train_iterator))
TRG.reverse(batch.trg)
output>>>
['a group of kids playing with tires.',
'seven construction workers working on a building.',
'a man is performing with fire sticks before a crowd outside.']
使用 spacy 时 ReversibleField 失败
但是,当我改用 spacy 作为分词器时,reverse 返回的是一长串连在一起、没有空格的字符串,完全无法阅读。
# spaCy pipelines used by tokenize_de / tokenize_en, loaded via the 'de' / 'en' names.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')
def tokenize_de(text):
    """Split German ``text`` into token strings, returned in reverse order.

    The string is run through ``spacy_de.tokenizer`` and the ``text`` of
    every resulting token is collected into a list, last token first.
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    tokens.reverse()
    return tokens
def tokenize_en(text):
    """Split English ``text`` into a list of token strings.

    The string is run through ``spacy_en.tokenizer`` and the ``text`` of
    every resulting token is returned in order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]
# Options shared by both fields; only the tokenizer differs.
shared_options = {
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'lower': True,
    'batch_first': True,
}
SRC = ReversibleField(tokenize=tokenize_de, **shared_options)
TRG = ReversibleField(tokenize=tokenize_en, **shared_options)

# Load the Multi30k splits; '.de' is paired with SRC and '.en' with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

# Build each vocabulary from the training split only.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq=2)

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# Take the first training batch and convert its `trg` entry back to text.
batch = next(iter(train_iterator))
TRG.reverse(batch.trg)
output >>>
['agroupofkidsplayingwithtires.',
'sevenconstructionworkersworkingonabuilding.',
'amanisperformingwithfiresticksbeforeacrowdoutside.']
这里有什么问题?如何在使用 spacy 时将标记正确转换回字符串?
ReversibleField 的定义中存在一个明显的 bug:
# Excerpt of the ReversibleField class from torchtext/data/field.py.
# Indentation is not preserved here and `...` marks code left out of the excerpt.
class ReversibleField(Field):
def __init__(self, **kwargs):
warnings.warn('{} class will be retired in the 0.8.0 release and moved to torchtext.legacy. Please see 0.7.0 release notes for further information.'.format(self.__class__.__name__), UserWarning)
# NOTE: `is list` is an identity check against the builtin `list` type, so
# this branch is taken only when the caller passes tokenize=list itself.
if kwargs.get('tokenize') is list:
self.use_revtok = False
else:
# Every other tokenize value (None, a custom callable, ...) lands here.
self.use_revtok = True
...
def reverse(self, batch):
# revtok is imported lazily, and only when use_revtok is set.
if self.use_revtok:
try:
import revtok
except ImportError:
print("Please install revtok.")
raise
...
# With use_revtok set, every example is passed through revtok.detokenize
# and that result is returned.
if self.use_revtok:
return [revtok.detokenize(ex) for ex in batch]
你看,除非你把 list 作为 tokenize 关键字参数(kwarg)传入,否则 use_revtok 始终为 True,reverse 总是返回 revtok.detokenize 的结果,即使分词并不是由 revtok 完成的。
解决方法:
- 注释掉上面代码块中的最后两行(该类定义位于
/home/USER/anaconda3/envs/ENV_NAME/lib/python3.7/site-packages/torchtext-0.8.0a0+db31b5d-py3.7-linux-x86_64.egg/torchtext/data/field.py
,第 408-409 行)
- 修改你的分词器,使其输出中包含空白字符,如下面的代码块所示
这样就可以了。
证明:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, ReversibleField
import spacy
# Download the two models before running this snippet:
#   spacy download en_core_web_sm
#   spacy download de_core_news_sm
# English and German pipelines used by tokenize_en / tokenize_de.
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")
def tokenize_de(text):
    """Tokenize German ``text`` with ``nlp_de``, keeping whitespace entries.

    Returns a flat list in which every token contributes two consecutive
    items: its ``text`` followed by its ``whitespace_`` value.
    """
    pieces = []
    for tok in nlp_de(text):
        pieces.append(tok.text)
        pieces.append(tok.whitespace_)
    return pieces
def tokenize_en(text):
    """Tokenize English ``text`` with ``nlp_en``, keeping whitespace entries.

    Returns a flat list in which every token contributes two consecutive
    items: its ``text`` followed by its ``whitespace_`` value.
    """
    pieces = []
    for tok in nlp_en(text):
        pieces.append(tok.text)
        pieces.append(tok.whitespace_)
    return pieces
# Options shared by both fields; only the tokenizer differs.
shared_options = {
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'unk_token': '<unk>',
    'lower': True,
    'batch_first': True,
}
SRC = ReversibleField(tokenize=tokenize_de, **shared_options)
TRG = ReversibleField(tokenize=tokenize_en, **shared_options)

# Load the Multi30k splits; '.de' is paired with SRC and '.en' with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

# Build each vocabulary from the training split only.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq=3)

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=3,
    device="cuda:0",
)

# Take the first training batch and convert its `trg` entry back to text.
batch = next(iter(train_iterator))
TRG.reverse(batch.trg)
['asian people wearing helmet waiting to buy food.',
'a mother stands in a kitchen holding a small baby.',
'a person performing a <unk> bicycle jump over dirt ramps.']