OverflowError: Python int too large to convert to C long torchtext.datasets.text_classification.DATASETS['AG_NEWS']()
I have a 64-bit Windows 10 OS.
I have installed Python 3.6.8.
I have installed torch and torchtext using pip.
The torch version is 1.2.0.
I am trying to load the AG_NEWS dataset with the following code:
import torch
import torchtext
from torchtext.datasets import text_classification
NGRAMS = 2
import os
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](root='./.data', ngrams=NGRAMS, vocab=None)
The last statement of the code above raises the following error:
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-1-7e8544fdaaf6> in <module>
6 if not os.path.isdir('./.data'):
7 os.mkdir('./.data')
----> 8 train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](root='./.data', ngrams=NGRAMS, vocab=None)
9 # BATCH_SIZE = 16
10 # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\datasets\text_classification.py in AG_NEWS(*args, **kwargs)
168 """
169
--> 170 return _setup_datasets(*(("AG_NEWS",) + args), **kwargs)
171
172
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\datasets\text_classification.py in _setup_datasets(dataset_name, root, ngrams, vocab, include_unk)
126 if vocab is None:
127 logging.info('Building Vocab based on {}'.format(train_csv_path))
--> 128 vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
129 else:
130 if not isinstance(vocab, Vocab):
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\vocab.py in build_vocab_from_iterator(iterator)
555 counter = Counter()
556 with tqdm(unit_scale=0, unit='lines') as t:
--> 557 for tokens in iterator:
558 counter.update(tokens)
559 t.update(1)
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\datasets\text_classification.py in _csv_iterator(data_path, ngrams, yield_cls)
33 with io.open(data_path, encoding="utf8") as f:
34 reader = unicode_csv_reader(f)
---> 35 for row in reader:
36 tokens = ' '.join(row[1:])
37 tokens = tokenizer(tokens)
c:\users\pramodp\appdata\local\programs\python\python36\lib\site-packages\torchtext\utils.py in unicode_csv_reader(unicode_csv_data, **kwargs)
128 maxInt = int(maxInt / 10)
129
--> 130 csv.field_size_limit(sys.maxsize)
131
132 if six.PY2:
OverflowError: Python int too large to convert to C long
I think the problem is with the Windows OS or with torchtext, because I get the same error from the following code as well:
pos = data.TabularDataset(
    path='data/pos/pos_wsj_train.tsv', format='tsv',
    fields=[('text', data.Field()), ('labels', data.Field())])
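A quick check, independent of torchtext (a minimal sketch assuming 64-bit Windows CPython, where a C long is only 32 bits), reproduces the same error without reading any file at all:

import csv
import ctypes
import sys

# On 64-bit Windows a C long is still 32 bits, while sys.maxsize is 2**63 - 1,
# so this call overflows regardless of what is in the CSV file.
print(sys.maxsize)                   # 9223372036854775807
print(ctypes.sizeof(ctypes.c_long))  # 4 on Windows, 8 on typical 64-bit Linux
csv.field_size_limit(sys.maxsize)    # OverflowError: Python int too large to convert to C long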
Can anyone help? In particular, I don't have any large numeric values in the file.
I ran into a similar problem as well. I changed one line of code in my torchtext\utils.py file and the error went away.
Changed this:
csv.field_size_limit(sys.maxsize)
to this:
csv.field_size_limit(maxInt)
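For context, a minimal standalone sketch of the usual back-off pattern (which the maxInt loop visible in the traceback appears to be attempting, and which the one-line change above makes effective) looks like this, using nothing beyond the standard library:

import csv
import sys

# Back off from sys.maxsize until the csv module accepts the limit; on
# 64-bit Windows a C long is 32 bits, so sys.maxsize itself overflows.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        maxInt = int(maxInt / 10)

Newer torchtext releases may already include an equivalent fix upstream, so upgrading torchtext is worth trying before editing files under site-packages.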