TextLMDataBunch 和 language_model_learner 的 Fastai 问题
Fastai issue with TextLMDataBunch and language_model_learner
我尝试使用 fastai 库复现这段代码,但运行时遇到了两个主要问题。
这段代码:
data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='idea')
出现这种错误:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-39-74ec5bcc1e2a> in <module>
----> 1 data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='idea')
~\Anaconda3\lib\site-packages\fastai\text\data.py in from_df(cls, path, train_df, valid_df, test_df, tokenizer, vocab, **kwargs)
325 k_names = ['max_vocab', 'min_freq', 'n_labels', 'txt_cols', 'label_cols', 'clear_cache']
326 txt_kwargs, kwargs = extract_kwargs(k_names, kwargs)
--> 327 train_ds = TextDataset.from_df(path, train_df, tokenizer, 'train', vocab=vocab, **txt_kwargs)
328 datasets = [train_ds, TextDataset.from_df(path, valid_df, tokenizer, 'valid', vocab=train_ds.vocab, **txt_kwargs)]
329 if test_df is not None: datasets.append(TextDataset.from_df(path, test_df, tokenizer, 'test', vocab=train_ds.vocab, **txt_kwargs))
~\Anaconda3\lib\site-packages\fastai\text\data.py in from_df(cls, folder, df, tokenizer, name, **kwargs)
150 tokenizer = ifnone(tokenizer, Tokenizer())
151 chunksize = 1 if (type(df) == DataFrame) else df.chunksize
--> 152 return cls(folder, tokenizer, df=df, create_mtd=TextMtd.DF, name=name, chunksize=chunksize, **kwargs)
153
154 @classmethod
~\Anaconda3\lib\site-packages\fastai\text\data.py in __init__(self, path, tokenizer, vocab, max_vocab, chunksize, name, df, min_freq, n_labels, txt_cols, label_cols, create_mtd, classes, clear_cache)
35 os.makedirs(self.path, exist_ok=True)
36 if clear_cache: self.clear()
---> 37 if not self.check_toks(): self.tokenize()
38 if not self.check_ids(): self.numericalize()
39
~\Anaconda3\lib\site-packages\fastai\text\data.py in tokenize(self)
86 df = next(dfs) if (type(dfs) == pd.io.parsers.TextFileReader) else self.df
87 lbl_type = np.float32 if len(self.label_cols) > 1 else np.int64
---> 88 lbls = df[self.label_cols].values.astype(lbl_type) if (len(self.label_cols) > 0) else []
89 self.txt_cols = ifnone(self.txt_cols, list(range(len(self.label_cols),len(df.columns))))
90 texts = f'{FLD} {1} ' + df[self.txt_cols[0]].astype(str)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2680 if isinstance(key, (Series, np.ndarray, Index, list)):
2681 # either boolean or fancy integer index
-> 2682 return self._getitem_array(key)
2683 elif isinstance(key, DataFrame):
2684 return self._getitem_frame(key)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_array(self, key)
2724 return self._take(indexer, axis=0)
2725 else:
-> 2726 indexer = self.loc._convert_to_indexer(key, axis=1)
2727 return self._take(indexer, axis=1)
2728
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
1325 if mask.any():
1326 raise KeyError('{mask} not in index'
-> 1327 .format(mask=objarr[mask]))
1328
1329 return com._values_from_object(indexer)
KeyError: '[0] not in index'
这部分代码:
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)
运行时出现以下错误:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-37-7b61575a202a> in <module>
----> 1 learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
2 learn.fit_one_cycle(1, 1e-2)
NameError: name 'language_model_learner' is not defined
我在安装 fastai 时遇到问题,但我不知道问题出在哪里。
您的问题是 2 个月前提出的,此后这个库经历了很多变化。在我看来,您的第一个错误是因为您没有指定标签所在的列,self.label_cols 被设置为 [0],而 [0] 并不在您的 DataFrame 的索引中。我相信这种行为自您发帖以来已经发生了变化,如今不指定 label_cols 也能按预期工作。
关于您的第二个问题,language_model_learner 在重构之前的调用方式有所不同。您在使用 fastai 时用的是最新版本吗?请用最新版本的 fastai 再测试一次,看看是否还会出现相同的错误。
先运行这条导入语句:from fastai.text.learner import language_model_learner
然后再运行:
'''
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)
'''
我尝试使用 fastai 库复现这段代码,但运行时遇到了两个主要问题。
这段代码:
data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='idea')
出现这种错误:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-39-74ec5bcc1e2a> in <module>
----> 1 data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='idea')
~\Anaconda3\lib\site-packages\fastai\text\data.py in from_df(cls, path, train_df, valid_df, test_df, tokenizer, vocab, **kwargs)
325 k_names = ['max_vocab', 'min_freq', 'n_labels', 'txt_cols', 'label_cols', 'clear_cache']
326 txt_kwargs, kwargs = extract_kwargs(k_names, kwargs)
--> 327 train_ds = TextDataset.from_df(path, train_df, tokenizer, 'train', vocab=vocab, **txt_kwargs)
328 datasets = [train_ds, TextDataset.from_df(path, valid_df, tokenizer, 'valid', vocab=train_ds.vocab, **txt_kwargs)]
329 if test_df is not None: datasets.append(TextDataset.from_df(path, test_df, tokenizer, 'test', vocab=train_ds.vocab, **txt_kwargs))
~\Anaconda3\lib\site-packages\fastai\text\data.py in from_df(cls, folder, df, tokenizer, name, **kwargs)
150 tokenizer = ifnone(tokenizer, Tokenizer())
151 chunksize = 1 if (type(df) == DataFrame) else df.chunksize
--> 152 return cls(folder, tokenizer, df=df, create_mtd=TextMtd.DF, name=name, chunksize=chunksize, **kwargs)
153
154 @classmethod
~\Anaconda3\lib\site-packages\fastai\text\data.py in __init__(self, path, tokenizer, vocab, max_vocab, chunksize, name, df, min_freq, n_labels, txt_cols, label_cols, create_mtd, classes, clear_cache)
35 os.makedirs(self.path, exist_ok=True)
36 if clear_cache: self.clear()
---> 37 if not self.check_toks(): self.tokenize()
38 if not self.check_ids(): self.numericalize()
39
~\Anaconda3\lib\site-packages\fastai\text\data.py in tokenize(self)
86 df = next(dfs) if (type(dfs) == pd.io.parsers.TextFileReader) else self.df
87 lbl_type = np.float32 if len(self.label_cols) > 1 else np.int64
---> 88 lbls = df[self.label_cols].values.astype(lbl_type) if (len(self.label_cols) > 0) else []
89 self.txt_cols = ifnone(self.txt_cols, list(range(len(self.label_cols),len(df.columns))))
90 texts = f'{FLD} {1} ' + df[self.txt_cols[0]].astype(str)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2680 if isinstance(key, (Series, np.ndarray, Index, list)):
2681 # either boolean or fancy integer index
-> 2682 return self._getitem_array(key)
2683 elif isinstance(key, DataFrame):
2684 return self._getitem_frame(key)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_array(self, key)
2724 return self._take(indexer, axis=0)
2725 else:
-> 2726 indexer = self.loc._convert_to_indexer(key, axis=1)
2727 return self._take(indexer, axis=1)
2728
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
1325 if mask.any():
1326 raise KeyError('{mask} not in index'
-> 1327 .format(mask=objarr[mask]))
1328
1329 return com._values_from_object(indexer)
KeyError: '[0] not in index'
这部分代码:
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)
运行时出现以下错误:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-37-7b61575a202a> in <module>
----> 1 learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
2 learn.fit_one_cycle(1, 1e-2)
NameError: name 'language_model_learner' is not defined
我在安装 fastai 时遇到问题,但我不知道问题出在哪里。
您的问题是 2 个月前提出的,此后这个库经历了很多变化。在我看来,您的第一个错误是因为您没有指定标签所在的列,self.label_cols 被设置为 [0],而 [0] 并不在您的 DataFrame 的索引中。我相信这种行为自您发帖以来已经发生了变化,如今不指定 label_cols 也能按预期工作。
关于您的第二个问题,language_model_learner 在重构之前的调用方式有所不同。您在使用 fastai 时用的是最新版本吗?请用最新版本的 fastai 再测试一次,看看是否还会出现相同的错误。
先运行这条导入语句:from fastai.text.learner import language_model_learner
然后再运行:
'''
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)
'''