AttributeError: Caught AttributeError in DataLoader worker process 0. - fine tuning pre-trained transformer model
AttributeError: Caught AttributeError in DataLoader worker process 0. - fine tuning pre-trained transformer model
谁能帮我解决这个错误?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-aaa58b106c77> in <module>()
25 output_path='fine_tuned_bert',
26 save_best_model= True,
---> 27 show_progress_bar= True
28 )
4 frames
/usr/local/lib/python3.7/dist-packages/torch/_utils.py in reraise(self)
423 # have message field
424 raise self.exc_type(message=msg)
--> 425 raise self.exc_type(msg)
426
427
AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py", line 518, in smart_batching_collate
num_texts = len(batch[0].texts)
AttributeError: 'str' object has no attribute 'texts'
代码:
import pandas as pd
# initialise data of lists.
data = {'input':[
"Alpro, Cioccolato bevanda a base di soia 1 ltr", #Alpro, Chocolate soy drink 1 ltr
"Milka cioccolato al latte 100 g", #Milka milk chocolate 100 g
"Danone, HiPRO 25g Proteine gusto cioccolato 330 ml", #Danone, HiPRO 25g Protein chocolate flavor 330 ml
]
}
# Creates pandas DataFrame.
x_sample = pd.DataFrame(data)
print(x_sample['input'])
# load model
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation
from torch.utils.data import DataLoader
embedder = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1') # or any other pretrained model
print("embedder loaded...")
# define your train dataset, the dataloader, and the train loss
train_dataset = SentencesDataset(x_sample["input"].tolist(), embedder)
train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=4, num_workers=1)
train_loss = losses.CosineSimilarityLoss(embedder)
# dummy evaluator to make the api work
sentences1 = ['latte al cioccolato', 'latte al cioccolato','latte al cioccolato']
sentences2 = ['Alpro, Cioccolato bevanda a base di soia 1 ltr', 'Danone, HiPRO 25g Proteine gusto cioccolato 330 ml','Milka cioccolato al latte 100 g']
scores = [0.99,0.95,0.4]
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
# tune the model
embedder.fit(train_objectives=[(train_dataloader, train_loss)],
epochs=5,
warmup_steps=500,
evaluator=evaluator,
evaluation_steps=1,
output_path='fine_tuned_bert',
save_best_model= True,
show_progress_bar= True
)
[已更新]
我浏览了几行关于如何使用 fit()
方法的文档 here,我意识到有一个更简单的解决方案可以满足您的需求。您需要考虑的唯一更改是定义适当的 InputExample
以构建 DataLoader 并造成损失!
import pandas as pd
# initialise data of lists.
data = {'input':[
"Alpro, Cioccolato bevanda a base di soia 1 ltr", #Alpro, Chocolate soy drink 1 ltr
"Milka cioccolato al latte 100 g", #Milka milk chocolate 100 g
"Danone, HiPRO 25g Proteine gusto cioccolato 330 ml", #Danone, HiPRO 25g Protein chocolate flavor 330 ml
]
}
# Creates pandas DataFrame.
x_sample = pd.DataFrame(data)
print(x_sample['input'])
# load model
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation
from torch.utils.data import DataLoader
embedder = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1') # or any other pretrained model
print("embedder loaded...")
# define your train dataset, the dataloader, and the train loss
# train_dataset = SentencesDataset(x_sample["input"].tolist(), embedder)
# train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=4, num_workers=1)
# train_loss = losses.CosineSimilarityLoss(embedder)
# dummy evaluator to make the api work
sentences1 = ['latte al cioccolato', 'latte al cioccolato','latte al cioccolato']
sentences2 = ['Alpro, Cioccolato bevanda a base di soia 1 ltr', 'Danone, HiPRO 25g Proteine gusto cioccolato 330 ml','Milka cioccolato al latte 100 g']
scores = [0.99,0.95,0.4]
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
examples = []
for s1,s2,l in zip(sentences1, sentences2, scores):
examples.append(InputExample(texts=[s1, s2], label=l))
train_dataloader = DataLoader(examples, shuffle=False, batch_size=4, num_workers=1)
train_loss = losses.CosineSimilarityLoss(embedder)
# tune the model
embedder.fit(train_objectives=[(train_dataloader, train_loss)],
epochs=5,
warmup_steps=500,
evaluator=evaluator,
evaluation_steps=1,
output_path='fine_tuned_bert',
save_best_model= True,
show_progress_bar= True
)
谁能帮我解决这个错误?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-aaa58b106c77> in <module>()
25 output_path='fine_tuned_bert',
26 save_best_model= True,
---> 27 show_progress_bar= True
28 )
4 frames
/usr/local/lib/python3.7/dist-packages/torch/_utils.py in reraise(self)
423 # have message field
424 raise self.exc_type(message=msg)
--> 425 raise self.exc_type(msg)
426
427
AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py", line 518, in smart_batching_collate
num_texts = len(batch[0].texts)
AttributeError: 'str' object has no attribute 'texts'
代码:
import pandas as pd
# initialise data of lists.
data = {'input':[
"Alpro, Cioccolato bevanda a base di soia 1 ltr", #Alpro, Chocolate soy drink 1 ltr
"Milka cioccolato al latte 100 g", #Milka milk chocolate 100 g
"Danone, HiPRO 25g Proteine gusto cioccolato 330 ml", #Danone, HiPRO 25g Protein chocolate flavor 330 ml
]
}
# Creates pandas DataFrame.
x_sample = pd.DataFrame(data)
print(x_sample['input'])
# load model
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation
from torch.utils.data import DataLoader
embedder = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1') # or any other pretrained model
print("embedder loaded...")
# define your train dataset, the dataloader, and the train loss
train_dataset = SentencesDataset(x_sample["input"].tolist(), embedder)
train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=4, num_workers=1)
train_loss = losses.CosineSimilarityLoss(embedder)
# dummy evaluator to make the api work
sentences1 = ['latte al cioccolato', 'latte al cioccolato','latte al cioccolato']
sentences2 = ['Alpro, Cioccolato bevanda a base di soia 1 ltr', 'Danone, HiPRO 25g Proteine gusto cioccolato 330 ml','Milka cioccolato al latte 100 g']
scores = [0.99,0.95,0.4]
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
# tune the model
embedder.fit(train_objectives=[(train_dataloader, train_loss)],
epochs=5,
warmup_steps=500,
evaluator=evaluator,
evaluation_steps=1,
output_path='fine_tuned_bert',
save_best_model= True,
show_progress_bar= True
)
[已更新]
我浏览了几行关于如何使用 fit()
方法的文档 here,我意识到有一个更简单的解决方案可以满足您的需求。您需要考虑的唯一更改是定义适当的 InputExample
以构建 DataLoader 并造成损失!
import pandas as pd
# initialise data of lists.
data = {'input':[
"Alpro, Cioccolato bevanda a base di soia 1 ltr", #Alpro, Chocolate soy drink 1 ltr
"Milka cioccolato al latte 100 g", #Milka milk chocolate 100 g
"Danone, HiPRO 25g Proteine gusto cioccolato 330 ml", #Danone, HiPRO 25g Protein chocolate flavor 330 ml
]
}
# Creates pandas DataFrame.
x_sample = pd.DataFrame(data)
print(x_sample['input'])
# load model
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation
from torch.utils.data import DataLoader
embedder = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1') # or any other pretrained model
print("embedder loaded...")
# define your train dataset, the dataloader, and the train loss
# train_dataset = SentencesDataset(x_sample["input"].tolist(), embedder)
# train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=4, num_workers=1)
# train_loss = losses.CosineSimilarityLoss(embedder)
# dummy evaluator to make the api work
sentences1 = ['latte al cioccolato', 'latte al cioccolato','latte al cioccolato']
sentences2 = ['Alpro, Cioccolato bevanda a base di soia 1 ltr', 'Danone, HiPRO 25g Proteine gusto cioccolato 330 ml','Milka cioccolato al latte 100 g']
scores = [0.99,0.95,0.4]
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
examples = []
for s1,s2,l in zip(sentences1, sentences2, scores):
examples.append(InputExample(texts=[s1, s2], label=l))
train_dataloader = DataLoader(examples, shuffle=False, batch_size=4, num_workers=1)
train_loss = losses.CosineSimilarityLoss(embedder)
# tune the model
embedder.fit(train_objectives=[(train_dataloader, train_loss)],
epochs=5,
warmup_steps=500,
evaluator=evaluator,
evaluation_steps=1,
output_path='fine_tuned_bert',
save_best_model= True,
show_progress_bar= True
)