BertTokenizer error — ValueError: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers
import pandas as pd
from sklearn.model_selection import train_test_split

# Read tab-separated text data: one "label<TAB>sentence" pair per line.
df = pd.read_csv('E:/bert4keras-master/resume_data/111.txt', header=None,
                 encoding='utf-8', sep='\t', names=['label', 'sentence'])

# Drop rows with a missing label or sentence. A NaN sentence survives the
# split, reaches the tokenizer as float('nan'), and raises exactly the
# reported "ValueError: Input nan is not valid ..." inside encode_plus.
df = df.dropna(subset=['label', 'sentence']).reset_index(drop=True)
print(df)

# 70 / 15 / 15 stratified split so every split keeps the label distribution.
train, valid_test = train_test_split(df, test_size=0.3, shuffle=True,
                                     random_state=123, stratify=df['label'])
print(valid_test.head())  # head is a method — call it (was printing the bound method object)
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True,
                               random_state=123, stratify=valid_test['label'])

# Re-number rows 0..n-1 so positional lookups (Dataset.__getitem__) work.
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
class CreateDataset(Dataset):
    """torch Dataset yielding BERT-encoded sentences plus one-hot labels.

    Args:
        X: positionally indexable sequence of sentences (strings).
        y: array-like of one-hot label vectors, aligned with X.
        tokenizer: a HuggingFace tokenizer exposing encode_plus().
        max_len: fixed sequence length to pad/truncate to.
    """

    def __init__(self, X, y, tokenizer, max_len):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # len(Dataset): one item per label row.
        return len(self.y)

    def __getitem__(self, index):
        # Dataset[index]
        text = self.X[index]
        # Fail fast with an actionable message instead of the opaque
        # "ValueError: Input nan is not valid" raised deep inside the
        # tokenizer when the source data contains a missing (NaN) sentence.
        if not isinstance(text, str):
            raise TypeError(
                f"sample {index} is not a string: {text!r}; "
                "drop NaN rows from the data before training"
            )
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,        # pad_to_max_length is deprecated;
            padding='max_length',   # this pair is the documented replacement
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        return {
            'ids': torch.LongTensor(ids),
            'mask': torch.LongTensor(mask),
            'labels': torch.Tensor(self.y[index]),
        }
# label one-hot
# One-hot encode the 7 resume-section labels. The explicit column list pins
# the label order so index i means the same class in train/valid/test.
# NOTE(review): recent pandas emits bool dummy columns; torch.Tensor() still
# converts them, but selecting .astype(float) would be safer — confirm.
y_train = pd.get_dummies(train, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
y_valid = pd.get_dummies(valid, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
y_test = pd.get_dummies(test, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
# make dataset
max_len = 256
# NOTE(review): BertTokenizer is not imported in this snippet — presumably
# `from transformers import BertTokenizer` exists elsewhere; verify.
tokenizer = BertTokenizer.from_pretrained('E:/bert4keras-master/pytorch_bert_large/')
dataset_train = CreateDataset(train['sentence'], y_train, tokenizer, max_len)
dataset_valid = CreateDataset(valid['sentence'], y_valid, tokenizer, max_len)
dataset_test = CreateDataset(test['sentence'], y_test, tokenizer, max_len)
# dataloader
# NOTE(review): lowercase `batch_size` is never defined in this snippet while
# the training call uses BATCH_SIZE — likely a NameError or stale variable; confirm.
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
# Validation loads the whole split as a single batch (no shuffling needed).
dataloader_valid = DataLoader(dataset_valid, batch_size=len(dataset_valid), shuffle=False)
当我尝试训练模型时出现此错误 log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, device=device)
错误
>>> log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, device=device)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<stdin>", line 29, in train_model
File "<stdin>", line 8, in calculate_loss_and_accuracy
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 521, in __next__
data = self._next_data()
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 561, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "<stdin>", line 11, in __getitem__
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils_base.py", line 2556, in encode_plus
return self._encode_plus(
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils.py", line 647, in _encode_plus
first_ids = get_input_ids(text)
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils.py", line 634, in get_input_ids
raise ValueError(
ValueError: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
有什么建议请多谢
你应该调试这一段：

    inputs = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        pad_to_max_length=True
    )

这里的 'text' 应为字符串或字符串列表……
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the tab-separated corpus: each line is "label<TAB>sentence".
df = pd.read_csv('E:/bert4keras-master/resume_data/111.txt', header=None,
                 encoding='utf-8', sep='\t', names=['label', 'sentence'])
print(df)

# First cut: 70% train, 30% held out, stratified on the label column.
train, valid_test = train_test_split(df, test_size=0.3, shuffle=True,
                                     random_state=123, stratify=df['label'])
print(valid_test.head)
# Second cut: split the held-out 30% evenly into validation and test.
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True,
                               random_state=123, stratify=valid_test['label'])

# Renumber each split 0..n-1 so positional indexing lines up.
for split in (train, valid, test):
    split.reset_index(drop=True, inplace=True)
class CreateDataset(Dataset):
    """Wrap sentences and one-hot label vectors as a BERT-ready torch Dataset."""

    def __init__(self, X, y, tokenizer, max_len):
        self.X, self.y = X, y
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One item per label row.
        return len(self.y)

    def __getitem__(self, index):
        # Encode the index-th sentence, padded out to max_len.
        encoded = self.tokenizer.encode_plus(
            self.X[index],
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
        )
        return {
            'ids': torch.LongTensor(encoded['input_ids']),
            'mask': torch.LongTensor(encoded['attention_mask']),
            'labels': torch.Tensor(self.y[index]),
        }
# label one-hot
# Build one-hot label matrices; the fixed column list guarantees the same
# class order across all three splits.
y_train = pd.get_dummies(train, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
y_valid = pd.get_dummies(valid, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
y_test = pd.get_dummies(test, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
# make dataset
max_len = 256
# NOTE(review): BertTokenizer / DataLoader are not imported in this snippet —
# presumably imported elsewhere; verify before running.
tokenizer = BertTokenizer.from_pretrained('E:/bert4keras-master/pytorch_bert_large/')
dataset_train = CreateDataset(train['sentence'], y_train, tokenizer, max_len)
dataset_valid = CreateDataset(valid['sentence'], y_valid, tokenizer, max_len)
dataset_test = CreateDataset(test['sentence'], y_test, tokenizer, max_len)
# dataloader
# NOTE(review): `batch_size` (lowercase) is undefined here while training
# uses BATCH_SIZE — likely a NameError or stale variable; confirm.
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
# Validation is evaluated as one full-split batch, in stable order.
dataloader_valid = DataLoader(dataset_valid, batch_size=len(dataset_valid), shuffle=False)
当我尝试训练模型时出现此错误 log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, device=device)
错误
>>> log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, device=device)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<stdin>", line 29, in train_model
File "<stdin>", line 8, in calculate_loss_and_accuracy
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 521, in __next__
data = self._next_data()
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 561, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "<stdin>", line 11, in __getitem__
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils_base.py", line 2556, in encode_plus
return self._encode_plus(
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils.py", line 647, in _encode_plus
first_ids = get_input_ids(text)
File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils.py", line 634, in get_input_ids
raise ValueError(
ValueError: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
有什么建议请多谢
你应该调试 inputs = self.tokenizer.encode_plus(text, add_special_tokens=True, max_length=self.max_len, pad_to_max_length=True)，这里的 'text' 应为字符串或字符串列表……