Accuracy 0% for binary classification
I'm running federated learning experiments with the OpenFL framework. I can run their tutorial notebooks without problems; for example, classification on MNIST works fine.
Now I'm working with 2 clients and 2 different datasets, but on a binary classification problem I get an accuracy of about 0%.
Both datasets have the same 2 classes, "neg" and "pos". The images in the first dataset are 3000x2951, while in the second they are 4892x4020; I resize both to 256x256. My network is a ResNet9 with no sigmoid at the end, since I'm using BCEWithLogitsLoss(). Here is some code, so you can check whether everything is fine:
optimizer_adam = optim.Adam(params_to_update, lr=1e-4)

def cross_entropy(output, target):
    """Binary cross-entropy metric."""
    target = target.unsqueeze(1)
    criterion = nn.BCEWithLogitsLoss()
    loss = criterion(output, target.float())
    return loss
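For reference, here is a minimal standalone sketch of the shapes BCEWithLogitsLoss expects (the tensors are made up for illustration):

import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()
logits = torch.randn(4, 1)           # raw model outputs, shape [batch, 1]
labels = torch.tensor([0, 1, 1, 0])  # integer class labels, shape [batch]
# unsqueeze to [batch, 1] and cast to float so shape and dtype match the logits
loss = criterion(logits, labels.unsqueeze(1).float())
print(loss.item())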
def train(net_model, train_loader, optimizer, device, loss_fn=cross_entropy, some_parameter=None):
    torch.manual_seed(0)
    device = 'cpu'
    function_defined_in_notebook(some_parameter)
    train_loader = tqdm.tqdm(train_loader, desc="train")
    net_model.train()
    net_model.to(device)
    losses = []
    for data, target in train_loader:
        data, target = torch.tensor(data).to(device), \
            torch.tensor(target).to(device, dtype=torch.int64)
        optimizer.zero_grad()
        #data = data.type(torch.LongTensor)
        #target = target.type(torch.LongTensor)
        output = net_model(data)
        loss = loss_fn(output=output, target=target)
        loss.backward()
        optimizer.step()
        losses.append(loss.detach().cpu().numpy())
    return {'train_loss': np.mean(losses),}
@task_interface.register_fl_task(model='net_model', data_loader='val_loader', device='device')
def validate(net_model, val_loader, device):
    torch.manual_seed(0)
    device = torch.device('cpu')
    net_model.eval()
    net_model.to(device)
    val_loader = tqdm.tqdm(val_loader, desc="validate")
    val_score = 0
    total_samples = 0
    with torch.no_grad():
        for data, target in val_loader:
            samples = target.shape[0]
            total_samples += samples
            data, target = torch.tensor(data).to(device), \
                torch.tensor(target).to(device, dtype=torch.int64)
            output = net_model(data)
            pred = (output >= 0.5).long()  # Binarize predictions to 0 and 1
            val_score = (pred == target).sum().cpu().item()/data.size(0)
            #val_score += pred.eq(target).sum().cpu().numpy()
    return {'acc': val_score / total_samples,}
I think this is all correct, so the only part that could be wrong is where I import the data, since that is a bit tricky in this federated learning framework. Basically both my datasets are organized like this: /Dataset1(2)/Train(Test)/neg(pos)/images.png. I want to extract x_train, y_train, x_test and y_test, exactly following the structure of the tutorial that works. So this is the solution I came up with:
def download_data(self):
    """Download prepared dataset."""
    image_list_train = []
    image_list_test = []
    x_train = []
    y_train = []
    x_test = []
    y_test = []
    base_dir_train = 'Montgomery_real_splitted/TRAIN/'
    base_dir_test = 'Montgomery_real_splitted/TEST/'
    for f in sorted(os.listdir(base_dir_train)):
        if os.path.isdir(base_dir_train+f):
            print(f"{f} is a target class")
            for i in sorted(os.listdir(base_dir_train+f)):
                y_train.append(f)
                im = Image.open(base_dir_train+f+'/'+i)
                x_train.append(im)
    for f in sorted(os.listdir(base_dir_test)):
        if os.path.isdir(base_dir_test+f):
            print(f"{f} is a target class")
            for i in sorted(os.listdir(base_dir_test+f)):
                y_test.append(f)
                imt = Image.open(base_dir_test+f+'/'+i)
                x_test.append(imt)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    for i in range(len(y_train)):
        if y_train[i] == "neg":
            y_train[i] = 0
        else:
            y_train[i] = 1
    y_train = y_train.astype(np.uint8)
    for i in range(len(y_test)):
        if y_test[i] == "neg":
            y_test[i] = 0
        else:
            y_test[i] = 1
    y_test = y_test.astype(np.uint8)
    print('Mont-china data was loaded!')
    return (x_train, y_train), (x_test, y_test)
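As a quick sanity check on the labels this produces, a class-balance count can rule out a degenerate split (check_balance is a hypothetical helper, not part of the tutorial):

import numpy as np

def check_balance(y, name):
    # counts of labels 0 ("neg") and 1 ("pos"); an empty class would explain degenerate metrics
    counts = np.bincount(y, minlength=2)
    print(f"{name}: neg={counts[0]}, pos={counts[1]}")

# usage after download_data() returns:
# (x_train, y_train), (x_test, y_test) = self.download_data()
# check_balance(y_train, "train"); check_balance(y_test, "test")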
The code above lives in the Python script needed to load the data. Then, in the Jupyter notebook, I have these cells to import the dataset:
normalize = T.Normalize(
    mean=[0.1307],
    std=[0.3081]
)

augmentation = T.RandomApply(
    [T.RandomHorizontalFlip(),
     T.RandomRotation(10)],
    p=.8
)

training_transform = T.Compose(
    [T.Resize((256, 256)),
     augmentation,
     T.ToTensor()]
)

valid_transform = T.Compose(
    [T.Resize((256, 256)),
     T.ToTensor()]
)


class TransformedDataset(Dataset):

    def __init__(self, dataset, transform=None, target_transform=None):
        """Initialize Dataset."""
        self.dataset = dataset
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Length of dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        img, label = self.dataset[index]
        label = self.target_transform(label) if self.target_transform else label
        img = self.transform(img) if self.transform else img
        return img, label


class MontChinaDataset(DataInterface):

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    @property
    def shard_descriptor(self):
        return self._shard_descriptor

    @shard_descriptor.setter
    def shard_descriptor(self, shard_descriptor):
        """
        Describe per-collaborator procedures or sharding.

        This method will be called during a collaborator initialization.
        Local shard_descriptor will be set by Envoy.
        """
        self._shard_descriptor = shard_descriptor
        self.train_set = TransformedDataset(
            self._shard_descriptor.get_dataset('train'),
            transform=training_transform
        )
        self.valid_set = TransformedDataset(
            self._shard_descriptor.get_dataset('val'),
            transform=valid_transform
        )

    def get_train_loader(self, **kwargs):
        """Output of this method will be provided to tasks with optimizer in contract."""
        generator = torch.Generator()
        generator.manual_seed(0)
        return DataLoader(
            self.train_set, batch_size=self.kwargs['train_bs'], shuffle=True, generator=generator
        )

    def get_valid_loader(self, **kwargs):
        """Output of this method will be provided to tasks without optimizer in contract."""
        return DataLoader(self.valid_set, batch_size=self.kwargs['valid_bs'])

    def get_train_data_size(self):
        """Information for aggregation."""
        return len(self.train_set)

    def get_valid_data_size(self):
        """Information for aggregation."""
        return len(self.valid_set)


fed_dataset = MontChinaDataset(train_bs=16, valid_bs=16)
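As a quick local check that the transform pipeline produces what the network expects, one can run a single image through it (the file path below is hypothetical, following the layout described above):

from PIL import Image

img = Image.open('Montgomery_real_splitted/TRAIN/neg/example.png')
x = training_transform(img)
# for grayscale inputs this should give shape [1, 256, 256]; ToTensor() scales values to [0, 1]
print(x.shape, x.dtype, x.min().item(), x.max().item())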
The strange thing is that the loss decreases, while the accuracy stays at or around 0:
[12:29:44] METRIC Round 0, collaborator env_one train result train_loss: 0.673127 experiment.py:116
[12:29:53] METRIC Round 0, collaborator env_one locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:29:56] METRIC Round 0, collaborator env_one aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:30:49] METRIC Round 0, collaborator env_two train result train_loss: 0.562856 experiment.py:116
[12:31:14] METRIC Round 0, collaborator env_two locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:31:19] METRIC Round 0, collaborator env_two aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:31:21] METRIC Round 0, collaborator Aggregator train result train_loss: 0.581464 experiment.py:116
METRIC Round 0, collaborator Aggregator locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:31:22] METRIC Round 0, collaborator Aggregator aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:31:39] METRIC Round 1, collaborator env_one train result train_loss: 0.637785 experiment.py:116
[12:31:41] METRIC Round 1, collaborator env_one locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:31:44] METRIC Round 1, collaborator env_one aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:31:55] METRIC Round 1, collaborator env_two train result train_loss: 0.432979 experiment.py:116
[12:32:00] METRIC Round 1, collaborator env_two locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:32:05] METRIC Round 1, collaborator env_two aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:32:08] METRIC Round 1, collaborator Aggregator train result train_loss: 0.467540 experiment.py:116
METRIC Round 1, collaborator Aggregator locally_tuned_model_validate result acc: 0.000000 experiment.py:116
METRIC Round 1, collaborator Aggregator aggregated_model_validate result acc: 0.000000
It goes on like this for several rounds.
I'm not sure whether this will solve your problem, but your validation code has a couple of bugs (two lines are annotated below):
@task_interface.register_fl_task(model='net_model', data_loader='val_loader', device='device')
def validate(net_model, val_loader, device):
    torch.manual_seed(0)
    device = torch.device('cpu')
    net_model.eval()
    net_model.to(device)
    val_loader = tqdm.tqdm(val_loader, desc="validate")
    val_score = 0
    total_samples = 0
    with torch.no_grad():
        for data, target in val_loader:
            samples = target.shape[0]
            total_samples += samples
            data, target = torch.tensor(data).to(device), \
                torch.tensor(target).to(device, dtype=torch.int64)
            output = net_model(data)
            ## new line vvv
            output = torch.sigmoid(output)  # map logits to probabilities in (0, 1)
            pred = (output >= 0.5).long()  # Binarize predictions to 0 and 1
            ## changed line below
            val_score += (pred == target).sum().cpu().item()  ###/data.size(0)
            #val_score += pred.eq(target).sum().cpu().numpy()
    return {'acc': val_score / total_samples,}
Essentially, the issues are:

- You are comparing against 0.5 without having passed the output through a sigmoid. I know you said you left it out because of your loss function, and that is correct for training, but at evaluation time you still have to apply sigmoid/softmax (see the short illustration after this list).
- You divide val_score by data.size(0) (the batch size?) and then by total_samples, which is not the number of batches but the total number of samples across all batches.
- You are not incrementing val_score on each iteration; you are resetting it. If you have many batches, that would explain why the result is 0 or close to 0.
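To see why the missing sigmoid matters, here is a tiny standalone illustration (the logit values are made up):

import torch

logits = torch.tensor([[-2.0], [-0.3], [0.2], [1.5]])  # raw network outputs
probs = torch.sigmoid(logits)                          # mapped into (0, 1)

# Thresholding raw logits at 0.5 is equivalent to thresholding probabilities
# at sigmoid(0.5) ~= 0.62, which biases predictions toward class 0:
print((logits >= 0.5).long().flatten())  # tensor([0, 0, 0, 1])
print((probs >= 0.5).long().flatten())   # tensor([0, 0, 1, 1])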
Hopefully these fixes get you closer to your goal!