Label Smoothing in PyTorch
I'm building a ResNet-18 classification model for the Stanford Cars dataset using transfer learning. I would like to implement label smoothing to penalize overconfident predictions and improve generalization.
TensorFlow has a simple keyword argument in CrossEntropyLoss. Has anyone built a similar function for PyTorch that I could plug and play with?
None that I know of. Here are two examples of PyTorch implementations:
- the LabelSmoothingLoss module in the OpenNMT framework for machine translation
- attention-is-all-you-need-pytorch, a re-implementation of Google's "Attention Is All You Need" paper
I was looking for an option that derives from _Loss, like the other loss classes in PyTorch, and that respects basic parameters such as reduction. Unfortunately I couldn't find a straight replacement, so I ended up writing my own. I have not fully tested it yet, though:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothCrossEntropyLoss(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth_one_hot(targets: torch.Tensor, n_classes: int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = torch.empty(size=(targets.size(0), n_classes),
                                  device=targets.device) \
                .fill_(smoothing / (n_classes - 1)) \
                .scatter_(1, targets.data.unsqueeze(1), 1. - smoothing)
        return targets

    def forward(self, inputs, targets):
        targets = SmoothCrossEntropyLoss._smooth_one_hot(targets, inputs.size(-1),
                                                         self.smoothing)
        lsm = F.log_softmax(inputs, -1)

        if self.weight is not None:
            lsm = lsm * self.weight.unsqueeze(0)

        loss = -(targets * lsm).sum(-1)

        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()

        return loss
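As a rough usage sketch (the batch size, the smoothing value, and the 196-class Stanford Cars output below are illustrative assumptions, not part of the original code), reusing the class and imports defined above:

# Illustrative only: a fake batch standing in for ResNet-18 logits on Stanford Cars.
criterion = SmoothCrossEntropyLoss(smoothing=0.1)
logits = torch.randn(8, 196, requires_grad=True)   # (batch, n_classes)
labels = torch.randint(0, 196, (8,))               # integer class labels
loss = criterion(logits, labels)                   # scalar, mean-reduced by default
loss.backward()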
Other options:
- the utils.pytorch implementation
- the DeepMatch implementation
Label smoothing PyTorch implementation
Reference: https://github.com/wangleiofficial/label-smoothing-pytorch
import torch.nn as nn
import torch.nn.functional as F

def linear_combination(x, y, epsilon):
    return epsilon * x + (1 - epsilon) * y

def reduce_loss(loss, reduction='mean'):
    return loss.mean() if reduction == 'mean' else loss.sum() if reduction == 'sum' else loss

class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, epsilon: float = 0.1, reduction='mean'):
        super().__init__()
        self.epsilon = epsilon
        self.reduction = reduction

    def forward(self, preds, target):
        n = preds.size()[-1]
        log_preds = F.log_softmax(preds, dim=-1)
        loss = reduce_loss(-log_preds.sum(dim=-1), self.reduction)
        nll = F.nll_loss(log_preds, target, reduction=self.reduction)
        return linear_combination(loss / n, nll, self.epsilon)
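A minimal sketch of how it might be called (the tensors and epsilon below are made up for illustration):

import torch

# Illustrative call; shapes and epsilon are arbitrary assumptions.
criterion = LabelSmoothingCrossEntropy(epsilon=0.1)
logits = torch.randn(4, 10, requires_grad=True)   # batch of 4, 10 classes
target = torch.randint(0, 10, (4,))
print(criterion(logits, target))                  # scalar tensor (mean-reduced)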
The generalization and learning speed of a multi-class neural network can often be significantly improved by using soft targets that are a weighted average of the hard targets and the uniform distribution over labels. Smoothing the labels in this way prevents the network from becoming overconfident, and label smoothing has been used in many state-of-the-art models, including image classification, language translation, and speech recognition.
Label smoothing is already implemented in TensorFlow's cross-entropy loss functions (BinaryCrossentropy, CategoricalCrossentropy), but currently there is no official implementation of label smoothing in PyTorch. However, there is an active discussion about it, and hopefully it will be provided in an official release. Here is that discussion thread: Issue #7455.
Here we bring some of the best available implementations of label smoothing (LS) from PyTorch practitioners. There are many ways to implement LS; for details, please refer to the specific discussions, one here, and another here. Here we will implement it in 2 distinct ways, with two versions of each; so 4 in total.
Option 1: CrossEntropyLossWithProbs
In this way, it accepts one-hot target vectors. The user has to smooth the target vector manually, which can be done inside a with torch.no_grad() scope, since that temporarily sets all requires_grad flags to false.
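To make the manual-smoothing step concrete, here is a small hedged sketch (the class count and smoothing factor are arbitrary); the implementations below do essentially the same thing inside their forward pass:

import torch

# Hedged illustration of manual target smoothing; the values are arbitrary.
n_classes, smoothing = 5, 0.1
hard_targets = torch.tensor([2, 0])                     # integer class labels

with torch.no_grad():                                   # smoothed targets need no gradients
    soft_targets = torch.full((hard_targets.size(0), n_classes),
                              smoothing / (n_classes - 1))
    soft_targets.scatter_(1, hard_targets.unsqueeze(1), 1.0 - smoothing)

print(soft_targets)   # each row: 1 - smoothing on the true class, the rest spread evenly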
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.modules.loss import _WeightedLoss

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1, weight=None):
        """if smoothing == 0, it's one-hot method
           if 0 < smoothing < 1, it's smooth method
        """
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.weight = weight
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        assert 0 <= self.smoothing < 1
        pred = pred.log_softmax(dim=self.dim)

        if self.weight is not None:
            pred = pred * self.weight.unsqueeze(0)

        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)

        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
In addition, we added an assertion check on self.smoothing and added loss-weighting support to this implementation.
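A brief hedged sketch of that weighting support, reusing the class defined above (the per-class weights below are arbitrary):

# Hypothetical per-class weights; values chosen only for illustration.
class_weights = torch.tensor([1.0, 2.0, 1.0, 1.0, 0.5])
crit = LabelSmoothingLoss(classes=5, smoothing=0.1, weight=class_weights)

logits = torch.randn(3, 5, requires_grad=True)
labels = torch.LongTensor([2, 1, 0])
print(crit(logits, labels))   # weighted, smoothed cross-entropy (batch mean)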
Shital has already posted the answer here. We want to point out that this implementation is similar to Devin Yang's implementation above; however, we repeat his code here with the syntax trimmed down a little.
class SmoothCrossEntropyLoss(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    def k_one_hot(self, targets: torch.Tensor, n_classes: int, smoothing=0.0):
        with torch.no_grad():
            targets = torch.empty(size=(targets.size(0), n_classes),
                                  device=targets.device) \
                .fill_(smoothing / (n_classes - 1)) \
                .scatter_(1, targets.data.unsqueeze(1), 1. - smoothing)
        return targets

    def reduce_loss(self, loss):
        return loss.mean() if self.reduction == 'mean' else loss.sum() \
            if self.reduction == 'sum' else loss

    def forward(self, inputs, targets):
        assert 0 <= self.smoothing < 1
        targets = self.k_one_hot(targets, inputs.size(-1), self.smoothing)
        log_preds = F.log_softmax(inputs, -1)

        if self.weight is not None:
            log_preds = log_preds * self.weight.unsqueeze(0)

        return self.reduce_loss(-(targets * log_preds).sum(dim=-1))
Check
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.modules.loss import _WeightedLoss

if __name__ == "__main__":
    # 1. Devin Yang
    crit = LabelSmoothingLoss(classes=5, smoothing=0.5)
    predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
                                 [0, 0.9, 0.2, 0.2, 1],
                                 [1, 0.2, 0.7, 0.9, 1]])
    v = crit(Variable(predict),
             Variable(torch.LongTensor([2, 1, 0])))
    print(v)

    # 2. Shital Shah
    crit = SmoothCrossEntropyLoss(smoothing=0.5)
    predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
                                 [0, 0.9, 0.2, 0.2, 1],
                                 [1, 0.2, 0.7, 0.9, 1]])
    v = crit(Variable(predict),
             Variable(torch.LongTensor([2, 1, 0])))
    print(v)
tensor(1.4178)
tensor(1.4178)
Option 2: LabelSmoothingCrossEntropyLoss
With this, it accepts the integer target vector, and the user does not have to smooth the target vector manually; the built-in module takes care of the label smoothing. It allows us to implement label smoothing in terms of F.nll_loss: the result is the linear combination (1 - smoothing) * nll + smoothing * mean(-log_softmax(preds)) of the usual negative log-likelihood and the mean negative log-probability over all classes.
(a) Wangleiofficial: Source - (AFAIK), original poster
(b) Datasaurus: Source - added weighting support
In addition, we slightly trimmed the code to make it more concise.
class LabelSmoothingLoss(torch.nn.Module):
    def __init__(self, smoothing: float = 0.1,
                 reduction="mean", weight=None):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.reduction = reduction
        self.weight = weight

    def reduce_loss(self, loss):
        return loss.mean() if self.reduction == 'mean' else loss.sum() \
            if self.reduction == 'sum' else loss

    def linear_combination(self, x, y):
        return self.smoothing * x + (1 - self.smoothing) * y

    def forward(self, preds, target):
        assert 0 <= self.smoothing < 1

        if self.weight is not None:
            self.weight = self.weight.to(preds.device)

        n = preds.size(-1)
        log_preds = F.log_softmax(preds, dim=-1)
        loss = self.reduce_loss(-log_preds.sum(dim=-1))
        nll = F.nll_loss(
            log_preds, target, reduction=self.reduction, weight=self.weight
        )
        return self.linear_combination(loss / n, nll)
class LabelSmoothing(nn.Module):
    """NLL loss with label smoothing."""

    def __init__(self, smoothing=0.0):
        """Constructor for the LabelSmoothing module.
        :param smoothing: label smoothing factor
        """
        super(LabelSmoothing, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing

    def forward(self, x, target):
        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()
Check
if __name__ == "__main__":
    # Wangleiofficial
    crit = LabelSmoothingLoss(smoothing=0.3, reduction="mean")
    predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
                                 [0, 0.9, 0.2, 0.2, 1],
                                 [1, 0.2, 0.7, 0.9, 1]])
    v = crit(Variable(predict),
             Variable(torch.LongTensor([2, 1, 0])))
    print(v)

    # NVIDIA
    crit = LabelSmoothing(smoothing=0.3)
    predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
                                 [0, 0.9, 0.2, 0.2, 1],
                                 [1, 0.2, 0.7, 0.9, 1]])
    v = crit(Variable(predict),
             Variable(torch.LongTensor([2, 1, 0])))
    print(v)
tensor(1.3883)
tensor(1.3883)
Update: officially added
torch.nn.CrossEntropyLoss(weight=None, size_average=None,
                          ignore_index=-100, reduce=None,
                          reduction='mean', label_smoothing=0.0)
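A quick hedged sketch of the built-in argument, reusing the toy tensors from the checks above (PyTorch >= 1.10 assumed); since the official formula is also (1 - smoothing) * nll + smoothing * mean(-log_softmax(preds)), it should reproduce the Option 2 result:

import torch

# Requires PyTorch >= 1.10; reuses the toy tensors from the checks above.
crit = torch.nn.CrossEntropyLoss(label_smoothing=0.3)
predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
                             [0, 0.9, 0.2, 0.2, 1],
                             [1, 0.2, 0.7, 0.9, 1]])
print(crit(predict, torch.LongTensor([2, 1, 0])))   # expected to match tensor(1.3883) above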
This is not currently implemented officially in PyTorch, but it has been proposed as a high-priority Feature Request #7455, and separately in TorchVision Issue #2980.
There are numerous implementations in other libraries:
- NJUNMT-pytorch: NMTCritierion()._smooth_label()
- Snorkel: snorkel.classification.cross_entropy_with_probs()
- OpenNMT: LabelSmoothingLoss()
As well as a number of unofficial implementations / code snippets.
TensorFlow / Keras implementation: tf.keras.losses.CategoricalCrossentropy(label_smoothing)
As of version 1.10.0, PyTorch officially supports label smoothing and soft targets in torch.nn.CrossEntropyLoss.
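For completeness, a small hedged sketch of the soft-target form (PyTorch >= 1.10; the probabilities below are made up):

import torch
import torch.nn as nn

# Hedged sketch: CrossEntropyLoss accepting class probabilities (PyTorch >= 1.10).
criterion = nn.CrossEntropyLoss()

logits = torch.randn(2, 3, requires_grad=True)
soft_targets = torch.tensor([[0.7, 0.2, 0.1],    # a probability distribution per sample
                             [0.1, 0.1, 0.8]])   # instead of an integer class index
loss = criterion(logits, soft_targets)
loss.backward()
print(loss)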