如何在 mxnet 中进行加权 softmax 输出自定义操作?

How to do weighted softmax output custom op in mxnet?

我想用加权版本替换mx.symbol.SoftmaxOutput(在整个数据集中根据标签的频率分配不同的权重)

原始函数运行良好,如下所示:

cls_prob = mx.symbol.SoftmaxOutput(data=data,
                                   label=label,
                                   multi_output=True,
                                   normalization='valid',
                                   use_ignore=True, 
                                   ignore_label=-1,
                                   name='cls_prob')

我目前写的代码如下。代码可以运行且没有错误,但损失很快爆炸为 NaN。我正在处理检测问题,当我将我的代码用作 CustomOp 时,RCNNL1 损失很快变成 NaN。另一件事是我必须忽略标签 -1,并且我不确定如何正确地执行它。任何帮助将不胜感激。

import mxnet as mx
import numpy as np

class WeightedSoftmaxCrossEntropyLoss(mx.operator.CustomOp):
    """Softmax cross-entropy loss with per-class gradient weights.

    Forward reuses mx.nd.SoftmaxOutput to produce class probabilities.
    Backward computes the usual softmax-CE gradient (pred - one_hot),
    zeroes it for positions labeled -1 (ignore label), scales it by a
    per-class weight, and normalizes by the number of valid (non-ignored)
    labels — matching ``normalization='valid'`` of the built-in op.
    """

    def __init__(self, num_class):
        super(WeightedSoftmaxCrossEntropyLoss, self).__init__()
        self.num_class = int(num_class)
        # Per-class weights (index == class id), precomputed from the
        # label frequencies over the whole dataset.
        self.cls_weight = np.array([
            0.002852781814876101,
            0.30715984513157385,
            1.0932468996115976,
            1.1598757152765971,
            0.20739109264009636,
            1.1984256112776808,
            0.18746186040248036,
            2.9009928470737023,
            0.92140970338602113,
            1.200317380251021
        ])

    def forward(self, is_train, req, in_data, out_data, aux):
        data = in_data[0]
        label = in_data[1]
        # Probabilities only; the gradient SoftmaxOutput would produce in
        # its own backward pass is replaced by our weighted one below.
        pred = mx.nd.SoftmaxOutput(data, label, multi_output=True,
                                   normalization='valid', use_ignore=True,
                                   ignore_label=-1, name='rcnn_cls_prob')
        self.assign(out_data[0], req[0], pred)

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        label = in_data[1].asnumpy().astype('int32').reshape((-1,))
        pred = out_data[0]
        orig_shape = pred.shape
        # Flatten (N, C, ...) -> (N * spatial, C) so each row is one
        # softmax distribution aligned with one label entry.
        pred_np = pred.asnumpy().reshape(
            (orig_shape[0], orig_shape[1], -1)).transpose((0, 2, 1))
        pred_np = pred_np.reshape((label.shape[0], -1))

        valid = label != -1
        one_hot = np.zeros((label.shape[0], self.num_class))
        # Only index with valid labels: label == -1 would otherwise hit
        # the LAST class column via negative indexing.
        one_hot[np.arange(label.shape[0])[valid], label[valid]] = 1.0

        # Softmax cross-entropy gradient, weighted per class.
        dx = pred_np - one_hot
        dx[~valid] = 0.0  # ignore_label=-1: no gradient for ignored rows
        dx *= self.cls_weight  # broadcast over rows, one weight per class

        # normalization='valid': divide by the count of non-ignored labels
        # (guard against division by zero when everything is ignored).
        dx /= max(int(valid.sum()), 1)

        # Restore the original (N, C, ...) layout expected by in_grad[0].
        dx = dx.reshape(
            (orig_shape[0], -1, orig_shape[1])).transpose((0, 2, 1)).reshape(orig_shape)
        self.assign(in_grad[0], req[0], mx.nd.array(dx, ctx=in_data[0].context))

@mx.operator.register("weighted_softmax_ce_loss")
class WeightedSoftmaxCrossEntropyLossProp(mx.operator.CustomOpProp):
    """Property class registering the weighted softmax CE custom op.

    ``need_top_grad=False`` because this is a loss layer: it is the end
    of the graph and computes its own gradient in backward.
    """

    def __init__(self, num_class):
        super(WeightedSoftmaxCrossEntropyLossProp, self).__init__(need_top_grad=False)
        self.num_class = num_class

    def list_arguments(self):
        return ['data', 'label']

    def list_outputs(self):
        return ['output']

    def infer_shape(self, in_shapes):
        data_shape = in_shapes[0]
        # NOTE(review): (batch,) assumes one label per sample; for a
        # multi_output / spatial softmax the label would be
        # (batch, num_anchors) — confirm against the caller.
        label_shape = (in_shapes[0][0],)
        output_shape = in_shapes[0]
        return [data_shape, label_shape], [output_shape], []

    def create_operator(self, ctx, in_shapes, in_dtypes):
        # Fixed: removed stray "`enter code here`" paste artifact that
        # made this line a syntax error in the original post.
        return WeightedSoftmaxCrossEntropyLoss(self.num_class)

我不确定在这里使用 CustomOp 是否是最好的方式,因为它可能很慢。由于 SoftmaxOutput 在反向传播中直接计算梯度,不方便像你设想的那样对损失逐项加权。不过,用符号(symbol)API 实现并不太复杂。我附上了一个玩具示例,希望对您有所帮助。

import mxnet as mx
import numpy as np
import logging

# Toy task: learn the floor function from random numbers drawn in
# [-1, -1 + num_classes]. Labels of -1 are treated as "ignore".
n = 10000
batch_size = 128
num_classes = 10
x = (np.random.random((n,)) * num_classes) - 1
y = np.floor(x)
print(x[:2])
print(y[:2])

# Define the graph: a small MLP producing class probabilities.
data = mx.symbol.Variable('data')
label = mx.symbol.Variable('label')
class_weights = mx.symbol.Variable('class_weights')
fc = mx.sym.FullyConnected(data=data, num_hidden=num_classes)
fc = mx.sym.Activation(data=fc, act_type='relu')
proba = mx.sym.FullyConnected(data=fc, num_hidden=num_classes)
proba = mx.sym.softmax(proba)

# Weighted cross-entropy: pick the probability of the true class and
# multiply by that class's weight (weights are fed as an input array).
cross_entropy = -mx.sym.pick(proba, label) * mx.sym.pick(class_weights, label)

# Mask the loss to zero where the label is -1 (ignore label).
mask = mx.sym.broadcast_not_equal(label, mx.sym.ones_like(label) * -1)
cross_entropy = cross_entropy * mask

# Fit the module; class_weights is replicated per sample so pick() can
# index it row-wise alongside the labels.
class_weights = np.array([np.arange(1, 1 + num_classes)] * n)
data_iter = mx.io.NDArrayIter(data={'data': x, 'class_weights': class_weights}, label={'label': y}, batch_size=batch_size)
mod = mx.mod.Module(
    mx.sym.Group([mx.sym.MakeLoss(cross_entropy, name='ce_loss'), mx.sym.BlockGrad(proba)]),
    data_names=[v.name for v in data_iter.provide_data],
    label_names=[v.name for v in data_iter.provide_label]
)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
mod.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label)
mod.init_params()
mod.fit(
    data_iter,
    num_epoch=200,
    optimizer=mx.optimizer.Adam(learning_rate=0.01, rescale_grad=1.0/batch_size),
    batch_end_callback=mx.callback.Speedometer(batch_size, 200),
    eval_metric=mx.metric.Loss(name="loss", output_names=["ce_loss_output"]))

# Show the result; -1 samples are not predicted correctly since their
# loss was masked out. list(...) so Python 3 prints the pairs, not a
# lazy zip object.
probas = mod.predict(data_iter)[1].asnumpy()
print(list(zip(x, np.argmax(probas, axis=1))))