Flux.jl : 自定义优化器

Flux.jl : Customizing optimizer

我正在尝试实现无梯度优化器函数,以使用 Flux.jl 与 Julia 一起训练卷积神经网络。参考论文是这样的:https://arxiv.org/abs/2005.05955。本文提出了 RSO,一种无梯度优化算法,在采样基​​础上一次更新单个权重。该算法的伪代码如下图所示

optimizer_pseudocode

我正在使用 MNIST 数据集。

function train(; kws...)
args = Args(; kws...) # collect options in a stuct for convinience

if CUDA.functional() && args.use_cuda
    @info "Training on CUDA GPU"
    CUDA.allwoscalar(false)
    device = gpu
else
    @info "Training on CPU"
    device = cpu
end

# Prepare datasets
x_train, x_test, y_train, y_test = getdata(args, device)

# Create DataLoaders (mini-batch iterators)
train_loader = DataLoader((x_train, y_train), batchsize=args.batchsize, shuffle=true)
test_loader = DataLoader((x_test, y_test), batchsize=args.batchsize)

# Construct model
model = build_model() |> device
ps = Flux.params(model) # model's trainable parameters

best_param = ps
if args.optimiser == "SGD"
    # Regular training step with SGD

elseif args.optimiser == "RSO"
    # Run RSO function and update ps
    best_param .= RSO(x_train, y_train, args.RSOupdate, model, args.batchsize, device)
end

以及对应的RSO函数:

function RSO(X,L,C,model, batch_size, device)
"""
model = convolutional model structure
X = Input data
L = labels
C = Number of rounds to update parameters
W = Weight set of layers
Wd = Weight tensors of layer d that generates an activation
wid = weight tensor that generates an activation aᵢ
wj = a weight in wid
"""

# Normalize input data to have zero mean and unit standard deviation
X .= (X .- sum(X))./std(X)
train_loader = DataLoader((X, L), batchsize=batch_size, shuffle=true)

#println("model = $(typeof(model))")

std_prep = []
σ_d = Float64[]
D = 1
for layer in model
    D += 1
    Wd = Flux.params(layer)
    # Initialize the weights of the network with Gaussian distribution
    for id in Wd
        wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), (3,3,4,4)))
        id = wj
        append!(std_prep, vec(wj))
    end
    # Compute std of all elements in the weight tensor Wd
    push!(σ_d, std(std_prep))
end

W = Flux.params(model)

# Weight update
for _ in 1:C
    d = D
    while d > 0
        for id in 1:length(W[d])
            # Randomly sample change in weights from Gaussian distribution
            for j in 1:length(w[d][id])
                # Randomly sample mini-batch
                (x, l) = train_loader[rand(1:length(train_loader))]
                
                # Sample a weight from normal distribution
                ΔWj[d][id][j] = rand(Normal(0, σ_d[d]), 1)

                loss, acc = loss_and_accuracy(data_loader, model, device)
                W = argmin(F(x,l, W+ΔWj), F(x,l,W), F(x,l, W-ΔWj))
            end
        end
        d -= 1
    end
end

return W
end

这里的问题是RSO函数的第二块。我试图在F(w, l, W+gW), F(w, l, W), F(w, l, W-gW)三种情况下评估单个权重变化的损失,并选择损失最小的权重集。但是我如何使用 Flux.jl 来做到这一点?我尝试使用的损失函数是 logitcrossentropy(ŷ, y, agg=sum)。为了生成 y_hat,我们应该使用模型(W),但是在 Zygote.Params() 形式中改变单个权重参数已经具有挑战性....

根据您分享的论文,您似乎需要更改每一层的每个输出神经元的权重数组。不幸的是,这意味着优化例程的实现将取决于层类型,因为卷积层的“输出神经元”与全连接层完全不同。换句话说,仅仅遍历 Flux.params(model) 是不够的,因为这只是模型中所有权重数组的集合,并且每个权重数组根据它来自哪一层而被不同地对待。

幸运的是,如果您使用单独的函数而不是一个巨大的循环,Julia 的多重分派确实使这更容易编写。我将使用下面的伪代码总结该算法:

for layer in model
  for output_neuron in layer
    for weight_element in parameters(output_neuron)
      weight_element = sample(N(0, sqrt(2 / num_outputs(layer))))
    end
  end
  sigmas[layer] = stddev(parameters(layer))
end

for c in 1 to C
  for layer in reverse(model)
    for output_neuron in layer
      for weight_element in parameters(output_neuron)
        x, y = sample(batches)
        dw = N(0, sigmas[layer])
        # optimize weights
      end
    end
  end
end

我们需要将 for output_neuron ... 部分隔离成单独的函数。

在第一个块中,我们实际上并没有对每个 weight_element 做任何不同的事情,它们都是从相同的正态分布中采样的。所以,我们实际上不需要迭代输出神经元,但我们确实需要知道有多少。

using Statistics: std

# this function will set the weights according to the
# normal distribution and the number of output neurons
# it also returns the standard deviation of the weights
function sample_weight!(layer::Dense)
  sample = randn(eltype(layer.weight), size(layer.weight))
  num_outputs = size(layer.weight, 1)
  # notice the "." notation which is used to mutate the array
  layer.weight .= sample .* num_outputs

  return std(layer.weight)
end

function sample_weight!(layer::Conv)
  sample = randn(eltype(layer.weight), size(layer.weight))
  num_outputs = size(layer.weight, 4)
  # notice the "." notation which is used to mutate the array
  layer.weight .= sample .* num_outputs

  return std(layer.weight)
end

sigmas = map(sample_weights!, model)

现在,对于第二个块,我们将通过为每一层定义不同的函数来实现类似的技巧。

function optimize_layer!(loss, layer::Dense, data, sigma)
  for i in 1:size(layer.weight, 1)
    for j in 1:size(layer.weight, 2)
      wj = layer.weight[i, j]
      x, y = data[rand(1:length(data))]
      dw = randn() * sigma
      ws = [wj + dw, wj, wj - dw]
      losses = Float32[]
      for (k, w) in enumerate(ws)
        layer.weight[i, j] = w
        losses[k] = loss(x, y)
      end
      layer.weight[i, j] = ws[argmin(losses)]
    end
  end
end

function optimize_layer!(loss, layer::Conv, data, sigma)
  for i in 1:size(layer.weight, 4)
    # we use a view to reference the full kernel
    # for this output channel
    wid = view(layer.weight, :, :, :, i)
    
    # each index let's us treat wid like a vector
    for j in eachindex(wid)
      wj = wid[j]
      x, y = data[rand(1:length(data))]
      dw = randn() * sigma
      ws = [wj + dw, wj, wj - dw]
      losses = Float32[]
      for (k, w) in enumerate(ws)
        wid[j] = w
        losses[k] = loss(x, y)
      end
      wid[j] = ws[argmin(losses)]
    end
  end
end

for c in 1:C
  for (layer, sigma) in reverse(zip(model, sigmas))
    optimize_layer!(layer, data, sigma) do x, y
      logitcrossentropy(model(x), y; agg = sum)
    end
  end
end

请注意,我在任何地方都没有使用 Flux.params,这对我们没有帮助。此外,Flux.params 将同时包含权重和偏差,而且这篇论文看起来根本不在乎偏差。如果您有一种优化方法,可以普遍优化任何参数,而不管层类型是否相同(即像梯度下降),那么您可以使用 for p in Flux.params(model) ....

谢谢@darsnack :)

我发现你的答案有点晚了,所以与此同时我可以弄清楚我自己的脚本是否有效。我的代码确实有点硬编码,但您能否对此提供反馈?

function RSO(train_loader, test_loader, C,model, batch_size, device, args)
"""
model = convolutional model structure
C = Number of rounds to update parameters (epochs)
batch_size = size of the mini batch that will be used to calculate loss
device = CPU or GPU
"""

# Evaluate initial weight
test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
println("Initial Weight:")
println("   test_loss = $test_loss, test_accuracy = $test_acc")

random_batch = []
for (x, l) in train_loader
    push!(random_batch, (x,l))
end

# Initialize weights
std_prep = []
σ_d = Float64[]
D = 0
for layer in model
    D += 1
    Wd = Flux.params(layer)
    # Initialize the weights of the network with Gaussian distribution
    for id in Wd
        if typeof(id) == Array{Float32, 4}
            wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), size(id)))
        elseif typeof(id) == Vector{Float32}
            wj = convert(Vector{Float32}, rand(Normal(0, sqrt(2/length(id))), length(id)))
        elseif typeof(id) == Matrix{Float32}
            wj = convert(Matrix{Float32}, rand(Normal(0, sqrt(2/length(id))), size(id)))
        end
        id = wj
        append!(std_prep, vec(wj))
    end
    # Compute std of all elements in the weight tensor Wd
    push!(σ_d, std(std_prep))
end

# Weight update
for c in 1:C
    d = D
    # First update the weights of the layer closest to the labels 
    # and then sequentially move closer to the input
    while d > 0
        Wd = Flux.params(model[d])
        for id in Wd
            # Randomly sample change in weights from Gaussian distribution
            for j in 1:length(id)
                # Randomly sample mini-batch
                (x, y) = rand(random_batch, 1)[1]
                x, y = device(x), device(y)
                
                # Sample a weight from normal distribution
                ΔWj = rand(Normal(0, σ_d[d]), 1)[1]

                # Weight update with three scenario
                ## F(x,l, W+ΔWj)
                id[j] = id[j]+ΔWj
                ŷ = model(x)
                ls_pos = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]

                ## F(x,l,W)
                id[j] = id[j]-ΔWj
                ŷ = model(x)
                ls_org = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]

                ## F(x,l, W-ΔWj)
                id[j] = id[j]-ΔWj
                ŷ = model(x)
                ls_neg = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]

                # Check weight update that gives minimum loss
                min_loss = argmin([ls_org, ls_pos, ls_neg])

                # Save weight update with minimum loss
                if min_loss == 1
                    id[j] = id[j] + ΔWj
                elseif min_loss == 2
                    id[j] = id[j] + 2*ΔWj
                elseif min_loss == 3
                    id[j] = id[j]
                end
    
            end
        end
        d -= 1
    end

    train_loss, train_acc = loss_and_accuracy(train_loader, model, device)
    test_loss, test_acc = loss_and_accuracy(test_loader, model, device)

    track!(args.tracker, test_acc)

    println("RSO Round=$c")
    println("   train_loss = $train_loss, train_accuracy = $train_acc")
    println("   test_loss = $test_loss, test_accuracy = $test_acc")

end

return Flux.params(model)
end