在 R 的 keras 中忽略缺失目标值的损失函数
Loss function that ignores missing target values in keras for R
我正在使用 R 的 keras 包将 LSTM 模型拟合到多元时间序列(关于 Python 中的 keras 或 PyTorch 的回答也会有所帮助,因为我可以切换),模型有多个输出(3 个连续的,1 个分类的)。部分目标值在某些时间步缺失(编码为 -1,因为所有观测值都 $\geq 0$,但我显然可以将其改为任何其他值)。我认为合理的做法是:如果目标变量缺失 (=-1),则模型的任何预测都被视为正确(=不产生损失)。我对预测某个值是否缺失不感兴趣,因此即使模型能够可靠地预测缺失,强制模型输出 -1 对我也没有意义。我更希望模型预测缺失值本来应该是多少(即使我无法检验其是否正确)。
如何创建“忽略”-1 值/认为它们正确的自定义损失函数?
如果上下文有帮助,下图展示了我的模型结构,下面的 R 代码会生成一些示例数据并在没有缺失数据的情况下拟合模型。去掉下面代码中 # %>% mutate_at(vars(x1:x4, y1:y4), randomly_set_to_minus_one) 这一行的注释后,部分输入和输出会被编码为 -1。对于如何将这些缺失编码为特征,我没有强烈的偏好;我也可以把这些值设为输入的中位数并添加一个表示缺失的标志,或采用其他做法。(对我而言)真正重要的是损失函数能够正确处理取值为 -1 的目标。在帖子末尾附上了我编写这种损失函数的失败尝试。
library(tidyverse)
library(keras)
# Randomly overwrite elements of `x` with the missing-value code -1.
# One standard-normal draw is made per element; an element is replaced by -1
# when its draw exceeds 1 and is returned unchanged otherwise.
randomly_set_to_minus_one <- function(x) {
  draws <- rnorm(length(x))
  ifelse(draws > 1, -1, x)
}
# Example: randomly_set_to_minus_one(rnorm(100))
set.seed(1234)
subjects <- 250
records_per_subject <- 25

# Simulate a multivariate time series: `subjects` subjects with
# `records_per_subject` records each. rand1/rand2 are constant within a
# subject, rand3/rand4 vary per record. x1..x5 are the predictors, y1..y3 the
# continuous targets and y4 a categorical target with levels 1..3.
example <- tibble(
  subject = rep(seq_len(subjects), each = records_per_subject),
  rand1 = rep(rnorm(subjects), each = records_per_subject),
  rand2 = rep(rnorm(subjects), each = records_per_subject),
  rand3 = rnorm(subjects * records_per_subject),
  rand4 = rnorm(subjects * records_per_subject)
) %>%
  mutate(
    x1 = 0.8 * rand1 + 0.2 * rand2 + 0.8 * rand3 + 0.2 * rand4 +
      rnorm(n = n(), sd = 0.1),
    x2 = 0.1 * rand1 + 0.9 * rand2 + 2 * rand3 + rnorm(n = n(), sd = 0.1),
    x3 = 0.5 * rand1 + 0.5 * rand2 + 0.2 * rand4 + rnorm(n = n(), sd = 0.25),
    x4 = 0.2 * rand1 + 0.2 * rand2 + 0.5 * rand3 + 0.5 * rand4 +
      rnorm(n = n(), sd = 0.1),
    # Record index within each subject (1..records_per_subject).
    x5 = rep(seq_len(records_per_subject), times = subjects),
    y1 = 1 + tanh(rand1 + rand2 + 0.05 * rand3 + 0.05 * rand4 +
      2 * x5 / records_per_subject + rnorm(n = n(), sd = 0.05)),
    y2 = 10 * plogis(0.2 * rand1 + 0.2 * rand2 + 0.2 * rand3 + 0.2 * rand4),
    y3 = 3 * plogis(0.8 * rand1 + 0.8 * rand4 +
      2 * (x5 - records_per_subject / 2) / records_per_subject),
    # Unnormalised category weights, then normalised to probabilities.
    prob1 = exp(rand1 / 4 * 3 + rand3 / 4),
    prob2 = exp(rand2 / 4 * 3 + rand4 / 4),
    prob3 = exp(-rand1 - rand2 - rand3 - rand4),
    total = prob1 + prob2 + prob3,
    prob1 = prob1 / total,
    prob2 = prob2 / total,
    prob3 = prob3 / total,
    # Draw one category per record from that record's own probabilities.
    y4 = pmap_int(
      list(prob1, prob2, prob3),
      function(p1, p2, p3) sample(1:3, 1, replace = TRUE, prob = c(p1, p2, p3))
    )
  ) %>%
  # Shift each predictor so its minimum is 0. Subtracting the minimum keeps all
  # observed values >= 0, so -1 stays unambiguous as the missing-value code
  # (adding the negative minimum would push values further below zero).
  mutate(
    x1 = x1 - min(x1),
    x2 = x2 - min(x2),
    x3 = x3 - min(x3),
    x4 = x4 - min(x4)
  ) %>%
  dplyr::select(subject, x1:x5, y1:y4)
# %>% mutate_at(vars(x1:x4, y1:y4), randomly_set_to_minus_one)
# Reshape the long table into the 3-D arrays keras expects as inputs/outputs.
# Each subject contributes one (record x variable) matrix; the matrices are
# stacked along a third dimension and then permuted to
# (subject, record, variable).

# Predictors x1..x5
x_array <- aperm(
  abind::abind(
    map(sort(unique(example$subject)), function(subject_id) {
      per_subject <- filter(example, subject == subject_id)
      as.matrix(dplyr::select(per_subject, x1:x5))
    }),
    along = 3
  ),
  perm = c(3, 1, 2)
)

# Continuous targets y1..y3
y13_array <- aperm(
  abind::abind(
    map(sort(unique(example$subject)), function(subject_id) {
      per_subject <- filter(example, subject == subject_id)
      as.matrix(dplyr::select(per_subject, y1:y3))
    }),
    along = 3
  ),
  perm = c(3, 1, 2)
)

# Categorical target y4, one-hot encoded into the three columns y41..y43.
# A record whose y4 equals -1 gets -1 in all three indicator columns.
y4_array <- aperm(
  abind::abind(
    map(sort(unique(example$subject)), function(subject_id) {
      per_subject <- filter(example, subject == subject_id)
      one_hot <- mutate(
        per_subject,
        y41 = case_when(y4 == 1 ~ 1, y4 == -1 ~ -1, TRUE ~ 0),
        y42 = case_when(y4 == 2 ~ 1, y4 == -1 ~ -1, TRUE ~ 0),
        y43 = case_when(y4 == 3 ~ 1, y4 == -1 ~ -1, TRUE ~ 0)
      )
      as.matrix(dplyr::select(one_hot, y41:y43))
    }),
    along = 3
  ),
  perm = c(3, 1, 2)
)
# Define the LSTM network. The input shape is taken from the second and third
# dimensions of x_array.
nn_inputs <- layer_input(shape = dim(x_array)[2:3])

# Two stacked LSTM layers that both return the full sequence, so that each
# record gets its own prediction. `dropout` is applied to the layer inputs,
# `recurrent_dropout` to the recurrent state.
nn_lstm_layers <- nn_inputs %>%
  layer_lstm(
    units = 32,
    return_sequences = TRUE,
    dropout = 0.3,
    recurrent_dropout = 0.3
  ) %>%
  layer_lstm(
    units = 16,
    return_sequences = TRUE,
    dropout = 0.3,
    recurrent_dropout = 0.3
  )

# Continuous output head (one unit per continuous target, linear activation).
cont_target <- nn_lstm_layers %>%
  layer_dense(units = dim(y13_array)[3], name = "cont_target")

# Categorical output head (one unit per one-hot category). The categories are
# mutually exclusive and the head is trained with categorical cross-entropy,
# so the outputs must form a probability distribution: use softmax rather than
# independent sigmoids.
cat_target <- nn_lstm_layers %>%
  layer_dense(
    units = dim(y4_array)[3],
    activation = "softmax",
    name = "cat_target"
  )

model <- keras_model(nn_inputs, list(cont_target, cat_target))
summary(model)
# Randomly assign about 20% of the subjects (first array dimension) to the
# validation set; TRUE marks a validation subject.
val_samples <- sample(
  x = c(
    rep(FALSE, floor(dim(x_array)[1] * 0.8)),
    rep(TRUE, ceiling(dim(x_array)[1] * 0.2))
  ),
  size = dim(x_array)[1],
  replace = FALSE
)

# One loss per named output head, equally weighted.
model %>% compile(
  optimizer = "rmsprop",
  loss = list(
    cont_target = "mse",
    cat_target = "categorical_crossentropy"
  ),
  loss_weights = list(cont_target = 1.0, cat_target = 1.0)
)

# drop = FALSE keeps every array three-dimensional even if a split happens to
# contain a single subject.
history <- model %>%
  fit(
    x_array[!val_samples, , , drop = FALSE],
    list(
      cont_target = y13_array[!val_samples, , , drop = FALSE],
      cat_target = y4_array[!val_samples, , , drop = FALSE]
    ),
    epochs = 100,
    batch_size = 32,
    validation_data = list(
      x_array[val_samples, , , drop = FALSE],
      list(
        cont_target = y13_array[val_samples, , , drop = FALSE],
        cat_target = y4_array[val_samples, , , drop = FALSE]
      )
    ),
    callbacks = list(
      # Halve the learning rate after 10 epochs without val_loss improvement.
      callback_reduce_lr_on_plateau(
        monitor = "val_loss", factor = 0.5, patience = 10, verbose = 0,
        mode = "min", min_delta = 1e-04, cooldown = 0, min_lr = 0
      ),
      # Stop after 20 epochs without improvement and restore the best weights.
      callback_early_stopping(
        monitor = "val_loss",
        min_delta = 0,
        patience = 20,
        restore_best_weights = TRUE,
        verbose = 0,
        mode = "auto"
      )
    )
  )

plot(history) + scale_y_log10()
这是我编写忽略 -1 值的修改后的 MSE 损失函数的尝试:
# Custom loss function to deal with missing target values (coded as -1).
#
# Mean squared error over the last axis in which every element whose target
# equals -1 contributes a squared error of exactly 0.
#
# y_true: tensor of targets, with -1 marking a missing value.
# y_pred: tensor of predictions with the same shape as y_true.
# Returns a tensor with the last axis averaged out.
mse_na_loss <- function(y_true, y_pred) {
  K <- backend()
  # Align dtypes so the comparison and subtraction operate on matching tensors.
  y_true <- K$cast(y_true, K$dtype(y_pred))
  squared_error <- K$square(y_true - y_pred)
  is_missing <- K$equal(y_true, -1)
  # zeros_like derives shape and dtype from the error tensor itself, so no
  # static shape or hard-coded dtype is needed for the "no loss" branch.
  K$mean(
    K$switch(is_missing, K$zeros_like(squared_error), squared_error),
    axis = -1L
  )
}
What I think would make sense is that any prediction by the model is considered right (=no loss incurred), if the target variable is missing (=-1).
您可以先检查 y_true 是否不等于 -1 (k_not_equal),再把得到的布尔值转换为数值 (k_cast),从而实现这一点(=no loss incurred)。这样会得到形如 (1,0,1,1,0) 的掩码,可以将它与平方误差逐元素相乘。
# Element-wise squared error that is zeroed wherever the target is missing.
#
# y_true: tensor of targets, with -1 marking a missing value.
# y_pred: tensor of predictions with the same shape as y_true.
# Returns the masked squared errors, element by element (no reduction is
# applied here).
mse_na_loss <- function(y_true, y_pred) {
  # 1 where the target is observed, 0 where it carries the missing code -1.
  # The mask takes its dtype from y_pred instead of a hard-coded "float32" so
  # the product also works under other float types.
  observed <- k_cast(k_not_equal(y_true, -1), k_dtype(y_pred))
  k_pow(y_true - y_pred, 2) * observed
}
这基本上就是您在问题末尾试图创建的损失函数,也回答了上面引用的那部分问题。
但是,我认为这不是一个好方法。该损失函数并不会像您所说的那样“忽略”这些观测,模型只是学到在这些位置输出任何值都可以。这可能会给训练带来不必要的噪声。
根据具体领域,其他缺失值处理方法,例如“末次观测值结转”(last observation carried forward,na.locf),可能比编码为 -1 更合适。
我正在使用 R 的 keras 包将 LSTM 模型拟合到多元时间序列(关于 Python 中的 keras 或 PyTorch 的回答也会有所帮助,因为我可以切换),模型有多个输出(3 个连续的,1 个分类的)。部分目标值在某些时间步缺失(编码为 -1,因为所有观测值都 $\geq 0$,但我显然可以将其改为任何其他值)。我认为合理的做法是:如果目标变量缺失 (=-1),则模型的任何预测都被视为正确(=不产生损失)。我对预测某个值是否缺失不感兴趣,因此即使模型能够可靠地预测缺失,强制模型输出 -1 对我也没有意义。我更希望模型预测缺失值本来应该是多少(即使我无法检验其是否正确)。
如何创建“忽略”-1 值/认为它们正确的自定义损失函数?
如果上下文有帮助,下图展示了我的模型结构,下面的 R 代码会生成一些示例数据并在没有缺失数据的情况下拟合模型。去掉下面代码中 # %>% mutate_at(vars(x1:x4, y1:y4), randomly_set_to_minus_one) 这一行的注释后,部分输入和输出会被编码为 -1。对于如何将这些缺失编码为特征,我没有强烈的偏好;我也可以把这些值设为输入的中位数并添加一个表示缺失的标志,或采用其他做法。(对我而言)真正重要的是损失函数能够正确处理取值为 -1 的目标。在帖子末尾附上了我编写这种损失函数的失败尝试。
library(tidyverse)
library(keras)
# Randomly overwrite elements of `x` with the missing-value code -1.
# One standard-normal draw is made per element; an element is replaced by -1
# when its draw exceeds 1 and is returned unchanged otherwise.
randomly_set_to_minus_one <- function(x) {
  draws <- rnorm(length(x))
  ifelse(draws > 1, -1, x)
}
# Example: randomly_set_to_minus_one(rnorm(100))
set.seed(1234)
subjects <- 250
records_per_subject <- 25

# Simulate a multivariate time series: `subjects` subjects with
# `records_per_subject` records each. rand1/rand2 are constant within a
# subject, rand3/rand4 vary per record. x1..x5 are the predictors, y1..y3 the
# continuous targets and y4 a categorical target with levels 1..3.
example <- tibble(
  subject = rep(seq_len(subjects), each = records_per_subject),
  rand1 = rep(rnorm(subjects), each = records_per_subject),
  rand2 = rep(rnorm(subjects), each = records_per_subject),
  rand3 = rnorm(subjects * records_per_subject),
  rand4 = rnorm(subjects * records_per_subject)
) %>%
  mutate(
    x1 = 0.8 * rand1 + 0.2 * rand2 + 0.8 * rand3 + 0.2 * rand4 +
      rnorm(n = n(), sd = 0.1),
    x2 = 0.1 * rand1 + 0.9 * rand2 + 2 * rand3 + rnorm(n = n(), sd = 0.1),
    x3 = 0.5 * rand1 + 0.5 * rand2 + 0.2 * rand4 + rnorm(n = n(), sd = 0.25),
    x4 = 0.2 * rand1 + 0.2 * rand2 + 0.5 * rand3 + 0.5 * rand4 +
      rnorm(n = n(), sd = 0.1),
    # Record index within each subject (1..records_per_subject).
    x5 = rep(seq_len(records_per_subject), times = subjects),
    y1 = 1 + tanh(rand1 + rand2 + 0.05 * rand3 + 0.05 * rand4 +
      2 * x5 / records_per_subject + rnorm(n = n(), sd = 0.05)),
    y2 = 10 * plogis(0.2 * rand1 + 0.2 * rand2 + 0.2 * rand3 + 0.2 * rand4),
    y3 = 3 * plogis(0.8 * rand1 + 0.8 * rand4 +
      2 * (x5 - records_per_subject / 2) / records_per_subject),
    # Unnormalised category weights, then normalised to probabilities.
    prob1 = exp(rand1 / 4 * 3 + rand3 / 4),
    prob2 = exp(rand2 / 4 * 3 + rand4 / 4),
    prob3 = exp(-rand1 - rand2 - rand3 - rand4),
    total = prob1 + prob2 + prob3,
    prob1 = prob1 / total,
    prob2 = prob2 / total,
    prob3 = prob3 / total,
    # Draw one category per record from that record's own probabilities.
    y4 = pmap_int(
      list(prob1, prob2, prob3),
      function(p1, p2, p3) sample(1:3, 1, replace = TRUE, prob = c(p1, p2, p3))
    )
  ) %>%
  # Shift each predictor so its minimum is 0. Subtracting the minimum keeps all
  # observed values >= 0, so -1 stays unambiguous as the missing-value code
  # (adding the negative minimum would push values further below zero).
  mutate(
    x1 = x1 - min(x1),
    x2 = x2 - min(x2),
    x3 = x3 - min(x3),
    x4 = x4 - min(x4)
  ) %>%
  dplyr::select(subject, x1:x5, y1:y4)
# %>% mutate_at(vars(x1:x4, y1:y4), randomly_set_to_minus_one)
# Reshape the long table into the 3-D arrays keras expects as inputs/outputs.
# Each subject contributes one (record x variable) matrix; the matrices are
# stacked along a third dimension and then permuted to
# (subject, record, variable).

# Predictors x1..x5
x_array <- aperm(
  abind::abind(
    map(sort(unique(example$subject)), function(subject_id) {
      per_subject <- filter(example, subject == subject_id)
      as.matrix(dplyr::select(per_subject, x1:x5))
    }),
    along = 3
  ),
  perm = c(3, 1, 2)
)

# Continuous targets y1..y3
y13_array <- aperm(
  abind::abind(
    map(sort(unique(example$subject)), function(subject_id) {
      per_subject <- filter(example, subject == subject_id)
      as.matrix(dplyr::select(per_subject, y1:y3))
    }),
    along = 3
  ),
  perm = c(3, 1, 2)
)

# Categorical target y4, one-hot encoded into the three columns y41..y43.
# A record whose y4 equals -1 gets -1 in all three indicator columns.
y4_array <- aperm(
  abind::abind(
    map(sort(unique(example$subject)), function(subject_id) {
      per_subject <- filter(example, subject == subject_id)
      one_hot <- mutate(
        per_subject,
        y41 = case_when(y4 == 1 ~ 1, y4 == -1 ~ -1, TRUE ~ 0),
        y42 = case_when(y4 == 2 ~ 1, y4 == -1 ~ -1, TRUE ~ 0),
        y43 = case_when(y4 == 3 ~ 1, y4 == -1 ~ -1, TRUE ~ 0)
      )
      as.matrix(dplyr::select(one_hot, y41:y43))
    }),
    along = 3
  ),
  perm = c(3, 1, 2)
)
# Define the LSTM network. The input shape is taken from the second and third
# dimensions of x_array.
nn_inputs <- layer_input(shape = dim(x_array)[2:3])

# Two stacked LSTM layers that both return the full sequence, so that each
# record gets its own prediction. `dropout` is applied to the layer inputs,
# `recurrent_dropout` to the recurrent state.
nn_lstm_layers <- nn_inputs %>%
  layer_lstm(
    units = 32,
    return_sequences = TRUE,
    dropout = 0.3,
    recurrent_dropout = 0.3
  ) %>%
  layer_lstm(
    units = 16,
    return_sequences = TRUE,
    dropout = 0.3,
    recurrent_dropout = 0.3
  )

# Continuous output head (one unit per continuous target, linear activation).
cont_target <- nn_lstm_layers %>%
  layer_dense(units = dim(y13_array)[3], name = "cont_target")

# Categorical output head (one unit per one-hot category). The categories are
# mutually exclusive and the head is trained with categorical cross-entropy,
# so the outputs must form a probability distribution: use softmax rather than
# independent sigmoids.
cat_target <- nn_lstm_layers %>%
  layer_dense(
    units = dim(y4_array)[3],
    activation = "softmax",
    name = "cat_target"
  )

model <- keras_model(nn_inputs, list(cont_target, cat_target))
summary(model)
# Randomly assign about 20% of the subjects (first array dimension) to the
# validation set; TRUE marks a validation subject.
val_samples <- sample(
  x = c(
    rep(FALSE, floor(dim(x_array)[1] * 0.8)),
    rep(TRUE, ceiling(dim(x_array)[1] * 0.2))
  ),
  size = dim(x_array)[1],
  replace = FALSE
)

# One loss per named output head, equally weighted.
model %>% compile(
  optimizer = "rmsprop",
  loss = list(
    cont_target = "mse",
    cat_target = "categorical_crossentropy"
  ),
  loss_weights = list(cont_target = 1.0, cat_target = 1.0)
)

# drop = FALSE keeps every array three-dimensional even if a split happens to
# contain a single subject.
history <- model %>%
  fit(
    x_array[!val_samples, , , drop = FALSE],
    list(
      cont_target = y13_array[!val_samples, , , drop = FALSE],
      cat_target = y4_array[!val_samples, , , drop = FALSE]
    ),
    epochs = 100,
    batch_size = 32,
    validation_data = list(
      x_array[val_samples, , , drop = FALSE],
      list(
        cont_target = y13_array[val_samples, , , drop = FALSE],
        cat_target = y4_array[val_samples, , , drop = FALSE]
      )
    ),
    callbacks = list(
      # Halve the learning rate after 10 epochs without val_loss improvement.
      callback_reduce_lr_on_plateau(
        monitor = "val_loss", factor = 0.5, patience = 10, verbose = 0,
        mode = "min", min_delta = 1e-04, cooldown = 0, min_lr = 0
      ),
      # Stop after 20 epochs without improvement and restore the best weights.
      callback_early_stopping(
        monitor = "val_loss",
        min_delta = 0,
        patience = 20,
        restore_best_weights = TRUE,
        verbose = 0,
        mode = "auto"
      )
    )
  )

plot(history) + scale_y_log10()
这是我编写忽略 -1 值的修改后的 MSE 损失函数的尝试:
# Custom loss function to deal with missing target values (coded as -1).
#
# Mean squared error over the last axis in which every element whose target
# equals -1 contributes a squared error of exactly 0.
#
# y_true: tensor of targets, with -1 marking a missing value.
# y_pred: tensor of predictions with the same shape as y_true.
# Returns a tensor with the last axis averaged out.
mse_na_loss <- function(y_true, y_pred) {
  K <- backend()
  # Align dtypes so the comparison and subtraction operate on matching tensors.
  y_true <- K$cast(y_true, K$dtype(y_pred))
  squared_error <- K$square(y_true - y_pred)
  is_missing <- K$equal(y_true, -1)
  # zeros_like derives shape and dtype from the error tensor itself, so no
  # static shape or hard-coded dtype is needed for the "no loss" branch.
  K$mean(
    K$switch(is_missing, K$zeros_like(squared_error), squared_error),
    axis = -1L
  )
}
What I think would make sense is that any prediction by the model is considered right (=no loss incurred), if the target variable is missing (=-1).
您可以先检查 y_true 是否不等于 -1 (k_not_equal),再把得到的布尔值转换为数值 (k_cast),从而实现这一点(=no loss incurred)。这样会得到形如 (1,0,1,1,0) 的掩码,可以将它与平方误差逐元素相乘。
# Element-wise squared error that is zeroed wherever the target is missing.
#
# y_true: tensor of targets, with -1 marking a missing value.
# y_pred: tensor of predictions with the same shape as y_true.
# Returns the masked squared errors, element by element (no reduction is
# applied here).
mse_na_loss <- function(y_true, y_pred) {
  # 1 where the target is observed, 0 where it carries the missing code -1.
  # The mask takes its dtype from y_pred instead of a hard-coded "float32" so
  # the product also works under other float types.
  observed <- k_cast(k_not_equal(y_true, -1), k_dtype(y_pred))
  k_pow(y_true - y_pred, 2) * observed
}
这基本上就是您在问题末尾试图创建的损失函数,也回答了上面引用的那部分问题。
但是,我认为这不是一个好方法。该损失函数并不会像您所说的那样“忽略”这些观测,模型只是学到在这些位置输出任何值都可以。这可能会给训练带来不必要的噪声。
根据具体领域,其他缺失值处理方法,例如“末次观测值结转”(last observation carried forward,na.locf),可能比编码为 -1 更合适。