使用 H2O R 包中的 h2o.anomaly 函数计算重建 MSE(Reconstruction MSE)
Reconstruction MSE calculation using h2o.anomaly function from H2O R package
我正在尝试使用自动编码器进行异常检测。我使用 H2O R 包中的 h2o.anomaly
函数为样本数据生成重建 MSE。但是,我也尝试根据下面文档链接中的 MSE 公式自己手动计算它:
http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#mse-mean-squared-error
我用来建立模型的训练数据包含 3 个特征、5 行,如下所示:
head(train_dat)
Feature1 Feature2 Feature3
1 68.18 0.1806535 3.871201
2 71.51 0.3987761 2.484907
3 67.77 0.4285304 3.332205
4 69.58 0.1823216 2.890372
5 70.98 0.4134333 1.791759
我用于预测的测试数据同样包含 3 个特征、5 行,如下所示:
head(test_dat)
Feature1 Feature2 Feature3
1 68.33000 0.4350239 2.708050
2 73.98000 0.5550339 3.044522
3 67.11000 0.7323679 2.639057
4 69.90395 0.9999787 4.499810
5 71.28867 0.4882539 3.091042
经过运行训练和预测,重构后的特征如下:
head(mod.out)
reconstr_Feature1 reconstr_Feature2 reconstr_Feature3
1 69.66297 0.4239244 2.346250
2 69.88329 0.3963843 2.381598
3 69.46544 0.4610502 2.233164
4 68.96117 0.4229165 2.676295
5 69.63208 0.3895452 2.530025
当我使用 h2o.anomaly 函数计算 MSE 时,得到如下 MSE 输出:
head(mse.list)
Reconstruction.MSE
1 0.05310159
2 0.57037600
3 0.54427385
4 2.08407248
5 0.14251951
然而,当我试图通过应用下面的函数来计算 MSE 时,我得到了不同的 MSE 输出:
# Per-row mean of the squared differences between the test data and its
# reconstruction, computed on the original (unscaled) feature values.
mod.anon.validate <- rowMeans((test_dat - mod.out)^2)
mse.list.validate <- as.data.frame(mod.anon.validate)
head(mse.list.validate)
mod.anon.validate
1 0.6359438
2 5.7492281
3 1.9288268
4 1.5156829
5 1.0229217
我想知道我的手动 MSE 计算哪里出了问题?所谓的"Reconstruction MSE"与一般的 MSE 是否有区别?完整的 R 脚本如下:
### H2O Autoencoder test run ###

# Load test and training data.
test_dat <- read.table("sample.test.dat", header = TRUE)
train_dat <- read.table("sample.train.dat", header = TRUE)

# Start H2O
library(h2o)
localH2O <- h2o.init(port = 54321)

# Training and deep learning: the first three columns are used as features.
feature_names <- names(train_dat[1:3])
unmod.hex <- as.h2o(train_dat, destination_frame = "train.hex")
mod.hex <- as.h2o(test_dat, destination_frame = "test.hex")
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Output result: reconstructed features for the test rows.
# The former `type=response` argument referenced an undefined symbol and
# was only passed through `...`, so it has been dropped.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex))

# Per-row reconstruction MSE as reported by H2O.
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)

# Manual per-row MSE on the original (unscaled) feature values.
mod.anon.validate <- rowMeans((test_dat - mod.out)^2)
mse.list.validate <- as.data.frame(mod.anon.validate)
感谢您的帮助。
计算结果不匹配,是因为 h2o.anomaly 的 MSE 是在标准化之后的空间中计算的。如果您在 h2o.deeplearning() 中设置 standardize = FALSE 参数,两者就会一致:
# Retrain with standardize = FALSE so that the reconstruction error is
# computed on the raw feature values and can be compared directly with the
# manual calculation.
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  standardize = FALSE,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Reconstructed features for the test rows. The former `type=response`
# argument referenced an undefined symbol and was only passed through `...`,
# so it has been dropped.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex))

# Per-row reconstruction MSE as reported by H2O.
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)
mse.list
> mse.list
Reconstruction.MSE
1 1512.740
2 1777.491
3 1458.438
4 1587.593
5 1648.999
> mod.anon.validate <- apply((test_dat - mod.out)^2, 1, mean)
> mse.list.validate <- as.data.frame(mod.anon.validate)
> mse.list.validate
mod.anon.validate
1 1512.740
2 1777.491
3 1458.438
4 1587.593
5 1648.999
下面是一个示例,演示如何在保留默认标准化的情况下,手动对数据进行缩放并得到与 h2o.anomaly 可比较的 MSE:
# Load test and training data.
test_dat <- sample.test
train_dat <- sample.train

# Start H2O
library(h2o)
localH2O <- h2o.init(port = 54321, strict_version_check = FALSE)

# Training and deep learning: the first three columns are used as features.
feature_names <- names(train_dat[1:3])
unmod.hex <- as.h2o(train_dat, destination_frame = "train.hex")
mod.hex <- as.h2o(test_dat, destination_frame = "test.hex")
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Anomaly detection: per-row reconstruction MSE as reported by H2O.
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)

# Manual MSE: reconstructed features for the test rows. The former
# `type=response` argument referenced an undefined symbol and was only
# passed through `...`, so it has been dropped.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex))

# Scale output: centre every feature on its training mean (m) and divide by
# its training range (s = max - min). The same m and s are applied to the
# original test values and to their reconstruction, column by column.
train_mat <- as.matrix(train_dat[feature_names])
m <- colMeans(train_mat)
s <- apply(train_mat, 2, function(col) diff(range(col)))
original_scaled <- scale(
  as.matrix(test_dat[feature_names]),
  center = m, scale = s
)
recreate_scaled <- scale(as.matrix(mod.out), center = m, scale = s)

# Per-row mean squared error in the scaled space.
mod.anon.validate <- rowMeans((original_scaled - recreate_scaled)^2)
mse.list.validate <- as.data.frame(mod.anon.validate)

# Compare outputs
print(mse.list)
print(mse.list.validate)
我正在尝试使用自动编码器进行异常检测。我使用 H2O R 包中的 h2o.anomaly
函数为样本数据生成重建 MSE。但是,我也尝试根据下面文档链接中的 MSE 公式自己手动计算它:
http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#mse-mean-squared-error
我用来建立模型的训练数据包含 3 个特征、5 行,如下所示:
head(train_dat)
Feature1 Feature2 Feature3
1 68.18 0.1806535 3.871201
2 71.51 0.3987761 2.484907
3 67.77 0.4285304 3.332205
4 69.58 0.1823216 2.890372
5 70.98 0.4134333 1.791759
我用于预测的测试数据同样包含 3 个特征、5 行,如下所示:
head(test_dat)
Feature1 Feature2 Feature3
1 68.33000 0.4350239 2.708050
2 73.98000 0.5550339 3.044522
3 67.11000 0.7323679 2.639057
4 69.90395 0.9999787 4.499810
5 71.28867 0.4882539 3.091042
经过运行训练和预测,重构后的特征如下:
head(mod.out)
reconstr_Feature1 reconstr_Feature2 reconstr_Feature3
1 69.66297 0.4239244 2.346250
2 69.88329 0.3963843 2.381598
3 69.46544 0.4610502 2.233164
4 68.96117 0.4229165 2.676295
5 69.63208 0.3895452 2.530025
当我使用 h2o.anomaly 函数计算 MSE 时,得到如下 MSE 输出:
head(mse.list)
Reconstruction.MSE
1 0.05310159
2 0.57037600
3 0.54427385
4 2.08407248
5 0.14251951
然而,当我试图通过应用下面的函数来计算 MSE 时,我得到了不同的 MSE 输出:
# Per-row mean of the squared differences between the test data and its
# reconstruction, computed on the original (unscaled) feature values.
mod.anon.validate <- rowMeans((test_dat - mod.out)^2)
mse.list.validate <- as.data.frame(mod.anon.validate)
head(mse.list.validate)
mod.anon.validate
1 0.6359438
2 5.7492281
3 1.9288268
4 1.5156829
5 1.0229217
我想知道我的手动 MSE 计算哪里出了问题?所谓的"Reconstruction MSE"与一般的 MSE 是否有区别?完整的 R 脚本如下:
### H2O Autoencoder test run ###

# Load test and training data.
test_dat <- read.table("sample.test.dat", header = TRUE)
train_dat <- read.table("sample.train.dat", header = TRUE)

# Start H2O
library(h2o)
localH2O <- h2o.init(port = 54321)

# Training and deep learning: the first three columns are used as features.
feature_names <- names(train_dat[1:3])
unmod.hex <- as.h2o(train_dat, destination_frame = "train.hex")
mod.hex <- as.h2o(test_dat, destination_frame = "test.hex")
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Output result: reconstructed features for the test rows.
# The former `type=response` argument referenced an undefined symbol and
# was only passed through `...`, so it has been dropped.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex))

# Per-row reconstruction MSE as reported by H2O.
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)

# Manual per-row MSE on the original (unscaled) feature values.
mod.anon.validate <- rowMeans((test_dat - mod.out)^2)
mse.list.validate <- as.data.frame(mod.anon.validate)
感谢您的帮助。
计算结果不匹配,是因为 h2o.anomaly 的 MSE 是在标准化之后的空间中计算的。如果您在 h2o.deeplearning() 中设置 standardize = FALSE 参数,两者就会一致:
# Retrain with standardize = FALSE so that the reconstruction error is
# computed on the raw feature values and can be compared directly with the
# manual calculation.
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  standardize = FALSE,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Reconstructed features for the test rows. The former `type=response`
# argument referenced an undefined symbol and was only passed through `...`,
# so it has been dropped.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex))

# Per-row reconstruction MSE as reported by H2O.
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)
mse.list
> mse.list
Reconstruction.MSE
1 1512.740
2 1777.491
3 1458.438
4 1587.593
5 1648.999
> mod.anon.validate <- apply((test_dat - mod.out)^2, 1, mean)
> mse.list.validate <- as.data.frame(mod.anon.validate)
> mse.list.validate
mod.anon.validate
1 1512.740
2 1777.491
3 1458.438
4 1587.593
5 1648.999
下面是一个示例,演示如何在保留默认标准化的情况下,手动对数据进行缩放并得到与 h2o.anomaly 可比较的 MSE:
# Load test and training data.
test_dat <- sample.test
train_dat <- sample.train

# Start H2O
library(h2o)
localH2O <- h2o.init(port = 54321, strict_version_check = FALSE)

# Training and deep learning: the first three columns are used as features.
feature_names <- names(train_dat[1:3])
unmod.hex <- as.h2o(train_dat, destination_frame = "train.hex")
mod.hex <- as.h2o(test_dat, destination_frame = "test.hex")
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Anomaly detection: per-row reconstruction MSE as reported by H2O.
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)

# Manual MSE: reconstructed features for the test rows. The former
# `type=response` argument referenced an undefined symbol and was only
# passed through `...`, so it has been dropped.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex))

# Scale output: centre every feature on its training mean (m) and divide by
# its training range (s = max - min). The same m and s are applied to the
# original test values and to their reconstruction, column by column.
train_mat <- as.matrix(train_dat[feature_names])
m <- colMeans(train_mat)
s <- apply(train_mat, 2, function(col) diff(range(col)))
original_scaled <- scale(
  as.matrix(test_dat[feature_names]),
  center = m, scale = s
)
recreate_scaled <- scale(as.matrix(mod.out), center = m, scale = s)

# Per-row mean squared error in the scaled space.
mod.anon.validate <- rowMeans((original_scaled - recreate_scaled)^2)
mse.list.validate <- as.data.frame(mod.anon.validate)

# Compare outputs
print(mse.list)
print(mse.list.validate)