Error: Model is too large in H2O autoencoder training
I have a table of size 5360*51200, where 5360 is the number of instances and 51200 is the number of features. I need to reduce the dimensionality of these features. I tried doing it with a stacked autoencoder in H2O, but it does not let me train, raising the error:
Model is too large
The code is as follows:
library(h2o)
h2o.init(nthreads = -1)

check.deeplearning_stacked_autoencoder <- function() {
  # This function builds a vector of autoencoder models, one per layer.
  get_stacked_ae_array <- function(training_data, layers, args) {
    vector <- c()
    index <- 0
    for (i in 1:length(layers)) {
      index <- index + 1
      ae_model <- do.call(h2o.deeplearning,
                          modifyList(
                            list(
                              x = names(training_data),
                              training_frame = training_data,
                              autoencoder = TRUE,
                              hidden = layers[i]
                            ),
                            args
                          ))
      # Each model has a single hidden layer, so extract layer 1
      # (layer = 3 is only valid for a net with at least three hidden layers).
      training_data <- h2o.deepfeatures(ae_model, training_data, layer = 1)
      names(training_data) <-
        gsub("DF", paste0("L", index), names(training_data))
      vector <- c(vector, ae_model)
    }
    vector  # return the models; ending with cat(length(vector)) would return NULL
  }

  # This function returns the final encoded contents.
  apply_stacked_ae_array <- function(data, ae) {
    index <- 0
    for (i in 1:length(ae)) {
      index <- index + 1
      data <- h2o.deepfeatures(ae[[i]], data, layer = 1)
      names(data) <-
        gsub("DF", paste0("L", index), names(data))
    }
    data
  }

  TRAIN <- "E:/Chiranjibi file/Geometric features/Lu/Train/d_features.csv"
  TEST  <- "E:/Chiranjibi file/Geometric features/Lu/Test/d_features.csv"
  response <- 51201

  # Set to TRUE for RUnit, FALSE for a stand-alone demo.
  if (TRUE) {
    train_hex <- h2o.importFile(TRAIN)
    test_hex  <- h2o.importFile(TEST)
  } else {
    library(h2o)
    h2o.init()
    homedir <- paste0(path.expand("~"), "/h2o-dev/")  # modify if needed
    train_hex <- h2o.importFile(path = paste0(homedir, TRAIN),
                                header = FALSE, sep = ",")
    test_hex  <- h2o.importFile(path = paste0(homedir, TEST),
                                header = FALSE, sep = ",")
  }

  train <- train_hex[, -response]
  test  <- test_hex[, -response]
  train_hex[, response] <- as.factor(train_hex[, response])
  test_hex[, response]  <- as.factor(test_hex[, response])

  ## Build a reference model on the full dataset and evaluate it on the test set.
  model_ref <- h2o.deeplearning(
    training_frame = train_hex,
    x = 1:(ncol(train_hex) - 1),
    y = response,
    hidden = c(67),
    epochs = 50
  )
  p_ref <- h2o.performance(model_ref, test_hex)
  h2o.logloss(p_ref)

  ## Now build a stacked autoencoder from seven single-layer AE models, compressing
  ## the 51,200 predictors through 50000 -> 20000 -> 10000 -> 5000 -> 2000 -> 1000 -> 500.
  layers <- c(50000, 20000, 10000, 5000, 2000, 1000, 500)
  args <- list(activation = "Tanh",
               epochs = 1,
               l1 = 1e-5)
  ae <- get_stacked_ae_array(train, layers, args)

  ## Now compress the training/testing data with this 7-stage set of AE models.
  train_compressed <- apply_stacked_ae_array(train, ae)
  test_compressed  <- apply_stacked_ae_array(test, ae)

  ## Build a simple model using these new features (compressed training data)
  ## and evaluate it on the compressed test set.
  train_w_resp <- h2o.cbind(train_compressed, train_hex[, response])
  test_w_resp  <- h2o.cbind(test_compressed, test_hex[, response])
  model_on_compressed_data <- h2o.deeplearning(
    training_frame = train_w_resp,
    x = 1:(ncol(train_w_resp) - 1),
    y = ncol(train_w_resp),
    hidden = c(67),
    epochs = 1
  )
  p <- h2o.performance(model_on_compressed_data, test_w_resp)
  h2o.logloss(p)
}

#h2o.describe(train)
#doTest("Deep Learning Stacked Autoencoder", check.deeplearning_stacked_autoencoder)
Since your dataset has 51,200 features, and your layers array has 50,000 as its first value, the first set of network connections holds 51200 * 50000 == 2.56e9 weights.
That is far too many; try something much smaller.
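A quick back-of-the-envelope check makes the scale concrete (assuming single-precision weights; biases and H2O's own overhead are ignored):

n_features   <- 51200
first_hidden <- 50000
n_features * first_hidden               # 2.56e9 weights in the first weight matrix alone
n_features * first_hidden * 4 / 1024^3  # ~9.5 GB just to store that one matrix

For comparison, the demo this script was adapted from compresses 717 predictors with layers <- c(200, 100, 50), i.e. every layer is much smaller than the input.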
As Tom says, the first layer of your autoencoder is too big.
51,200 is a lot of features. How much correlation is there between them? The more correlation you have, the smaller the first layer of your autoencoder can be.
Try h2o.prcomp() and see how many dimensions cover 99% of the variance; that is usually a good guide to how big your first layer can/should be.
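A minimal sketch of that check (the k = 1000 cap, the transform, and the pca_method choice are assumptions; tune them for your data):

pca <- h2o.prcomp(training_frame = train, k = 1000,
                  transform = "STANDARDIZE",
                  pca_method = "Randomized",
                  impute_missing = TRUE)
imp <- pca@model$importance   # std dev / proportion / cumulative proportion per PC
cum <- as.numeric(imp[3, ])   # row 3 is the cumulative proportion of variance
which(cum >= 0.99)[1]         # candidate size for your first layer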
Or, if you prefer a more experimental approach:
- Start with, say, a single layer of 200 neurons.
- Watch the MSE it reaches after enough epochs that it has stopped improving.
- Double the number of neurons in that layer.
- See whether the MSE improves. If not, stop there.
- If it does, double again and repeat.
You can then try moving on to multiple layers. But there is not much point in using a first layer any bigger than the best single layer you could find; a sketch of this search is below.
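A minimal sketch of that doubling search, assuming Tanh activation and early stopping on reconstruction MSE (find_layer_size is an illustrative helper, not an H2O function):

find_layer_size <- function(frame, start = 200, max_neurons = 3200) {
  best_size <- start
  best_mse  <- Inf
  size <- start
  repeat {
    ae <- h2o.deeplearning(x = names(frame), training_frame = frame,
                           autoencoder = TRUE, activation = "Tanh",
                           hidden = size, epochs = 50,
                           stopping_metric = "MSE",
                           stopping_rounds = 3,
                           stopping_tolerance = 1e-3)
    mse <- h2o.mse(ae)                      # reconstruction MSE on the training data
    cat("hidden =", size, "-> MSE =", mse, "\n")
    if (mse >= best_mse) break              # doubling stopped helping; keep previous size
    best_mse  <- mse
    best_size <- size
    if (size * 2 > max_neurons) break
    size <- size * 2
  }
  best_size
}

first_layer <- find_layer_size(train)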