使用 h2o 包重现 Airlines Delay h2o flow 示例不匹配

Reproducing the Airlines Delay h2o flow example with h2o package does not match

以下脚本重现了 h2o 帮助中的示例流程(Help -> View Example Flow,或 Help -> Browse Installed packs.. -> examples -> Airlines Delay.flow,可下载)所描述的等效问题,但使用的是 h2o R 包和固定种子(123456):

library(h2o)
# Start a local H2O cluster that uses every available core
h2o.init(nthreads = -1, max_mem_size = "12g")

# Data-source toggle: switch(1, FALSE, TRUE) yields FALSE (remote file),
# switch(2, FALSE, TRUE) would yield TRUE (local file)
IS_LOCAL_FILE <- switch(1, FALSE, TRUE)
if (!IS_LOCAL_FILE) {
  airlines_url <- "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
  allyears2k.hex <- h2o.importFile(
    path = airlines_url,
    destination_frame = "allyears2k.hex"
  )
} else {
  local_data <- read.csv(file = "allyears2k.csv", stringsAsFactors = FALSE)
  allyears2k.hex <- as.h2o(local_data, destination_frame = "allyears2k.hex")
}

response <- "IsDepDelayed"

# Columns to leave out of the model
# (copied and pasted from the flow, then converted to R syntax)
excluded_cols <- c(
  "DayofMonth", "DepTime", "CRSDepTime", "ArrTime", "CRSArrTime",
  "TailNum", "ActualElapsedTime", "CRSElapsedTime",
  "AirTime", "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut",
  "Cancelled", "CancellationCode", "Diverted", "CarrierDelay",
  "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay",
  "IsArrDelayed"
)

# Predictors: every column except the response and the excluded ones
predictors <- setdiff(setdiff(names(allyears2k.hex), response), excluded_cols)

# The response has to be a factor for classification
allyears2k.hex[, response] <- as.factor(allyears2k.hex[, response])

# Model parameters copied and pasted from the flow, then converted to R syntax
fit1 <- h2o.glm(
  x = predictors,
  y = response,
  training_frame = allyears2k.hex,
  model_id = "glm_model",
  seed = 123456,
  family = "binomial",
  link = "family_default",
  solver = "IRLSM",
  alpha = 0.5,
  lambda = 0.00001,
  lambda_search = FALSE,
  standardize = TRUE,
  intercept = TRUE,
  non_negative = FALSE,
  ignore_const_cols = TRUE,
  score_each_iteration = FALSE,
  max_iterations = -1,
  max_active_predictors = -1,
  objective_epsilon = 0.00001,
  beta_epsilon = 0.0001,
  gradient_epsilon = 0.0001,
  prior = -1
)

# Analysis: confusion matrix and model summary for the training data
training_cm <- h2o.confusionMatrix(fit1)
print("Confusion Matrix for training dataset")
print(training_cm)
print(summary(fit1))
h2o.shutdown()

这是训练集的混淆矩阵:

 Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       NO   YES    Error          Rate
NO      0 20887 1.000000  =20887/20887
YES     0 23091 0.000000      =0/23091
Totals  0 43978 0.474942  =20887/43978

指标:

H2OBinomialMetrics: glm
** Reported on training data. **

MSE:  0.2473858
RMSE:  0.4973789
LogLoss:  0.6878898
Mean Per-Class Error:  0.5
AUC:  0.5550138
Gini:  0.1100276
R^2:  0.007965165
Residual Deviance:  60504.04
AIC:  60516.04

相反,h2o flow 的结果具有更好的性能:

和最大 f1 阈值的混淆矩阵:

h2o Flow 的性能比使用等效的 R 包函数运行相同算法要好得多。

注意:为了简单起见,我使用的是 Airlines Delay 问题,这是一个众所周知的 h2o 示例问题,但我发现在其他使用 glm 算法的类似场景中也存在这种显著差异。

有人知道为什么会出现这些显著差异吗?

附录 A:使用默认模型参数

按照@DarrenCook 回答的建议,除了排除列和种子外,只使用默认构建参数:

h2o Flow

现在 buildModel 是这样调用的:

# Flow cell: GLM with default build parameters; only the model id, seed,
# training frame, ignored columns, response column and family are given.
buildModel 'glm', {
  "model_id": "glm_model-default",
  "seed": "123456",
  "training_frame": "allyears2k.hex",
  "ignored_columns": [
    "DayofMonth", "DepTime", "CRSDepTime", "ArrTime", "CRSArrTime",
    "TailNum", "ActualElapsedTime", "CRSElapsedTime", "AirTime",
    "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut", "Cancelled",
    "CancellationCode", "Diverted", "CarrierDelay", "WeatherDelay",
    "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
  ],
  "response_column": "IsDepDelayed",
  "family": "binomial"
}

结果是:

和训练指标:

运行 R-Script

以下脚本可以通过 IS_DEFAULT_MODEL 变量,在默认配置与 Airlines Delay 示例中给出的配置之间轻松切换:

library(h2o)
# Start a local H2O cluster that uses every available core
h2o.init(nthreads = -1, max_mem_size = "12g")

# Toggles: switch(1, FALSE, TRUE) yields FALSE, switch(2, FALSE, TRUE) yields TRUE
IS_LOCAL_FILE <- switch(2, FALSE, TRUE)
IS_DEFAULT_MODEL <- switch(2, FALSE, TRUE)

if (!IS_LOCAL_FILE) {
  airlines_url <- "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
  allyears2k.hex <- h2o.importFile(
    path = airlines_url,
    destination_frame = "allyears2k.hex"
  )
} else {
  local_data <- read.csv(file = "allyears2k.csv", stringsAsFactors = FALSE)
  allyears2k.hex <- as.h2o(local_data, destination_frame = "allyears2k.hex")
}

response <- "IsDepDelayed"

# Columns to leave out of the model
# (copied and pasted from the flow, then converted to R syntax)
excluded_cols <- c(
  "DayofMonth", "DepTime", "CRSDepTime", "ArrTime", "CRSArrTime",
  "TailNum", "ActualElapsedTime", "CRSElapsedTime",
  "AirTime", "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut",
  "Cancelled", "CancellationCode", "Diverted", "CarrierDelay",
  "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay",
  "IsArrDelayed"
)

# Predictors: every column except the response and the excluded ones
predictors <- setdiff(setdiff(names(allyears2k.hex), response), excluded_cols)

# The response has to be a factor for classification
allyears2k.hex[, response] <- as.factor(allyears2k.hex[, response])

if (!IS_DEFAULT_MODEL) {
  # Model parameters copied and pasted from the flow, then converted to R syntax
  fit1 <- h2o.glm(
    x = predictors,
    y = response,
    training_frame = allyears2k.hex,
    model_id = "glm_model",
    seed = 123456,
    family = "binomial",
    link = "family_default",
    solver = "IRLSM",
    alpha = 0.5,
    lambda = 0.00001,
    lambda_search = FALSE,
    standardize = TRUE,
    intercept = TRUE,
    non_negative = FALSE,
    ignore_const_cols = TRUE,
    score_each_iteration = FALSE,
    max_iterations = -1,
    max_active_predictors = -1,
    objective_epsilon = 0.00001,
    beta_epsilon = 0.0001,
    gradient_epsilon = 0.0001,
    prior = -1
  )
} else {
  # Default build parameters: only the model id, seed and family are set
  fit1 <- h2o.glm(
    x = predictors,
    y = response,
    training_frame = allyears2k.hex,
    model_id = "glm_model",
    seed = 123456,
    family = "binomial"
  )
}

# Analysis: confusion matrix and model summary for the training data
training_cm <- h2o.confusionMatrix(fit1)
print("Confusion Matrix for training dataset")
print(training_cm)
print(summary(fit1))
h2o.shutdown()

它产生以下结果:

MSE:  0.2473859
RMSE:  0.497379
LogLoss:  0.6878898
Mean Per-Class Error:  0.5
AUC:  0.5549898
Gini:  0.1099796
R^2:  0.007964984
Residual Deviance:  60504.04
AIC:  60516.04

Confusion Matrix (vertical: actual; across: predicted) 
for F1-optimal threshold:
       NO   YES    Error          Rate
NO      0 20887 1.000000  =20887/20887
YES     0 23091 0.000000      =0/23091
Totals  0 43978 0.474942  =20887/43978

有些指标很接近,但混淆矩阵却大不相同,R-Script 预测所有航班都延误了。

附录 B:配置

Package: h2o
Version: 3.18.0.4
Type: Package
Title: R Interface for H2O
Date: 2018-03-08

注意:我也在 3.19.0.4231 下测试了 R-Script,结果相同

以下是在 R 中运行后的集群信息:

> h2o.init(max_mem_size = "12g", nthreads = -1)

R is connected to the H2O cluster: 
H2O cluster version:        3.18.0.4 
...
H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
R Version:                  R version 3.3.3 (2017-03-06)

故障排除提示:首先构建 all-defaults 模型:

# All-defaults GLM: apart from the data arguments, only the family is given
mDef <- h2o.glm(
  x = predictors,
  y = response,
  training_frame = allyears2k.hex,
  family = "binomial"
)

这需要 2 秒,并给出与 Flow 屏幕截图中几乎完全相同的 AUC 和混淆矩阵。

所以,我们现在知道您看到的问题是由于您所做的所有模型自定义造成的...

...只不过,当我构建您的 fit1 时,得到的结果与我的默认模型基本相同:

         NO   YES    Error          Rate
NO     4276 16611 0.795279  =16611/20887
YES    1573 21518 0.068122   =1573/23091
Totals 5849 38129 0.413479  =18184/43978

这完全按照给定的方式使用您的脚本,因此它获取了远程 csv 文件。 (哦,我删除了 max_mem_size 参数,因为我的笔记本上没有 12g!)

假设您完全按照发布的代码运行(并且是在新的 R 会话中,使用新启动的 H2O 集群),确实能得到您发布的结果,那么一个可能的解释是:您使用的是 3.19.x,而最新的稳定版是 3.18.0.2?(我的测试用的是 3.14.0.1)

最后,我想这就是原因:两者用于构建模型的参数配置相同(所以这不是问题所在),但 h2o Flow 在解析时使用了特定的自定义设置,将某些变量转换为 Enum,而 R-Script 没有这样指定。

h2o Flow 示例中的 Airlines Delay 问题使用以下变量作为预测变量(流程中定义了 ignored_columns):

"Year", "Month", "DayOfWeek", "UniqueCarrier", 
   "FlightNum", "Origin", "Dest", "Distance"

其中除 Distance 之外,所有预测变量都应解析为 Enum。因此 R-Script 需要将这些列从 numeric 或 char 转换为 factor。

使用 h2o R 包执行

此处 R-Script 已更新:

library(h2o)
# Start a local H2O cluster that uses every available core
h2o.init(nthreads = -1, max_mem_size = "12g")

# Toggles: switch(1, FALSE, TRUE) yields FALSE, switch(2, FALSE, TRUE) yields TRUE
IS_LOCAL_FILE <- switch(2, FALSE, TRUE)
IS_DEFAULT_MODEL <- switch(2, FALSE, TRUE)

if (!IS_LOCAL_FILE) {
  airlines_url <- "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
  allyears2k.hex <- h2o.importFile(
    path = airlines_url,
    destination_frame = "allyears2k.hex"
  )
} else {
  local_data <- read.csv(file = "allyears2k.csv", stringsAsFactors = TRUE)
  allyears2k.hex <- as.h2o(local_data, destination_frame = "allyears2k.hex")
}

response <- "IsDepDelayed"

# Columns to leave out of the model
# (copied and pasted from the flow, then converted to R syntax)
excluded_cols <- c(
  "DayofMonth", "DepTime", "CRSDepTime",
  "ArrTime", "CRSArrTime",
  "TailNum", "ActualElapsedTime", "CRSElapsedTime",
  "AirTime", "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut",
  "Cancelled", "CancellationCode", "Diverted", "CarrierDelay",
  "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay",
  "IsArrDelayed"
)

# Predictors: every column except the response and the excluded ones
predictors <- setdiff(setdiff(names(allyears2k.hex), response), excluded_cols)

# Columns coerced to factor (the equivalent of Enum in h2o Flow),
# including the response
factor_cols <- c(
  "Year", "Month", "DayofMonth", "DayOfWeek",
  "UniqueCarrier", "FlightNum", "Origin", "Dest", response
)
# A plain for loop is used on purpose: per the original author's note,
# doing this with lapply does not work.
for (col in factor_cols) {
  allyears2k.hex[col] <- as.factor(allyears2k.hex[col])
}

if (!IS_DEFAULT_MODEL) {
  # Model parameters copied and pasted from the flow, then converted to R syntax
  fit1 <- h2o.glm(
    x = predictors,
    y = response,
    training_frame = allyears2k.hex,
    model_id = "glm_model",
    seed = 123456,
    family = "binomial",
    link = "family_default",
    solver = "IRLSM",
    alpha = 0.5,
    lambda = 0.00001,
    lambda_search = FALSE,
    standardize = TRUE,
    intercept = TRUE,
    non_negative = FALSE,
    ignore_const_cols = TRUE,
    score_each_iteration = FALSE,
    max_iterations = -1,
    max_active_predictors = -1,
    objective_epsilon = 0.00001,
    beta_epsilon = 0.0001,
    gradient_epsilon = 0.0001,
    prior = -1
  )
} else {
  # Default build parameters: only the seed and family are set
  fit1 <- h2o.glm(
    x = predictors,
    y = response,
    training_frame = allyears2k.hex,
    family = "binomial",
    seed = 123456
  )
}

# Analysis: confusion matrix and model summary for the training data
print("Confusion Matrix for training dataset")
training_cm <- h2o.confusionMatrix(fit1)
print(training_cm)
print(summary(fit1))
h2o.shutdown()

以下是使用默认配置(IS_DEFAULT_MODEL 为 TRUE)运行 R-Script 的结果:

H2OBinomialMetrics: glm
** Reported on training data. **

MSE:                   0.2001145
RMSE:                  0.4473416
LogLoss:               0.5845852
Mean Per-Class Error:  0.3343562
AUC:                   0.7570867
Gini:                  0.5141734
R^2:                   0.1975266
Residual Deviance:     51417.77
AIC:                   52951.77

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
          NO   YES    Error          Rate
NO     10337 10550 0.505099  =10550/20887
YES     3778 19313 0.163614   =3778/23091
Totals 14115 29863 0.325799  =14328/43978

在 h2o Flow 下执行

现在执行流程 Airlines_Delay_GLMFixedSeed,我们可以得到相同的结果。以下是该 Flow 配置的详细信息:

parseFiles函数:

# Flow cell: parse the remote airlines CSV into "allyears2k.hex" with explicit
# column names and column types (31 entries each).
# NOTE(review): the types presumably apply to the names by position - confirm.
parseFiles
  paths: ["https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"]
  destination_frame: "allyears2k.hex"
  parse_type: "CSV"
  separator: 44
  number_columns: 31
  single_quotes: false
  column_names: ["Year","Month","DayofMonth","DayOfWeek","DepTime","CRSDepTime",
    "ArrTime","CRSArrTime","UniqueCarrier","FlightNum","TailNum",
    "ActualElapsedTime","CRSElapsedTime","AirTime","ArrDelay","DepDelay",
    "Origin","Dest","Distance","TaxiIn","TaxiOut","Cancelled",
    "CancellationCode","Diverted","CarrierDelay","WeatherDelay","NASDelay",
    "SecurityDelay","LateAircraftDelay","IsArrDelayed","IsDepDelayed"]
  column_types: ["Enum","Enum","Enum","Enum","Numeric","Numeric",
    "Numeric","Numeric","Enum","Enum","Enum","Numeric",
    "Numeric","Numeric","Numeric","Numeric",
    "Enum","Enum","Numeric","Numeric","Numeric",
    "Enum","Enum","Numeric","Numeric","Numeric",
    "Numeric","Numeric","Numeric","Enum","Enum"]
  delete_on_done: true
  check_header: 1
  chunk_size: 4194304

其中以下预测列被转换为 Enum:"Year", "Month", "DayOfWeek", "UniqueCarrier", "FlightNum", "Origin", "Dest"

现在按如下方式调用 buildModel 函数,除 ignored_columns 和 seed 之外均使用默认参数:

 # Flow cell: GLM with default build parameters; only the model id, seed,
 # training frame, ignored columns, response column and family are given.
 buildModel 'glm', {
   "model_id": "glm_model-default",
   "seed": "123456",
   "training_frame": "allyears2k.hex",
   "ignored_columns": [
     "DayofMonth", "DepTime", "CRSDepTime", "ArrTime", "CRSArrTime",
     "TailNum", "ActualElapsedTime", "CRSElapsedTime", "AirTime",
     "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut", "Cancelled",
     "CancellationCode", "Diverted", "CarrierDelay", "WeatherDelay",
     "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
   ],
   "response_column": "IsDepDelayed",
   "family": "binomial"
 }

最后我们得到如下结果:

和训练输出指标:

model                   glm_model-default
model_checksum          -2438376548367921152
frame                   allyears2k.hex
frame_checksum          -2331137066674151424
description             ·
model_category          Binomial
scoring_time            1521598137667
predictions             ·
MSE                     0.200114
RMSE                    0.447342
nobs                    43978
custom_metric_name      ·
custom_metric_value     0
r2                      0.197527
logloss                 0.584585
AUC                     0.757084
Gini                    0.514168
mean_per_class_error    0.334347
residual_deviance       51417.772427
null_deviance           60855.951538
AIC                     52951.772427
null_degrees_of_freedom 43977
residual_degrees_of_freedom 43211

比较两个结果

前 4 位有效数字的训练指标几乎相同:

                       R-Script   H2o Flow
MSE:                   0.2001145  0.200114
RMSE:                  0.4473416  0.447342
LogLoss:               0.5845852  0.584585
Mean Per-Class Error:  0.3343562  0.334347
AUC:                   0.7570867  0.757084
Gini:                  0.5141734  0.514168
R^2:                   0.1975266  0.197527
Residual Deviance:     51417.77   51417.772427
AIC:                   52951.77   52951.772427

混淆矩阵略有不同:

          TN     TP    FP    FN   
R-Script  10337  19313 10550 3778
H2o Flow  10341  19309 10546 3782

          Error
R-Script  0.325799  
H2o Flow  0.3258

我的理解是差异在可接受的阈值范围内(大约 0.0001),因此我们可以说两个接口提供相同的结果。