h2o target_encode_apply kfold 生成重复项

h2o target_encode_apply kfold generates duplicates

我正在按照以下步骤使用目标编码:Target Encoding

编辑:示例代码

请注意,应用目标编码之后,测试数据集的行数从约 40k 增加到了约 200k。从示例输出中也可以看到,ID 为 2320 的记录重复出现了 5 次。

# Load the h2o package and connect to an H2O cluster.
library(h2o)
h2o.init()
#>  Connection successful!
#> 
#> R is connected to the H2O cluster: 


# Read the loan data from a CSV file with readr (column types are guessed
# and reported in the output that follows).
loan <- readr::read_csv("/loan.csv")
#> Parsed with column specification:
#> cols(
#>   loan_amnt = col_integer(),
#>   term = col_character(),
#>   int_rate = col_double(),
#>   emp_length = col_integer(),
#>   home_ownership = col_character(),
#>   annual_inc = col_double(),
#>   purpose = col_character(),
#>   addr_state = col_character(),
#>   dti = col_double(),
#>   delinq_2yrs = col_integer(),
#>   revol_util = col_double(),
#>   total_acc = col_integer(),
#>   bad_loan = col_integer(),
#>   longest_credit_length = col_integer(),
#>   verification_status = col_character()
#> )

# Add a sequential row ID (1..nrow(loan)) so individual records can be
# identified in later output, then print a compact overview of the columns.
loan$ID <- seq.int(nrow(loan))
dplyr::glimpse(loan)
#> Observations: 163,987
#> Variables: 16
#> $ loan_amnt             <int> 5000, 2500, 2400, 10000, 5000, 3000, 560...
#> $ term                  <chr> "36 months", "60 months", "36 months", "...
#> $ int_rate              <dbl> 10.65, 15.27, 15.96, 13.49, 7.90, 18.64,...
#> $ emp_length            <int> 10, 0, 10, 10, 3, 9, 4, 0, 5, 10, 0, 3, ...
#> $ home_ownership        <chr> "RENT", "RENT", "RENT", "RENT", "RENT", ...
#> $ annual_inc            <dbl> 24000.00, 30000.00, 12252.00, 49200.00, ...
#> $ purpose               <chr> "credit_card", "car", "small_business", ...
#> $ addr_state            <chr> "AZ", "GA", "IL", "CA", "AZ", "CA", "CA"...
#> $ dti                   <dbl> 27.65, 1.00, 8.72, 20.00, 11.20, 5.35, 5...
#> $ delinq_2yrs           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
#> $ revol_util            <dbl> 83.70, 9.40, 98.50, 21.00, 28.30, 87.50,...
#> $ total_acc             <int> 9, 4, 10, 37, 12, 4, 13, 3, 23, 34, 9, 1...
#> $ bad_loan              <int> 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0...
#> $ longest_credit_length <int> 26, 12, 10, 15, 7, 4, 7, 7, 13, 22, 7, 8...
#> $ verification_status   <chr> "verified", "verified", "not verified", ...
#> $ ID                    <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
# Convert the data to an H2OFrame and turn the response (bad_loan) and the
# column to be target-encoded (addr_state) into factors.
df <- as.h2o(loan)
df$bad_loan <- as.factor(df$bad_loan)
df$addr_state <- as.factor(df$addr_state)

# Split Frame into training and testing
# (ratios = 0.75 puts roughly 75% of the rows in the first frame; the rest go
# to the second frame. A fixed seed is passed to the split.)
splits <- h2o.splitFrame(df, seed = 1234,
                         destination_frames=c("train.hex", "test.hex"),
                         ratios = 0.75)
train <- splits[[1]]
test <- splits[[2]]

# Name of the response column and the list of candidate predictor columns.
# NOTE(review): `predictors` is not referenced again in the code shown here.
response <- "bad_loan"
predictors <- c("loan_amnt", "int_rate", "emp_length", "annual_inc", "dti",
                "delinq_2yrs", "revol_util", "total_acc", "longest_credit_length",
                "verification_status", "term", "purpose", "home_ownership",
                "addr_state")


# Assign every training row to one of 5 folds, then build the target-encoding
# map for addr_state against the response. Because fold_column is supplied,
# the map contains one numerator/denominator pair per (addr_state, fold)
# combination, as the head() output shows.
train$fold <- h2o.kfold_column(train, 5, seed = 1234)
te_map <- h2o.target_encode_create(train, x = list("addr_state"),
                                   y = response, fold_column = "fold")
head(te_map$addr_state)
#>   addr_state fold numerator denominator
#> 1         AK    0         7          52
#> 2         AK    1         8          55
#> 3         AK    2         7          56
#> 4         AK    3        13          68
#> 5         AK    4         8          70
#> 6         AL    0        57         297

# Apply the encoding to the training frame with KFold holdout, using the same
# fold column that was used to build te_map.
ext_train <- h2o.target_encode_apply(train, x = list("addr_state"), y = response,
                                     target_encode_map = te_map, holdout_type = "KFold",
                                     fold_column = "fold",
                                     blended_avg = TRUE, noise_level = 0, seed = 1234)
#> Warning in h2o.target_encode_apply(train, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset

# Show the encoded value next to the level and fold it belongs to.
head(ext_train[c("addr_state", "fold", "TargetEncode_addr_state")])
#>   addr_state fold TargetEncode_addr_state
#> 1         AK    0               0.1445783
#> 2         AK    0               0.1445783
#> 3         AK    0               0.1445783
#> 4         AK    0               0.1445783
#> 5         AK    0               0.1445783
#> 6         AK    0               0.1445783

# Row count of the test frame before encoding.
nrow.H2OFrame(test)
#> [1] 40925
# Apply the training map to the test frame with no holdout. fold_column is
# commented out in this call, while te_map was built per (addr_state, fold).
ext_test <- h2o.target_encode_apply(test, x = list("addr_state"), y = response,
                                    target_encode_map = te_map, holdout_type = "None",
                                    #fold_column = "fold",
                                    blended_avg = FALSE, noise_level = 0)
#> Warning in h2o.target_encode_apply(test, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Row count after encoding: roughly five times the input (204614 vs 40925).
nrow.H2OFrame(ext_test)
#> [1] 204614

# In the output that follows, ID 2320 appears once for each fold value (0-4),
# each time with a different TargetEncode_addr_state.
head(ext_test)
#>   addr_state loan_amnt int_rate emp_length annual_inc   dti delinq_2yrs
#> 1         AK     14000    12.42          9      72000 19.80           0
#> 2         AK     14000    12.42          9      72000 19.80           0
#> 3         AK     14000    12.42          9      72000 19.80           0
#> 4         AK     14000    12.42          9      72000 19.80           0
#> 5         AK     14000    12.42          9      72000 19.80           0
#> 6         AK     16000     7.90          3      35500  6.59           0
#>   revol_util total_acc bad_loan longest_credit_length   ID fold
#> 1       74.6        26        0                    17 2320    0
#> 2       74.6        26        0                    17 2320    1
#> 3       74.6        26        0                    17 2320    2
#> 4       74.6        26        0                    17 2320    3
#> 5       74.6        26        0                    17 2320    4
#> 6       18.1        26        0                    14 2574    0
#>   TargetEncode_addr_state
#> 1               0.1346154
#> 2               0.1454545
#> 3               0.1250000
#> 4               0.1911765
#> 5               0.1142857
#> 6               0.1346154

由 reprex 包 (v0.2.0) 创建于 2019-03-14。

以下代码对我有效,没有产生重复行。与您发布的代码相比,唯一的主要区别是我取消了 `fold_column = "fold",` 这一行的注释:

# Load the h2o package and initialise the connection to H2O.
library(h2o)

h2o.init()

# Read the loan data from a CSV file in the working directory.
loan <- readr::read_csv("loan.csv")
#> Parsed with column specification:
#> cols(
#>   loan_amnt = col_double(),
#>   term = col_character(),
#>   int_rate = col_double(),
#>   emp_length = col_double(),
#>   home_ownership = col_character(),
#>   annual_inc = col_double(),
#>   purpose = col_character(),
#>   addr_state = col_character(),
#>   dti = col_double(),
#>   delinq_2yrs = col_double(),
#>   revol_util = col_double(),
#>   total_acc = col_double(),
#>   bad_loan = col_double(),
#>   longest_credit_length = col_double(),
#>   verification_status = col_character()
#> )

# Add a sequential row ID (1..nrow(loan)) and print a compact overview of
# the columns.
loan$ID <- seq.int(nrow(loan))
dplyr::glimpse(loan)
#> Observations: 163,987
#> Variables: 16
#> $ loan_amnt             <dbl> 5000, 2500, 2400, 10000, 5000, 3000, 5600,…
#> $ term                  <chr> "36 months", "60 months", "36 months", "36…
#> $ int_rate              <dbl> 10.65, 15.27, 15.96, 13.49, 7.90, 18.64, 2…
#> $ emp_length            <dbl> 10, 0, 10, 10, 3, 9, 4, 0, 5, 10, 0, 3, 3,…
#> $ home_ownership        <chr> "RENT", "RENT", "RENT", "RENT", "RENT", "R…
#> $ annual_inc            <dbl> 24000.00, 30000.00, 12252.00, 49200.00, 36…
#> $ purpose               <chr> "credit_card", "car", "small_business", "o…
#> $ addr_state            <chr> "AZ", "GA", "IL", "CA", "AZ", "CA", "CA", …
#> $ dti                   <dbl> 27.65, 1.00, 8.72, 20.00, 11.20, 5.35, 5.5…
#> $ delinq_2yrs           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ revol_util            <dbl> 83.70, 9.40, 98.50, 21.00, 28.30, 87.50, 3…
#> $ total_acc             <dbl> 9, 4, 10, 37, 12, 4, 13, 3, 23, 34, 9, 11,…
#> $ bad_loan              <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, …
#> $ longest_credit_length <dbl> 26, 12, 10, 15, 7, 4, 7, 7, 13, 22, 7, 8, …
#> $ verification_status   <chr> "verified", "verified", "not verified", "v…
#> $ ID                    <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…

# Convert the data to an H2OFrame.
df <- as.h2o(loan)

# The response (bad_loan) and the column to be target-encoded (addr_state)
# are converted to factors.
df$bad_loan <- as.factor(df$bad_loan)
df$addr_state <- as.factor(df$addr_state)

# Split Frame into training and testing
# (ratios = 0.75 puts roughly 75% of the rows in the first frame; the rest go
# to the second frame. A fixed seed is passed to the split.)
splits <- h2o.splitFrame(df, seed = 1234,
                         destination_frames=c("train.hex", "test.hex"),
                         ratios = 0.75)
train <- splits[[1]]
test <- splits[[2]]

# Name of the response column and the list of candidate predictor columns.
# NOTE(review): `predictors` is not referenced again in the code shown here.
response <- "bad_loan"
predictors <- c("loan_amnt", "int_rate", "emp_length", "annual_inc", "dti",
                "delinq_2yrs", "revol_util", "total_acc", "longest_credit_length",
                "verification_status", "term", "purpose", "home_ownership",
                "addr_state")


# Assign every training row to one of 5 folds, then build the target-encoding
# map for addr_state against the response, keyed by the fold column.
train$fold <- h2o.kfold_column(train, 5, seed = 1234)
te_map <- h2o.target_encode_create(train, x = list("addr_state"),
                                   y = response, fold_column = "fold")

# Apply the encoding to the training frame with KFold holdout, using the same
# fold column that was used to build te_map.
ext_train <- h2o.target_encode_apply(train, x = list("addr_state"), y = response,
                                     target_encode_map = te_map, holdout_type = "KFold",
                                     fold_column = "fold",
                                     blended_avg = TRUE, noise_level = 0, seed = 1234)
#> Warning in h2o.target_encode_apply(train, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset

# Apply the training map to the test frame with no holdout. fold_column is
# passed here even though holdout_type is "None"; with it, the encoded frame
# has the same number of rows as `test` (see the row counts printed at the
# end of the script).
ext_test <- h2o.target_encode_apply(test, x = list("addr_state"), y = response,
                                    target_encode_map = te_map, holdout_type = "None",
                                    fold_column = "fold",
                                    blended_avg = FALSE, noise_level = 0)
#> Warning in h2o.target_encode_apply(test, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset

# Compare row counts before and after encoding; equal counts mean no rows
# were duplicated.
nrow.H2OFrame(test)
#> [1] 40925
nrow.H2OFrame(ext_test)
#> [1] 40925

由 reprex 包 (v0.2.1) 创建于 2019-03-21。