h2o target_encode_apply kfold 生成重复项
h2o target_encode_apply kfold generates duplicates
我正在按照以下步骤使用目标编码:Target Encoding
编辑:示例代码
请注意,应用目标编码之后,测试数据集的行数从约 4 万增加到了约 20 万。另外,从示例输出中可以看到,ID 2320 重复出现了 5 次。
# Attach the h2o package and initialise the connection to an H2O cluster.
library(h2o)
h2o.init()
#> Connection successful!
#>
#> R is connected to the H2O cluster:
# Read the loan data from CSV with readr; the column types it inferred are
# listed in the parse message that follows.
loan <- readr::read_csv("/loan.csv")
#> Parsed with column specification:
#> cols(
#> loan_amnt = col_integer(),
#> term = col_character(),
#> int_rate = col_double(),
#> emp_length = col_integer(),
#> home_ownership = col_character(),
#> annual_inc = col_double(),
#> purpose = col_character(),
#> addr_state = col_character(),
#> dti = col_double(),
#> delinq_2yrs = col_integer(),
#> revol_util = col_double(),
#> total_acc = col_integer(),
#> bad_loan = col_integer(),
#> longest_credit_length = col_integer(),
#> verification_status = col_character()
#> )
# Add a 1-based integer row identifier so individual rows can be traced
# through the later steps. seq_len() is used rather than seq.int() because
# seq.int(0) evaluates to c(1, 0), which would break on a zero-row input,
# whereas seq_len(0) is an empty integer vector.
loan$ID <- seq_len(nrow(loan))
# Print a compact, one-line-per-column overview of the data.
dplyr::glimpse(loan)
#> Observations: 163,987
#> Variables: 16
#> $ loan_amnt <int> 5000, 2500, 2400, 10000, 5000, 3000, 560...
#> $ term <chr> "36 months", "60 months", "36 months", "...
#> $ int_rate <dbl> 10.65, 15.27, 15.96, 13.49, 7.90, 18.64,...
#> $ emp_length <int> 10, 0, 10, 10, 3, 9, 4, 0, 5, 10, 0, 3, ...
#> $ home_ownership <chr> "RENT", "RENT", "RENT", "RENT", "RENT", ...
#> $ annual_inc <dbl> 24000.00, 30000.00, 12252.00, 49200.00, ...
#> $ purpose <chr> "credit_card", "car", "small_business", ...
#> $ addr_state <chr> "AZ", "GA", "IL", "CA", "AZ", "CA", "CA"...
#> $ dti <dbl> 27.65, 1.00, 8.72, 20.00, 11.20, 5.35, 5...
#> $ delinq_2yrs <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
#> $ revol_util <dbl> 83.70, 9.40, 98.50, 21.00, 28.30, 87.50,...
#> $ total_acc <int> 9, 4, 10, 37, 12, 4, 13, 3, 23, 34, 9, 1...
#> $ bad_loan <int> 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0...
#> $ longest_credit_length <int> 26, 12, 10, 15, 7, 4, 7, 7, 13, 22, 7, 8...
#> $ verification_status <chr> "verified", "verified", "not verified", ...
#> $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
# Upload the data to H2O and convert the response and the column that will
# be target-encoded into factors.
df <- as.h2o(loan)
df$bad_loan <- as.factor(df$bad_loan)
df$addr_state <- as.factor(df$addr_state)

# Split the frame into a training part (ratio 0.75) and a testing part.
splits <- h2o.splitFrame(
  df,
  ratios = 0.75,
  destination_frames = c("train.hex", "test.hex"),
  seed = 1234
)
train <- splits[[1]]
test <- splits[[2]]

# Response column and candidate predictor columns.
response <- "bad_loan"
predictors <- c(
  "loan_amnt", "int_rate", "emp_length", "annual_inc", "dti",
  "delinq_2yrs", "revol_util", "total_acc", "longest_credit_length",
  "verification_status", "term", "purpose", "home_ownership",
  "addr_state"
)

# Add a 5-fold assignment column to the training frame, then build the
# target-encoding map for addr_state using that fold column. The printed head
# shows one numerator/denominator row per (addr_state, fold) pair.
train$fold <- h2o.kfold_column(train, 5, seed = 1234)
te_map <- h2o.target_encode_create(
  train,
  x = list("addr_state"),
  y = response,
  fold_column = "fold"
)
head(te_map$addr_state)
#> addr_state fold numerator denominator
#> 1 AK 0 7 52
#> 2 AK 1 8 55
#> 3 AK 2 7 56
#> 4 AK 3 13 68
#> 5 AK 4 8 70
#> 6 AL 0 57 297
# Apply the encoding map to the training frame with K-fold holdout on the
# "fold" column, blended averaging switched on and no added noise.
ext_train <- h2o.target_encode_apply(
  train,
  x = list("addr_state"),
  y = response,
  target_encode_map = te_map,
  holdout_type = "KFold",
  fold_column = "fold",
  blended_avg = TRUE,
  noise_level = 0,
  seed = 1234
)
#> Warning in h2o.target_encode_apply(train, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Inspect the encoded column next to its source column and the fold id.
head(ext_train[c("addr_state", "fold", "TargetEncode_addr_state")])
#> addr_state fold TargetEncode_addr_state
#> 1 AK 0 0.1445783
#> 2 AK 0 0.1445783
#> 3 AK 0 0.1445783
#> 4 AK 0 0.1445783
#> 5 AK 0 0.1445783
#> 6 AK 0 0.1445783
# Row count of the test frame before encoding.
nrow.H2OFrame(test)
#> [1] 40925
# NOTE(review): the encoding map printed earlier holds one row per
# (addr_state, fold) pair, but this call uses holdout_type = "None" with the
# fold_column argument commented out. The result below has 204614 rows (about
# five times the 40925 input rows) and shows ID 2320 once for each fold value
# 0-4, i.e. every test row appears once per fold entry of its addr_state.
# The script further down in this file that passes fold_column = "fold" to
# the same call reports 40925 rows. Presumably the function needs the fold
# column name to collapse the per-fold map first -- confirm in the h2o docs.
ext_test <- h2o.target_encode_apply(test, x = list("addr_state"), y = response,
target_encode_map = te_map, holdout_type = "None",
#fold_column = "fold",
blended_avg = FALSE, noise_level = 0)
#> Warning in h2o.target_encode_apply(test, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Row count after encoding, followed by the first rows of the encoded frame.
nrow.H2OFrame(ext_test)
#> [1] 204614
head(ext_test)
#> addr_state loan_amnt int_rate emp_length annual_inc dti delinq_2yrs
#> 1 AK 14000 12.42 9 72000 19.80 0
#> 2 AK 14000 12.42 9 72000 19.80 0
#> 3 AK 14000 12.42 9 72000 19.80 0
#> 4 AK 14000 12.42 9 72000 19.80 0
#> 5 AK 14000 12.42 9 72000 19.80 0
#> 6 AK 16000 7.90 3 35500 6.59 0
#> revol_util total_acc bad_loan longest_credit_length ID fold
#> 1 74.6 26 0 17 2320 0
#> 2 74.6 26 0 17 2320 1
#> 3 74.6 26 0 17 2320 2
#> 4 74.6 26 0 17 2320 3
#> 5 74.6 26 0 17 2320 4
#> 6 18.1 26 0 14 2574 0
#> TargetEncode_addr_state
#> 1 0.1346154
#> 2 0.1454545
#> 3 0.1250000
#> 4 0.1911765
#> 5 0.1142857
#> 6 0.1346154
由 reprex 包 (v0.2.0) 创建于 2019-03-14。
以下代码在我这里可以正常运行,没有出现重复行。与您发布的代码相比,唯一的主要区别是我取消了 fold_column = "fold" 这一行的注释:
# Attach the h2o package and initialise the connection to an H2O cluster.
library(h2o)
h2o.init()
# Read the loan data from CSV with readr; the column types it inferred are
# listed in the parse message that follows.
loan <- readr::read_csv("loan.csv")
#> Parsed with column specification:
#> cols(
#> loan_amnt = col_double(),
#> term = col_character(),
#> int_rate = col_double(),
#> emp_length = col_double(),
#> home_ownership = col_character(),
#> annual_inc = col_double(),
#> purpose = col_character(),
#> addr_state = col_character(),
#> dti = col_double(),
#> delinq_2yrs = col_double(),
#> revol_util = col_double(),
#> total_acc = col_double(),
#> bad_loan = col_double(),
#> longest_credit_length = col_double(),
#> verification_status = col_character()
#> )
# Add a 1-based integer row identifier so individual rows can be traced
# through the later steps. seq_len() is used rather than seq.int() because
# seq.int(0) evaluates to c(1, 0), which would break on a zero-row input,
# whereas seq_len(0) is an empty integer vector.
loan$ID <- seq_len(nrow(loan))
# Print a compact, one-line-per-column overview of the data.
dplyr::glimpse(loan)
#> Observations: 163,987
#> Variables: 16
#> $ loan_amnt <dbl> 5000, 2500, 2400, 10000, 5000, 3000, 5600,…
#> $ term <chr> "36 months", "60 months", "36 months", "36…
#> $ int_rate <dbl> 10.65, 15.27, 15.96, 13.49, 7.90, 18.64, 2…
#> $ emp_length <dbl> 10, 0, 10, 10, 3, 9, 4, 0, 5, 10, 0, 3, 3,…
#> $ home_ownership <chr> "RENT", "RENT", "RENT", "RENT", "RENT", "R…
#> $ annual_inc <dbl> 24000.00, 30000.00, 12252.00, 49200.00, 36…
#> $ purpose <chr> "credit_card", "car", "small_business", "o…
#> $ addr_state <chr> "AZ", "GA", "IL", "CA", "AZ", "CA", "CA", …
#> $ dti <dbl> 27.65, 1.00, 8.72, 20.00, 11.20, 5.35, 5.5…
#> $ delinq_2yrs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ revol_util <dbl> 83.70, 9.40, 98.50, 21.00, 28.30, 87.50, 3…
#> $ total_acc <dbl> 9, 4, 10, 37, 12, 4, 13, 3, 23, 34, 9, 11,…
#> $ bad_loan <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, …
#> $ longest_credit_length <dbl> 26, 12, 10, 15, 7, 4, 7, 7, 13, 22, 7, 8, …
#> $ verification_status <chr> "verified", "verified", "not verified", "v…
#> $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
# Upload the data to H2O and convert the response and the column that will
# be target-encoded into factors.
df <- as.h2o(loan)
df$bad_loan <- as.factor(df$bad_loan)
df$addr_state <- as.factor(df$addr_state)

# Split the frame into a training part (ratio 0.75) and a testing part.
splits <- h2o.splitFrame(
  df,
  ratios = 0.75,
  destination_frames = c("train.hex", "test.hex"),
  seed = 1234
)
train <- splits[[1]]
test <- splits[[2]]

# Response column and candidate predictor columns.
response <- "bad_loan"
predictors <- c(
  "loan_amnt", "int_rate", "emp_length", "annual_inc", "dti",
  "delinq_2yrs", "revol_util", "total_acc", "longest_credit_length",
  "verification_status", "term", "purpose", "home_ownership",
  "addr_state"
)

# Add a 5-fold assignment column to the training frame, then build the
# target-encoding map for addr_state using that fold column.
train$fold <- h2o.kfold_column(train, 5, seed = 1234)
te_map <- h2o.target_encode_create(
  train,
  x = list("addr_state"),
  y = response,
  fold_column = "fold"
)
# Apply the encoding map to the training frame with K-fold holdout on the
# "fold" column, blended averaging switched on and no added noise.
ext_train <- h2o.target_encode_apply(
  train,
  x = list("addr_state"),
  y = response,
  target_encode_map = te_map,
  holdout_type = "KFold",
  fold_column = "fold",
  blended_avg = TRUE,
  noise_level = 0,
  seed = 1234
)
#> Warning in h2o.target_encode_apply(train, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Encode the test frame with no holdout. The fold column name is passed here
# as well; the row counts printed below are identical before and after.
ext_test <- h2o.target_encode_apply(
  test,
  x = list("addr_state"),
  y = response,
  target_encode_map = te_map,
  holdout_type = "None",
  fold_column = "fold",
  blended_avg = FALSE,
  noise_level = 0
)
#> Warning in h2o.target_encode_apply(test, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Compare the row count of the test frame before and after encoding.
nrow.H2OFrame(test)
#> [1] 40925
nrow.H2OFrame(ext_test)
#> [1] 40925
由 reprex 包 (v0.2.1) 创建于 2019-03-21。
我正在按照以下步骤使用目标编码:Target Encoding
编辑:示例代码
请注意,应用目标编码之后,测试数据集的行数从约 4 万增加到了约 20 万。另外,从示例输出中可以看到,ID 2320 重复出现了 5 次。
# Attach the h2o package and initialise the connection to an H2O cluster.
library(h2o)
h2o.init()
#> Connection successful!
#>
#> R is connected to the H2O cluster:
# Read the loan data from CSV with readr; the column types it inferred are
# listed in the parse message that follows.
loan <- readr::read_csv("/loan.csv")
#> Parsed with column specification:
#> cols(
#> loan_amnt = col_integer(),
#> term = col_character(),
#> int_rate = col_double(),
#> emp_length = col_integer(),
#> home_ownership = col_character(),
#> annual_inc = col_double(),
#> purpose = col_character(),
#> addr_state = col_character(),
#> dti = col_double(),
#> delinq_2yrs = col_integer(),
#> revol_util = col_double(),
#> total_acc = col_integer(),
#> bad_loan = col_integer(),
#> longest_credit_length = col_integer(),
#> verification_status = col_character()
#> )
# Add a 1-based integer row identifier so individual rows can be traced
# through the later steps. seq_len() is used rather than seq.int() because
# seq.int(0) evaluates to c(1, 0), which would break on a zero-row input,
# whereas seq_len(0) is an empty integer vector.
loan$ID <- seq_len(nrow(loan))
# Print a compact, one-line-per-column overview of the data.
dplyr::glimpse(loan)
#> Observations: 163,987
#> Variables: 16
#> $ loan_amnt <int> 5000, 2500, 2400, 10000, 5000, 3000, 560...
#> $ term <chr> "36 months", "60 months", "36 months", "...
#> $ int_rate <dbl> 10.65, 15.27, 15.96, 13.49, 7.90, 18.64,...
#> $ emp_length <int> 10, 0, 10, 10, 3, 9, 4, 0, 5, 10, 0, 3, ...
#> $ home_ownership <chr> "RENT", "RENT", "RENT", "RENT", "RENT", ...
#> $ annual_inc <dbl> 24000.00, 30000.00, 12252.00, 49200.00, ...
#> $ purpose <chr> "credit_card", "car", "small_business", ...
#> $ addr_state <chr> "AZ", "GA", "IL", "CA", "AZ", "CA", "CA"...
#> $ dti <dbl> 27.65, 1.00, 8.72, 20.00, 11.20, 5.35, 5...
#> $ delinq_2yrs <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
#> $ revol_util <dbl> 83.70, 9.40, 98.50, 21.00, 28.30, 87.50,...
#> $ total_acc <int> 9, 4, 10, 37, 12, 4, 13, 3, 23, 34, 9, 1...
#> $ bad_loan <int> 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0...
#> $ longest_credit_length <int> 26, 12, 10, 15, 7, 4, 7, 7, 13, 22, 7, 8...
#> $ verification_status <chr> "verified", "verified", "not verified", ...
#> $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
# Upload the data to H2O and convert the response and the column that will
# be target-encoded into factors.
df <- as.h2o(loan)
df$bad_loan <- as.factor(df$bad_loan)
df$addr_state <- as.factor(df$addr_state)

# Split the frame into a training part (ratio 0.75) and a testing part.
splits <- h2o.splitFrame(
  df,
  ratios = 0.75,
  destination_frames = c("train.hex", "test.hex"),
  seed = 1234
)
train <- splits[[1]]
test <- splits[[2]]

# Response column and candidate predictor columns.
response <- "bad_loan"
predictors <- c(
  "loan_amnt", "int_rate", "emp_length", "annual_inc", "dti",
  "delinq_2yrs", "revol_util", "total_acc", "longest_credit_length",
  "verification_status", "term", "purpose", "home_ownership",
  "addr_state"
)

# Add a 5-fold assignment column to the training frame, then build the
# target-encoding map for addr_state using that fold column. The printed head
# shows one numerator/denominator row per (addr_state, fold) pair.
train$fold <- h2o.kfold_column(train, 5, seed = 1234)
te_map <- h2o.target_encode_create(
  train,
  x = list("addr_state"),
  y = response,
  fold_column = "fold"
)
head(te_map$addr_state)
#> addr_state fold numerator denominator
#> 1 AK 0 7 52
#> 2 AK 1 8 55
#> 3 AK 2 7 56
#> 4 AK 3 13 68
#> 5 AK 4 8 70
#> 6 AL 0 57 297
# Apply the encoding map to the training frame with K-fold holdout on the
# "fold" column, blended averaging switched on and no added noise.
ext_train <- h2o.target_encode_apply(
  train,
  x = list("addr_state"),
  y = response,
  target_encode_map = te_map,
  holdout_type = "KFold",
  fold_column = "fold",
  blended_avg = TRUE,
  noise_level = 0,
  seed = 1234
)
#> Warning in h2o.target_encode_apply(train, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Inspect the encoded column next to its source column and the fold id.
head(ext_train[c("addr_state", "fold", "TargetEncode_addr_state")])
#> addr_state fold TargetEncode_addr_state
#> 1 AK 0 0.1445783
#> 2 AK 0 0.1445783
#> 3 AK 0 0.1445783
#> 4 AK 0 0.1445783
#> 5 AK 0 0.1445783
#> 6 AK 0 0.1445783
# Row count of the test frame before encoding.
nrow.H2OFrame(test)
#> [1] 40925
# NOTE(review): the encoding map printed earlier holds one row per
# (addr_state, fold) pair, but this call uses holdout_type = "None" with the
# fold_column argument commented out. The result below has 204614 rows (about
# five times the 40925 input rows) and shows ID 2320 once for each fold value
# 0-4, i.e. every test row appears once per fold entry of its addr_state.
# The script further down in this file that passes fold_column = "fold" to
# the same call reports 40925 rows. Presumably the function needs the fold
# column name to collapse the per-fold map first -- confirm in the h2o docs.
ext_test <- h2o.target_encode_apply(test, x = list("addr_state"), y = response,
target_encode_map = te_map, holdout_type = "None",
#fold_column = "fold",
blended_avg = FALSE, noise_level = 0)
#> Warning in h2o.target_encode_apply(test, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Row count after encoding, followed by the first rows of the encoded frame.
nrow.H2OFrame(ext_test)
#> [1] 204614
head(ext_test)
#> addr_state loan_amnt int_rate emp_length annual_inc dti delinq_2yrs
#> 1 AK 14000 12.42 9 72000 19.80 0
#> 2 AK 14000 12.42 9 72000 19.80 0
#> 3 AK 14000 12.42 9 72000 19.80 0
#> 4 AK 14000 12.42 9 72000 19.80 0
#> 5 AK 14000 12.42 9 72000 19.80 0
#> 6 AK 16000 7.90 3 35500 6.59 0
#> revol_util total_acc bad_loan longest_credit_length ID fold
#> 1 74.6 26 0 17 2320 0
#> 2 74.6 26 0 17 2320 1
#> 3 74.6 26 0 17 2320 2
#> 4 74.6 26 0 17 2320 3
#> 5 74.6 26 0 17 2320 4
#> 6 18.1 26 0 14 2574 0
#> TargetEncode_addr_state
#> 1 0.1346154
#> 2 0.1454545
#> 3 0.1250000
#> 4 0.1911765
#> 5 0.1142857
#> 6 0.1346154
由 reprex 包 (v0.2.0) 创建于 2019-03-14。
以下代码在我这里可以正常运行,没有出现重复行。与您发布的代码相比,唯一的主要区别是我取消了 fold_column = "fold" 这一行的注释:
# Attach the h2o package and initialise the connection to an H2O cluster.
library(h2o)
h2o.init()
# Read the loan data from CSV with readr; the column types it inferred are
# listed in the parse message that follows.
loan <- readr::read_csv("loan.csv")
#> Parsed with column specification:
#> cols(
#> loan_amnt = col_double(),
#> term = col_character(),
#> int_rate = col_double(),
#> emp_length = col_double(),
#> home_ownership = col_character(),
#> annual_inc = col_double(),
#> purpose = col_character(),
#> addr_state = col_character(),
#> dti = col_double(),
#> delinq_2yrs = col_double(),
#> revol_util = col_double(),
#> total_acc = col_double(),
#> bad_loan = col_double(),
#> longest_credit_length = col_double(),
#> verification_status = col_character()
#> )
# Add a 1-based integer row identifier so individual rows can be traced
# through the later steps. seq_len() is used rather than seq.int() because
# seq.int(0) evaluates to c(1, 0), which would break on a zero-row input,
# whereas seq_len(0) is an empty integer vector.
loan$ID <- seq_len(nrow(loan))
# Print a compact, one-line-per-column overview of the data.
dplyr::glimpse(loan)
#> Observations: 163,987
#> Variables: 16
#> $ loan_amnt <dbl> 5000, 2500, 2400, 10000, 5000, 3000, 5600,…
#> $ term <chr> "36 months", "60 months", "36 months", "36…
#> $ int_rate <dbl> 10.65, 15.27, 15.96, 13.49, 7.90, 18.64, 2…
#> $ emp_length <dbl> 10, 0, 10, 10, 3, 9, 4, 0, 5, 10, 0, 3, 3,…
#> $ home_ownership <chr> "RENT", "RENT", "RENT", "RENT", "RENT", "R…
#> $ annual_inc <dbl> 24000.00, 30000.00, 12252.00, 49200.00, 36…
#> $ purpose <chr> "credit_card", "car", "small_business", "o…
#> $ addr_state <chr> "AZ", "GA", "IL", "CA", "AZ", "CA", "CA", …
#> $ dti <dbl> 27.65, 1.00, 8.72, 20.00, 11.20, 5.35, 5.5…
#> $ delinq_2yrs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ revol_util <dbl> 83.70, 9.40, 98.50, 21.00, 28.30, 87.50, 3…
#> $ total_acc <dbl> 9, 4, 10, 37, 12, 4, 13, 3, 23, 34, 9, 11,…
#> $ bad_loan <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, …
#> $ longest_credit_length <dbl> 26, 12, 10, 15, 7, 4, 7, 7, 13, 22, 7, 8, …
#> $ verification_status <chr> "verified", "verified", "not verified", "v…
#> $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
# Upload the data to H2O and convert the response and the column that will
# be target-encoded into factors.
df <- as.h2o(loan)
df$bad_loan <- as.factor(df$bad_loan)
df$addr_state <- as.factor(df$addr_state)

# Split the frame into a training part (ratio 0.75) and a testing part.
splits <- h2o.splitFrame(
  df,
  ratios = 0.75,
  destination_frames = c("train.hex", "test.hex"),
  seed = 1234
)
train <- splits[[1]]
test <- splits[[2]]

# Response column and candidate predictor columns.
response <- "bad_loan"
predictors <- c(
  "loan_amnt", "int_rate", "emp_length", "annual_inc", "dti",
  "delinq_2yrs", "revol_util", "total_acc", "longest_credit_length",
  "verification_status", "term", "purpose", "home_ownership",
  "addr_state"
)

# Add a 5-fold assignment column to the training frame, then build the
# target-encoding map for addr_state using that fold column.
train$fold <- h2o.kfold_column(train, 5, seed = 1234)
te_map <- h2o.target_encode_create(
  train,
  x = list("addr_state"),
  y = response,
  fold_column = "fold"
)
# Apply the encoding map to the training frame with K-fold holdout on the
# "fold" column, blended averaging switched on and no added noise.
ext_train <- h2o.target_encode_apply(
  train,
  x = list("addr_state"),
  y = response,
  target_encode_map = te_map,
  holdout_type = "KFold",
  fold_column = "fold",
  blended_avg = TRUE,
  noise_level = 0,
  seed = 1234
)
#> Warning in h2o.target_encode_apply(train, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Encode the test frame with no holdout. The fold column name is passed here
# as well; the row counts printed below are identical before and after.
ext_test <- h2o.target_encode_apply(
  test,
  x = list("addr_state"),
  y = response,
  target_encode_map = te_map,
  holdout_type = "None",
  fold_column = "fold",
  blended_avg = FALSE,
  noise_level = 0
)
#> Warning in h2o.target_encode_apply(test, x = list("addr_state"),
#> y = response, : The string columns: term, home_ownership, purpose,
#> verification_status were dropped from the dataset
# Compare the row count of the test frame before and after encoding.
nrow.H2OFrame(test)
#> [1] 40925
nrow.H2OFrame(ext_test)
#> [1] 40925
由 reprex 包 (v0.2.1) 创建于 2019-03-21。