验证数据的 WOE 转换
WOE transformation for validation data
我正在使用逻辑回归和 XGBoost 建模,并且已经把所有变量都转换成了 WOE(证据权重)。
这是为训练数据集完成的。
现在我想根据我的验证和样本外测试数据来验证我的模型。
WOE 的生成方式是:先用 Hmisc::cut2 函数对变量分箱,然后应用 InformationValue::WOE:
# Bin the training MAILING_DAYS values with explicit cut points 16, 23 and 27.
# NOTE(review): the levels printed later in this post follow the explicit cuts
# rather than 20 groups, so `g = 20` looks unused here - confirm against the
# Hmisc::cut2 documentation before removing it.
data.work[["MAILING_DAYS"]] <- cut2(
  training[["MAILING_DAYS"]],
  g = 20,
  cuts = c(16, 23, 27)
)
# Apply InformationValue::WOE to the binned variable against SUCCESS
# (SUCCESS == 1 is the "good" outcome) and store the result in data.work.woe.
data.work.woe[["MAILING_DAYS"]] <- WOE(
  data.work[["MAILING_DAYS"]],
  data.work[["SUCCESS"]],
  valueOfGood = 1
)
可以得到的信息如下:
# Per-bin summary of the binned MAILING_DAYS against SUCCESS: good/bad counts,
# their shares, and the WOE and IV of each bin (SUCCESS == 1 counts as "good").
WOETable(
  data.work[["MAILING_DAYS"]],
  data.work[["SUCCESS"]],
  valueOfGood = 1
)
CAT GOODS BADS TOTAL PCT_G PCT_B WOE IV
1 [ 0,16) 4827 89389 94216 0.58844325 0.4983581 0.16616157 0.014968688
2 [16,23) 1750 41383 43133 0.21333658 0.2307169 -0.07832034 0.001361233
3 [23,27) 987 27323 28310 0.12032183 0.1523301 -0.23588003 0.007550120
4 [27,30] 639 21272 21911 0.07789833 0.1185948 -0.42030843 0.017105085
# List the bin labels (factor levels) of the binned MAILING_DAYS column.
levels(data.work$MAILING_DAYS)
[1] "[ 0,16)" "[16,23)" "[23,27)" "[27,30]"
我试过类似的东西:
# Build a rule table from the WOE bins computed on the training data.
#
# Each row of `WOE` describes one bin of one variable:
#   NAME  - column name in `data.work`
#   COND  - comparison to apply ("<", ">=" or "==")
#   VALUE - numeric threshold parsed from the bin label (NA for "missing")
#   WOE   - WOE value reported by WOETable() for that bin
WOE <- data.frame(NAME = character(),
                  COND = character(),
                  VALUE = numeric(),
                  WOE = numeric(),
                  stringsAsFactors = FALSE)
a <- names(data.work)
WOE.CAT <- c()
WOE.WOE <- c()
k <- 1  # index of the next row to write into `WOE`
for (i in c(4:4)) {  # only the 4th column of data.work is processed here
  temp.var <- a[i]
  # Compute the table once per variable and reuse it for labels and values.
  woe.table <- WOETable(data.work[, temp.var], data.work$SUCCESS,
                        valueOfGood = 1)
  WOE.CAT <- woe.table$CAT
  WOE.WOE <- woe.table$WOE
  n.cat <- length(WOE.CAT)
  # NOTE(review): the loop starts at the second category and the final
  # category never matches a branch below. For a variable with the four bins
  # shown in this post, only "[16,23)" (COND "<", VALUE 23) and "[23,27)"
  # (COND ">=", VALUE 23) produce rows. Confirm whether the first and last
  # bins are meant to be skipped before using this table for scoring.
  # seq_len(n.cat)[-1] is 2..n.cat, and empty when there are fewer than two
  # categories (2:length(x) would count backwards in that case).
  for (j in seq_len(n.cat)[-1]) {
    cat.label <- WOE.CAT[j]
    # Position of the first comma in the label, or -1 when there is none.
    comma.pos <- as.integer(regexpr(",", cat.label, fixed = TRUE))
    rule.cond <- NULL
    rule.value <- NA_real_
    if (cat.label == "missing") {
      # Checked first: "missing" contains no comma, so it would otherwise be
      # swallowed by the no-comma branch and never reach its own rule.
      rule.cond <- "=="
    } else if (comma.pos == -1) {
      # Label without a comma: use the next label, read as a number.
      rule.cond <- "<"
      rule.value <- as.numeric(WOE.CAT[j + 1])
    } else if (j < n.cat - 1) {
      # Interval label: take the text between the comma and the closing
      # bracket, i.e. the upper bound.
      rule.cond <- "<"
      rule.value <- as.numeric(
        substr(cat.label, comma.pos + 1, nchar(cat.label) - 1)
      )
    } else if (j == n.cat - 1) {
      # Second-to-last label: take the text between the opening bracket and
      # the comma, i.e. the lower bound.
      rule.cond <- ">="
      rule.value <- as.numeric(substr(cat.label, 2, comma.pos - 1))
    }
    if (!is.null(rule.cond)) {
      WOE[k, "NAME"] <- temp.var
      WOE[k, "COND"] <- rule.cond
      WOE[k, "VALUE"] <- rule.value
      WOE[k, "WOE"] <- WOE.WOE[j]
      k <- k + 1
    }
  }
}
应该有办法把在训练数据上得到的 WOE 映射应用到验证数据上,对吗?
笨办法是逐个变量手写 if / else if……
但我有超过 250 个特征,这样做会花费很多时间!
非常感谢您的帮助
终于找到办法了:
scorecard 包很有帮助。在我的例子中,我在 scorecard::woebin 中通过 breaks_list 参数指定了分箱切点,
然后用 scorecard::woebin_ply 生成 WOE 转换后的数据。
我正在使用逻辑回归和 XGBoost 建模,并且已经把所有变量都转换成了 WOE(证据权重)。
这是为训练数据集完成的。
现在我想根据我的验证和样本外测试数据来验证我的模型。
WOE 的生成方式是:先用 Hmisc::cut2 函数对变量分箱,然后应用 InformationValue::WOE:
# Bin the training MAILING_DAYS values with explicit cut points 16, 23 and 27.
# NOTE(review): the levels printed later in this post follow the explicit cuts
# rather than 20 groups, so `g = 20` looks unused here - confirm against the
# Hmisc::cut2 documentation before removing it.
data.work[["MAILING_DAYS"]] <- cut2(
  training[["MAILING_DAYS"]],
  g = 20,
  cuts = c(16, 23, 27)
)
# Apply InformationValue::WOE to the binned variable against SUCCESS
# (SUCCESS == 1 is the "good" outcome) and store the result in data.work.woe.
data.work.woe[["MAILING_DAYS"]] <- WOE(
  data.work[["MAILING_DAYS"]],
  data.work[["SUCCESS"]],
  valueOfGood = 1
)
可以得到的信息如下:
# Per-bin summary of the binned MAILING_DAYS against SUCCESS: good/bad counts,
# their shares, and the WOE and IV of each bin (SUCCESS == 1 counts as "good").
WOETable(
  data.work[["MAILING_DAYS"]],
  data.work[["SUCCESS"]],
  valueOfGood = 1
)
CAT GOODS BADS TOTAL PCT_G PCT_B WOE IV
1 [ 0,16) 4827 89389 94216 0.58844325 0.4983581 0.16616157 0.014968688
2 [16,23) 1750 41383 43133 0.21333658 0.2307169 -0.07832034 0.001361233
3 [23,27) 987 27323 28310 0.12032183 0.1523301 -0.23588003 0.007550120
4 [27,30] 639 21272 21911 0.07789833 0.1185948 -0.42030843 0.017105085
# List the bin labels (factor levels) of the binned MAILING_DAYS column.
levels(data.work$MAILING_DAYS)
[1] "[ 0,16)" "[16,23)" "[23,27)" "[27,30]"
我试过类似的东西:
# Build a rule table from the WOE bins computed on the training data.
#
# Each row of `WOE` describes one bin of one variable:
#   NAME  - column name in `data.work`
#   COND  - comparison to apply ("<", ">=" or "==")
#   VALUE - numeric threshold parsed from the bin label (NA for "missing")
#   WOE   - WOE value reported by WOETable() for that bin
WOE <- data.frame(NAME = character(),
                  COND = character(),
                  VALUE = numeric(),
                  WOE = numeric(),
                  stringsAsFactors = FALSE)
a <- names(data.work)
WOE.CAT <- c()
WOE.WOE <- c()
k <- 1  # index of the next row to write into `WOE`
for (i in c(4:4)) {  # only the 4th column of data.work is processed here
  temp.var <- a[i]
  # Compute the table once per variable and reuse it for labels and values.
  woe.table <- WOETable(data.work[, temp.var], data.work$SUCCESS,
                        valueOfGood = 1)
  WOE.CAT <- woe.table$CAT
  WOE.WOE <- woe.table$WOE
  n.cat <- length(WOE.CAT)
  # NOTE(review): the loop starts at the second category and the final
  # category never matches a branch below. For a variable with the four bins
  # shown in this post, only "[16,23)" (COND "<", VALUE 23) and "[23,27)"
  # (COND ">=", VALUE 23) produce rows. Confirm whether the first and last
  # bins are meant to be skipped before using this table for scoring.
  # seq_len(n.cat)[-1] is 2..n.cat, and empty when there are fewer than two
  # categories (2:length(x) would count backwards in that case).
  for (j in seq_len(n.cat)[-1]) {
    cat.label <- WOE.CAT[j]
    # Position of the first comma in the label, or -1 when there is none.
    comma.pos <- as.integer(regexpr(",", cat.label, fixed = TRUE))
    rule.cond <- NULL
    rule.value <- NA_real_
    if (cat.label == "missing") {
      # Checked first: "missing" contains no comma, so it would otherwise be
      # swallowed by the no-comma branch and never reach its own rule.
      rule.cond <- "=="
    } else if (comma.pos == -1) {
      # Label without a comma: use the next label, read as a number.
      rule.cond <- "<"
      rule.value <- as.numeric(WOE.CAT[j + 1])
    } else if (j < n.cat - 1) {
      # Interval label: take the text between the comma and the closing
      # bracket, i.e. the upper bound.
      rule.cond <- "<"
      rule.value <- as.numeric(
        substr(cat.label, comma.pos + 1, nchar(cat.label) - 1)
      )
    } else if (j == n.cat - 1) {
      # Second-to-last label: take the text between the opening bracket and
      # the comma, i.e. the lower bound.
      rule.cond <- ">="
      rule.value <- as.numeric(substr(cat.label, 2, comma.pos - 1))
    }
    if (!is.null(rule.cond)) {
      WOE[k, "NAME"] <- temp.var
      WOE[k, "COND"] <- rule.cond
      WOE[k, "VALUE"] <- rule.value
      WOE[k, "WOE"] <- WOE.WOE[j]
      k <- k + 1
    }
  }
}
应该有办法把在训练数据上得到的 WOE 映射应用到验证数据上,对吗?
笨办法是逐个变量手写 if / else if……
但我有超过 250 个特征,这样做会花费很多时间!
非常感谢您的帮助
终于找到办法了:
scorecard 包很有帮助。在我的例子中,我在 scorecard::woebin 中通过 breaks_list 参数指定了分箱切点,
然后用 scorecard::woebin_ply 生成 WOE 转换后的数据。