Credit Risk Analysis_Feature selection_Duplicate values in Information Value and weight of evidence table python
WOE and IV are important concepts in credit risk analysis, used to find out which features are relevant for predicting whether a person is a likely loan defaulter. Suppose I have people described by some variables (maybe age, income, past credit score, etc.) and I need to predict, from past data, which of them are likely to default. Now I have 1000 variables, and not all of them will be equally important for predicting loan default. So WOE and IV help me find which of all these variables I should use in a model. Below is a sample df and a generic function that finds WOE and IV for any df. My doubt is given below, after the output.
X_train (the independent variables in the credit risk analysis)
Var1 Var2 ............. Var1000
30 Unknown ............. 80000
33 Success ............. 90000
45 Failure ............. 899900
y_train (the dependent variable in the credit risk analysis, indicating whether the customer defaulted)
0
1
0
0...and so on
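For intuition before the full function: per bin, WOE = ln(DIST_EVENT / DIST_NON_EVENT), the log ratio of the bin's share of all events (defaulters) to its share of all non-events, and IV sums (DIST_EVENT - DIST_NON_EVENT) * WOE over a variable's bins. Here is a minimal hand computation on hypothetical counts (toy numbers, not the data above), using the same event-over-non-event convention as the function below:

import numpy as np
import pandas as pd

# hypothetical per-bin counts for one variable (toy data, not from the question)
bins = pd.DataFrame({
    "BIN": ["success", "failure"],
    "EVENT": [80, 20],       # defaulters in each bin
    "NONEVENT": [100, 300],  # non-defaulters in each bin
})

# each bin's share of all events / all non-events
bins["DIST_EVENT"] = bins.EVENT / bins.EVENT.sum()
bins["DIST_NON_EVENT"] = bins.NONEVENT / bins.NONEVENT.sum()

# WOE is computed per bin; IV adds up one contribution per bin
bins["WOE"] = np.log(bins.DIST_EVENT / bins.DIST_NON_EVENT)
iv = ((bins.DIST_EVENT - bins.DIST_NON_EVENT) * bins.WOE).sum()

print(bins[["BIN", "WOE"]])   # one WOE per bin
print("IV:", round(iv, 4))    # one IV for the whole variable (~1.3667 here)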
Code to find the most relevant and important variables from a dataframe (WOE and IV):
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n=max_bin):

    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    # keep reducing the number of quantile bins until the per-bucket means of
    # X and Y are perfectly monotonic (|Spearman r| == 1)
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception as e:
            n = n - 1

    # if everything collapsed into a single bucket, force a fixed number of bins
    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins), include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)

    # per-bucket event/non-event counts
    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)

    # missing values get their own bucket
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    # WOE per bucket; IV is summed over buckets and repeated on every row
    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT - d3.DIST_NON_EVENT) * np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return d3


def char_bin(Y, X):

    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]

    # one bucket per distinct category
    df2 = notmiss.groupby('X', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y

    # missing values get their own bucket
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT - d3.DIST_NON_EVENT) * np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)

    return d3


def data_vars(df1, target):

    # recover the name of the target variable from the calling line of code
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]

    x = df1.dtypes.index
    count = -1

    for i in x:
        # skip the target column itself
        if i.upper() not in (final.upper()):
            # numeric columns with more than 2 distinct values get monotonic
            # binning; everything else is treated as categorical
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1

            # stack the per-variable tables into one big frame
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv, ignore_index=True)

    # one IV per variable (IV is constant across a variable's rows)
    iv = pd.DataFrame({'IV': iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return iv_df, iv
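One caveat if you run this today: the snippet targets an older pandas. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, pandas.core.algorithms.quantile is a private API that newer releases no longer expose, and scipy.stats.stats is deprecated in favor of scipy.stats. A minimal sketch of plausible drop-in replacements, on toy frames and assuming pandas >= 2.0 (whether np.quantile's default linear interpolation matches the old private helper exactly is an assumption worth verifying):

import numpy as np
import pandas as pd

# toy stand-ins for the d3/d4 frames built inside mono_bin / char_bin
d3 = pd.DataFrame({'COUNT': [10, 12]})
d4 = pd.DataFrame({'COUNT': [3]})

# d3.append(d4, ignore_index=True) was removed in pandas 2.0; use concat:
d3 = pd.concat([d3, d4], ignore_index=True)

# algos.quantile(notmiss.X, np.linspace(0, 1, n)) relies on a private pandas
# API; NumPy's public quantile gives evenly spaced bin edges the same way:
x = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
bins = np.quantile(x, np.linspace(0, 1, 3))  # -> array([1., 3., 5.])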
Calling the function:
final_iv, IV = data_vars(X_train, y_train)
Output:
VAR_NAME MIN_VALUE MAX_VALUE COUNT EVENT EVENT_RATE NONEVENT NON_EVENT_RATE DIST_EVENT DIST_NON_EVENT WOE IV
0 Var1 19 39 2290 259 0.113100 2031 0.886900 0.497121 0.50775 -0.021156 0.000452
1 Var1 40 87 2231 262 0.117436 1969 0.882564 0.502879 0.49225 0.021363 0.000452
64 Var2 failure failure 490 63 0.128571 427 0.871429 0.120921 0.10675 0.124650 0.461890
65 Var2 other other 197 38 0.192893 159 0.807107 0.072937 0.03975 0.606982 0.461890
66 Var2 success success 129 83 0.643411 46 0.356589 0.159309 0.01150 2.628499 0.461890
67 Var2 unknown unknown 3705 337 0.090958 3368 0.909042 0.646833 0.84200 -0.263692 0.461890
and so on....
If you look, the same variable Var1 has 2 rows and the Var2 variable has 4 rows. I am not able to understand the meaning of multiple rows for the same variable.
What is this data_vars function trying to do?
The results are concatenated because some features have different WOE scores across their distribution: for example, Var1's WOE differs between [19,39] and [40,87]. mono_bin splits a numeric variable into bins and char_bin emits one row per category, so each row carries that bin's WOE, while the IV column holds the variable-level sum (d3.IV = d3.IV.sum()) repeated on every row, which is why both Var1 rows show IV = 0.000452. Whether this kind of binning makes sense for a categorical variable like Var2 is another matter.
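For the feature-selection step itself, a common rule of thumb from credit-scoring practice (e.g., Siddiqi's guidelines) reads the variable-level IV as: below 0.02 not predictive, 0.02 to 0.1 weak, 0.1 to 0.3 medium, 0.3 to 0.5 strong, above 0.5 suspiciously strong. With the second frame returned by data_vars, selection is then a one-liner; the 0.02 cutoff below is that rule of thumb, not something the code enforces:

# IV comes from the call above: final_iv, IV = data_vars(X_train, y_train)
selected = IV[IV.IV > 0.02].sort_values('IV', ascending=False)
print(selected)
# in the sample output, Var2 (IV ~ 0.46, "strong") survives,
# while Var1 (IV ~ 0.0005) is dropped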