Python 递归问题(决策树)
Python Recursion Issue (Decision Tree)
谁能帮我理解为什么我的递归函数不起作用?我创建了一个函数来计算要拆分的特征的信息增益,然后在我的决策树函数中调用它。然而,决策树函数似乎只迭代一次。
IG函数为:
# create function to split dataset on maximum gain ratio
def gain_ratio_split(features, target):
    """Score every sorted head/tail split of every feature column by gain ratio.

    For each column of ``features`` the column is joined to ``target`` on the
    index, sorted by the feature value, and cut after the first ``j`` rows for
    every ``j`` in ``range(len(rows))``.

    Returns a 2-tuple taken from the best-scoring ``(column_name, gainratio)``
    entry: its column name and its gain ratio.
    """
    from operator import itemgetter
    import numpy as np
    # entropy of the target labels before any split
    target_ent = -sum((target.value_counts()/len(target)) * np.log2((target.value_counts()/len(target))))
    gainratios = []
    num_cols = features.shape[1]
    for i in range(num_cols):
        column_name = features.columns[i]
        single_column_df = pd.DataFrame(features[column_name])
        # pair the feature with the target (joined on index), ordered by the feature value
        duo_column_df = single_column_df.merge(target, left_index=True, right_index=True).sort_values(by=column_name)
        for j in range(len(duo_column_df)):
            # first j rows vs. the remaining rows
            # NOTE(review): for j == 0 sub_df_1 is empty, so len(sub_df_1) is 0 in the divisions below
            sub_df_1 = duo_column_df.head(j)
            sub_df_2 = duo_column_df.tail(len(duo_column_df)-j)
            # entropy of the labels (second column) in each subset
            sub_df_1_ent = -sum(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1) * np.log2(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1)))
            sub_df_2_ent = -sum(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2) * np.log2(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2)))
            # information gain: parent entropy minus the size-weighted subset entropies
            gain = target_ent - ((len(sub_df_1)/len(duo_column_df)) * sub_df_1_ent) - ((len(sub_df_2)/len(duo_column_df)) * sub_df_2_ent)
            # split info: entropy of the subset-size proportions
            splitinfo = -( (len(sub_df_1)/len(duo_column_df)) * np.log2( (len(sub_df_1)/len(duo_column_df)) )) - \
                ( (len(sub_df_2)/len(duo_column_df)) * np.log2( (len(sub_df_2)/len(duo_column_df)) ))
            # nan_to_num replaces a NaN ratio with 0
            gainratio = np.nan_to_num(gain / splitinfo)
            gainratios.append((column_name, gainratio))
    # NOTE(review): max() raises ValueError when gainratios is empty, i.e. when
    # the loops above never appended anything (no columns or no rows)
    split_col = max(gainratios,key=itemgetter(1))[0]
    # NOTE(review): element [1] of each tuple is the gain ratio itself, not a
    # value of the feature, so split_val is a score rather than a threshold
    split_val = max(gainratios,key=itemgetter(1))[1]
    return split_col, split_val
决策树函数为:
import warnings
warnings.filterwarnings('ignore')
def dec_t(features, target, depth = 0):
    """Recursively split ``features`` and collect the results of the sub-calls.

    Returns the list ``edge``: it holds the return values of the two recursive
    calls when a split is attempted and stays empty otherwise.
    """
    # NOTE(review): this overwrites the ``depth`` argument, so the value passed
    # by the caller (including the recursive ``depth + 1``) is discarded
    depth = 0
    edge = []
    num_feats = features.shape[1]
    # recurse only while more than one feature column is left
    if depth <= 4 and num_feats > 1:
        # parent[0] is the column to split on, parent[1] the second value
        # returned by gain_ratio_split
        # NOTE(review): with the two-value gain_ratio_split, parent[1] is a gain
        # ratio, yet it is compared against feature values below
        parent = gain_ratio_split(features, target)
        child1 = features[features[parent[0]] >= parent[1]]
        child2 = features[features[parent[0]] < parent[1]]
        # create new dataset to feed into recursive loop (removing feature previously used for splitting)
        c1 = child1.loc[:, child1.columns != parent[0]]
        c2 = child2.loc[:, child2.columns != parent[0]]
        # get appropriate target values for new dataset (last column after joining on the index)
        tar1 = pd.DataFrame(c1.merge(target, left_index=True, right_index=True).iloc[:,-1])
        tar2 = pd.DataFrame(c2.merge(target, left_index=True, right_index=True).iloc[:,-1])
        # begin recursion on left and right edges of node
        left = dec_t(c1, tar1, depth + 1)
        right = dec_t(c2, tar2, depth + 1)
        # append the sub-call results to the list so the tree can be printed later
        edge.append(left)
        edge.append(right)
        # NOTE(review): this increment happens after both recursive calls and the
        # local value is not read again
        depth += 1
    return edge
如果我调用决策树函数 dec_t(X_train_norm, y_train),要么返回一个空列表(edge),要么在启用深度计数器时(目前它的写法有误,但我可以自己解决)收到以下错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-193-a790156c1013> in <module>()
----> 1 dec_t(X_train_norm, y_train)
5 frames
<ipython-input-9-0a75ea1fa270> in gain_ratio_split(features, target)
23 gainratios.append((column_name, gainratio))
24
---> 25 split_col = max(gainratios,key=itemgetter(1))[0]
26 split_val = max(gainratios,key=itemgetter(1))[1]
27
ValueError: max() arg is an empty sequence
我已经用当前的数据集手动计算了 IG 函数的所有可能输出,其中没有一个返回空值。任何帮助将不胜感激。
经过一番努力,我设法解决了这个问题。我修改了增益比分割函数,使其同时返回用于分割特征的中点值;另外,针对得到空数据帧的情况,我加入了异常处理。学习如何添加异常处理是一次很好的学习经历。
# create function to split dataset on maximum gain ratio
# logic for this function derived from explanation of algorithm at following link
# https://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/
def gain_ratio_split(features, target):
    """Find the feature and threshold with the highest gain ratio.

    Each column of ``features`` is joined to ``target`` on the index and
    sorted by the feature value. Every cut between two *different*
    neighbouring values is a candidate; its threshold is the midpoint of the
    two values.

    Args:
        features: DataFrame of numeric feature columns.
        target: named Series or single-column DataFrame of class labels,
            sharing its index with ``features``.

    Returns:
        ``(column_name, mp_val, gainratio)`` for the best candidate (the first
        one on ties), or ``(None, None, None)`` when no candidate exists
        (no rows, no columns, or every column is constant).

    Assumes the feature values contain no NaN -- TODO confirm against callers.
    """
    def _entropy(labels):
        # Shannon entropy (base 2) of the label distribution; zero-count
        # categories are dropped so that 0 * log2(0) cannot produce NaN.
        probs = labels.value_counts() / len(labels)
        probs = probs[probs > 0]
        return float(-(probs * np.log2(probs)).sum())

    # entropy of the target labels before any split
    target_ent = _entropy(target)
    best = (None, None, None)

    for column_name in features.columns:
        # pair the feature with the labels and order the rows by feature value
        duo_column_df = (
            features[[column_name]]
            .merge(target, left_index=True, right_index=True)
            .sort_values(by=column_name)
        )
        n_rows = len(duo_column_df)
        # positional access: column 0 is the feature, column 1 the labels
        values = duo_column_df.iloc[:, 0]
        labels = duo_column_df.iloc[:, 1]

        # j is the size of the lower subset; starting at 1 and stopping at
        # n_rows - 1 keeps both subsets non-empty
        for j in range(1, n_rows):
            # rows are sorted, so these are max(lower subset) / min(upper subset)
            lower_max = values.iloc[j - 1]
            upper_min = values.iloc[j]
            # A threshold between equal values cannot be reproduced by a
            # ">= / <" comparison on the feature, so it is not a real split.
            if lower_max == upper_min:
                continue
            mp_val = (lower_max + upper_min) / 2

            lower_weight = j / n_rows
            upper_weight = (n_rows - j) / n_rows
            # information gain: parent entropy minus size-weighted child entropies
            gain = (
                target_ent
                - lower_weight * _entropy(labels.iloc[:j])
                - upper_weight * _entropy(labels.iloc[j:])
            )
            # split info normalises the gain (removes the bias towards uneven splits)
            splitinfo = -(lower_weight * np.log2(lower_weight)) - (
                upper_weight * np.log2(upper_weight)
            )
            gainratio = np.nan_to_num(gain / splitinfo)

            # strict ">" keeps the first candidate on ties, as max() would
            if best[0] is None or gainratio > best[2]:
                best = (column_name, mp_val, gainratio)

    return best
实际的决策树构建函数也已更新:改为使用调用方传入的 depth(不再在函数内部将其重置为 0),并如上所述改为在中点值上进行拆分。
def dec_t(features, target, depth = 0, stop = 6):
    """Recursively build a decision tree as nested lists.

    Args:
        features: DataFrame of feature columns still available for splitting.
        target: labels sharing their index with ``features``.
        depth: current recursion depth, used exactly as passed by the caller.
        stop: depth at which recursion ends and the node is classified.

    Returns:
        Whatever ``leaf_classifier`` returns for a leaf; otherwise a list whose
        first element is ``(column_name, mp_val)``, followed by the left and
        right sub-trees when the two differ.
    """
    data = features.merge(target, left_index=True, right_index=True)
    # presumably truthy when the node holds a single class -- confirm against node_impurity
    is_pure = node_impurity(data)

    # Leaf: pure node, depth budget used up, or nothing left to split on.
    if (is_pure == True) or (depth == stop) or (features.empty):
        return leaf_classifier(data)

    column_name, mp_val, _gainratio = gain_ratio_split(features, target)
    # No split could be proposed, so classify the node as it stands.
    if column_name is None:
        return leaf_classifier(data)

    # The node records the feature and threshold it splits on.
    node = [(column_name, mp_val)]
    next_depth = depth + 1

    # The split feature is dropped from both children, so each level of the
    # tree picks a new feature instead of re-splitting the same one at another
    # value (see https://machinelearningmastery.com/rfe-feature-selection-in-python/).
    feature_values = features[column_name]
    rows_at_or_above = features[feature_values >= mp_val]
    rows_below = features[feature_values < mp_val]
    child1 = rows_at_or_above.drop(columns=column_name)
    child2 = rows_below.drop(columns=column_name)

    # Matching labels for each child: last column after joining on the index.
    joined1 = child1.merge(target, left_index=True, right_index=True)
    joined2 = child2.merge(target, left_index=True, right_index=True)
    tar1 = pd.DataFrame(joined1.iloc[:, -1])
    tar2 = pd.DataFrame(joined2.iloc[:, -1])

    left = dec_t(child1, tar1, depth=next_depth, stop=stop)
    right = dec_t(child2, tar2, depth=next_depth, stop=stop)
    # Identical sub-trees are not recorded; the node then holds only its split.
    if left != right:
        node.extend([left, right])
    return node
谁能帮我理解为什么我的递归函数不起作用?我创建了一个函数来计算要拆分的特征的信息增益,然后在我的决策树函数中调用它。然而,决策树函数似乎只迭代一次。
IG函数为:
# create function to split dataset on maximum gain ratio
def gain_ratio_split(features, target):
    """Score every sorted head/tail split of every feature column by gain ratio.

    For each column of ``features`` the column is joined to ``target`` on the
    index, sorted by the feature value, and cut after the first ``j`` rows for
    every ``j`` in ``range(len(rows))``.

    Returns a 2-tuple taken from the best-scoring ``(column_name, gainratio)``
    entry: its column name and its gain ratio.
    """
    from operator import itemgetter
    import numpy as np
    # entropy of the target labels before any split
    target_ent = -sum((target.value_counts()/len(target)) * np.log2((target.value_counts()/len(target))))
    gainratios = []
    num_cols = features.shape[1]
    for i in range(num_cols):
        column_name = features.columns[i]
        single_column_df = pd.DataFrame(features[column_name])
        # pair the feature with the target (joined on index), ordered by the feature value
        duo_column_df = single_column_df.merge(target, left_index=True, right_index=True).sort_values(by=column_name)
        for j in range(len(duo_column_df)):
            # first j rows vs. the remaining rows
            # NOTE(review): for j == 0 sub_df_1 is empty, so len(sub_df_1) is 0 in the divisions below
            sub_df_1 = duo_column_df.head(j)
            sub_df_2 = duo_column_df.tail(len(duo_column_df)-j)
            # entropy of the labels (second column) in each subset
            sub_df_1_ent = -sum(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1) * np.log2(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1)))
            sub_df_2_ent = -sum(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2) * np.log2(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2)))
            # information gain: parent entropy minus the size-weighted subset entropies
            gain = target_ent - ((len(sub_df_1)/len(duo_column_df)) * sub_df_1_ent) - ((len(sub_df_2)/len(duo_column_df)) * sub_df_2_ent)
            # split info: entropy of the subset-size proportions
            splitinfo = -( (len(sub_df_1)/len(duo_column_df)) * np.log2( (len(sub_df_1)/len(duo_column_df)) )) - \
                ( (len(sub_df_2)/len(duo_column_df)) * np.log2( (len(sub_df_2)/len(duo_column_df)) ))
            # nan_to_num replaces a NaN ratio with 0
            gainratio = np.nan_to_num(gain / splitinfo)
            gainratios.append((column_name, gainratio))
    # NOTE(review): max() raises ValueError when gainratios is empty, i.e. when
    # the loops above never appended anything (no columns or no rows)
    split_col = max(gainratios,key=itemgetter(1))[0]
    # NOTE(review): element [1] of each tuple is the gain ratio itself, not a
    # value of the feature, so split_val is a score rather than a threshold
    split_val = max(gainratios,key=itemgetter(1))[1]
    return split_col, split_val
决策树函数为:
import warnings
warnings.filterwarnings('ignore')
def dec_t(features, target, depth = 0):
    """Recursively split ``features`` and collect the results of the sub-calls.

    Returns the list ``edge``: it holds the return values of the two recursive
    calls when a split is attempted and stays empty otherwise.
    """
    # NOTE(review): this overwrites the ``depth`` argument, so the value passed
    # by the caller (including the recursive ``depth + 1``) is discarded
    depth = 0
    edge = []
    num_feats = features.shape[1]
    # recurse only while more than one feature column is left
    if depth <= 4 and num_feats > 1:
        # parent[0] is the column to split on, parent[1] the second value
        # returned by gain_ratio_split
        # NOTE(review): with the two-value gain_ratio_split, parent[1] is a gain
        # ratio, yet it is compared against feature values below
        parent = gain_ratio_split(features, target)
        child1 = features[features[parent[0]] >= parent[1]]
        child2 = features[features[parent[0]] < parent[1]]
        # create new dataset to feed into recursive loop (removing feature previously used for splitting)
        c1 = child1.loc[:, child1.columns != parent[0]]
        c2 = child2.loc[:, child2.columns != parent[0]]
        # get appropriate target values for new dataset (last column after joining on the index)
        tar1 = pd.DataFrame(c1.merge(target, left_index=True, right_index=True).iloc[:,-1])
        tar2 = pd.DataFrame(c2.merge(target, left_index=True, right_index=True).iloc[:,-1])
        # begin recursion on left and right edges of node
        left = dec_t(c1, tar1, depth + 1)
        right = dec_t(c2, tar2, depth + 1)
        # append the sub-call results to the list so the tree can be printed later
        edge.append(left)
        edge.append(right)
        # NOTE(review): this increment happens after both recursive calls and the
        # local value is not read again
        depth += 1
    return edge
如果我调用决策树函数 dec_t(X_train_norm, y_train),要么返回一个空列表(edge),要么在启用深度计数器时(目前它的写法有误,但我可以自己解决)收到以下错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-193-a790156c1013> in <module>()
----> 1 dec_t(X_train_norm, y_train)
5 frames
<ipython-input-9-0a75ea1fa270> in gain_ratio_split(features, target)
23 gainratios.append((column_name, gainratio))
24
---> 25 split_col = max(gainratios,key=itemgetter(1))[0]
26 split_val = max(gainratios,key=itemgetter(1))[1]
27
ValueError: max() arg is an empty sequence
我已经用当前的数据集手动计算了 IG 函数的所有可能输出,其中没有一个返回空值。任何帮助将不胜感激。
经过一番努力,我设法解决了这个问题。我修改了增益比分割函数,使其同时返回用于分割特征的中点值;另外,针对得到空数据帧的情况,我加入了异常处理。学习如何添加异常处理是一次很好的学习经历。
# create function to split dataset on maximum gain ratio
# logic for this function derived from explanation of algorithm at following link
# https://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/
def gain_ratio_split(features, target):
    """Find the feature and threshold with the highest gain ratio.

    Each column of ``features`` is joined to ``target`` on the index and
    sorted by the feature value. Every cut between two *different*
    neighbouring values is a candidate; its threshold is the midpoint of the
    two values.

    Args:
        features: DataFrame of numeric feature columns.
        target: named Series or single-column DataFrame of class labels,
            sharing its index with ``features``.

    Returns:
        ``(column_name, mp_val, gainratio)`` for the best candidate (the first
        one on ties), or ``(None, None, None)`` when no candidate exists
        (no rows, no columns, or every column is constant).

    Assumes the feature values contain no NaN -- TODO confirm against callers.
    """
    def _entropy(labels):
        # Shannon entropy (base 2) of the label distribution; zero-count
        # categories are dropped so that 0 * log2(0) cannot produce NaN.
        probs = labels.value_counts() / len(labels)
        probs = probs[probs > 0]
        return float(-(probs * np.log2(probs)).sum())

    # entropy of the target labels before any split
    target_ent = _entropy(target)
    best = (None, None, None)

    for column_name in features.columns:
        # pair the feature with the labels and order the rows by feature value
        duo_column_df = (
            features[[column_name]]
            .merge(target, left_index=True, right_index=True)
            .sort_values(by=column_name)
        )
        n_rows = len(duo_column_df)
        # positional access: column 0 is the feature, column 1 the labels
        values = duo_column_df.iloc[:, 0]
        labels = duo_column_df.iloc[:, 1]

        # j is the size of the lower subset; starting at 1 and stopping at
        # n_rows - 1 keeps both subsets non-empty
        for j in range(1, n_rows):
            # rows are sorted, so these are max(lower subset) / min(upper subset)
            lower_max = values.iloc[j - 1]
            upper_min = values.iloc[j]
            # A threshold between equal values cannot be reproduced by a
            # ">= / <" comparison on the feature, so it is not a real split.
            if lower_max == upper_min:
                continue
            mp_val = (lower_max + upper_min) / 2

            lower_weight = j / n_rows
            upper_weight = (n_rows - j) / n_rows
            # information gain: parent entropy minus size-weighted child entropies
            gain = (
                target_ent
                - lower_weight * _entropy(labels.iloc[:j])
                - upper_weight * _entropy(labels.iloc[j:])
            )
            # split info normalises the gain (removes the bias towards uneven splits)
            splitinfo = -(lower_weight * np.log2(lower_weight)) - (
                upper_weight * np.log2(upper_weight)
            )
            gainratio = np.nan_to_num(gain / splitinfo)

            # strict ">" keeps the first candidate on ties, as max() would
            if best[0] is None or gainratio > best[2]:
                best = (column_name, mp_val, gainratio)

    return best
实际的决策树构建函数也已更新:改为使用调用方传入的 depth(不再在函数内部将其重置为 0),并如上所述改为在中点值上进行拆分。
def dec_t(features, target, depth = 0, stop = 6):
    """Recursively build a decision tree as nested lists.

    Args:
        features: DataFrame of feature columns still available for splitting.
        target: labels sharing their index with ``features``.
        depth: current recursion depth, used exactly as passed by the caller.
        stop: depth at which recursion ends and the node is classified.

    Returns:
        Whatever ``leaf_classifier`` returns for a leaf; otherwise a list whose
        first element is ``(column_name, mp_val)``, followed by the left and
        right sub-trees when the two differ.
    """
    data = features.merge(target, left_index=True, right_index=True)
    # presumably truthy when the node holds a single class -- confirm against node_impurity
    is_pure = node_impurity(data)

    # Leaf: pure node, depth budget used up, or nothing left to split on.
    if (is_pure == True) or (depth == stop) or (features.empty):
        return leaf_classifier(data)

    column_name, mp_val, _gainratio = gain_ratio_split(features, target)
    # No split could be proposed, so classify the node as it stands.
    if column_name is None:
        return leaf_classifier(data)

    # The node records the feature and threshold it splits on.
    node = [(column_name, mp_val)]
    next_depth = depth + 1

    # The split feature is dropped from both children, so each level of the
    # tree picks a new feature instead of re-splitting the same one at another
    # value (see https://machinelearningmastery.com/rfe-feature-selection-in-python/).
    feature_values = features[column_name]
    rows_at_or_above = features[feature_values >= mp_val]
    rows_below = features[feature_values < mp_val]
    child1 = rows_at_or_above.drop(columns=column_name)
    child2 = rows_below.drop(columns=column_name)

    # Matching labels for each child: last column after joining on the index.
    joined1 = child1.merge(target, left_index=True, right_index=True)
    joined2 = child2.merge(target, left_index=True, right_index=True)
    tar1 = pd.DataFrame(joined1.iloc[:, -1])
    tar2 = pd.DataFrame(joined2.iloc[:, -1])

    left = dec_t(child1, tar1, depth=next_depth, stop=stop)
    right = dec_t(child2, tar2, depth=next_depth, stop=stop)
    # Identical sub-trees are not recorded; the node then holds only its split.
    if left != right:
        node.extend([left, right])
    return node