Python 递归问题（决策树）

Question

谁能帮我理解为什么我的递归函数不起作用？我创建了一个函数来计算要拆分的特征的信息增益，然后在我的决策树函数中调用它。然而，决策树函数似乎只迭代一次。

IG函数为：

# create function to split dataset on maximum gain ratio
def gain_ratio_split(features, target):
    from operator import itemgetter
    import numpy as np

    target_ent = -sum((target.value_counts()/len(target)) * np.log2((target.value_counts()/len(target))))

    gainratios = []
    num_cols = features.shape[1]

    for i in range(num_cols):
        column_name = features.columns[i]
        single_column_df = pd.DataFrame(features[column_name])
        duo_column_df = single_column_df.merge(target, left_index=True, right_index=True).sort_values(by=column_name)
        for j in range(len(duo_column_df)):
            sub_df_1 = duo_column_df.head(j)
            sub_df_2 = duo_column_df.tail(len(duo_column_df)-j)
            sub_df_1_ent = -sum(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1) * np.log2(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1)))
            sub_df_2_ent = -sum(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2) * np.log2(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2)))
            gain = target_ent - ((len(sub_df_1)/len(duo_column_df)) * sub_df_1_ent) - ((len(sub_df_2)/len(duo_column_df)) * sub_df_2_ent)
            splitinfo = -( (len(sub_df_1)/len(duo_column_df)) * np.log2( (len(sub_df_1)/len(duo_column_df)) )) - \
                         ( (len(sub_df_2)/len(duo_column_df)) * np.log2( (len(sub_df_2)/len(duo_column_df)) ))
            gainratio = np.nan_to_num(gain / splitinfo)
            gainratios.append((column_name, gainratio))

    split_col = max(gainratios,key=itemgetter(1))[0]
    split_val = max(gainratios,key=itemgetter(1))[1]

    return split_col, split_val

决策树函数为：

import warnings
warnings.filterwarnings('ignore')

def dec_t(features, target, depth = 0):
    depth = 0
    edge = []
    num_feats = features.shape[1]
    if depth <= 4 and num_feats > 1:
        parent = gain_ratio_split(features, target)
        child1 = features[features[parent[0]] >= parent[1]]
        child2 = features[features[parent[0]] < parent[1]]
        # create new dataset to feed into recursive loop (removing feature previously used for splitting)
        c1 = child1.loc[:, child1.columns != parent[0]]
        c2 = child2.loc[:, child2.columns != parent[0]]
        # get appropriate target values for new dataset
        tar1 = pd.DataFrame(c1.merge(target, left_index=True, right_index=True).iloc[:,-1])
        tar2 = pd.DataFrame(c2.merge(target, left_index=True, right_index=True).iloc[:,-1])
        # begin recursion on left and right edges of node        
        left = dec_t(c1, tar1, depth + 1)
        right = dec_t(c2, tar2, depth + 1)
        # append splitting values to list so I can print into a tree later
        edge.append(left)
        edge.append(right)

        depth += 1
        
    return edge

如果我调用决策树函数 dec_t(X_train_norm, y_train)，我要么得到一个空列表（边）returned，要么我使用深度计数器（目前这是错误的，但我可以弄清楚）我收到以下错误：

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-193-a790156c1013> in <module>()
----> 1 dec_t(X_train_norm, y_train)

5 frames
<ipython-input-9-0a75ea1fa270> in gain_ratio_split(features, target)
     23             gainratios.append((column_name, gainratio))
     24 
---> 25     split_col = max(gainratios,key=itemgetter(1))[0]
     26     split_val = max(gainratios,key=itemgetter(1))[1]
     27 

ValueError: max() arg is an empty sequence

我已经使用我正在使用的数据集手动计算了 IG 函数的所有可能输出，其中 none return 为空值。任何帮助将不胜感激。

Answer 1

经过一番努力，我设法解决了这个问题。我更改了增益比分割函数以包括用于分割特征的中点值。如果我得到空数据帧，我还包括异常。添加例外是一个很好的学习曲线。

# create function to split dataset on maximum gain ratio
# logic for this function derived from explanation of algorithm at following link
# https://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/
def gain_ratio_split(features, target):
    # calculate entropy of target labels before split
    target_ent = -sum((target.value_counts()/len(target)) * np.log2((target.value_counts()/len(target))))

    gainratios = []
    num_cols = features.shape[1]
    # get index of columns
    for i in range(num_cols):
        # get column name via its index
        column_name = features.columns[i]
        # call input dataframe again and get column to perform entropy calc on
        single_column_df = pd.DataFrame(features[column_name])
        # add target lables to dataframe to calucluate entropy
        duo_column_df = single_column_df.merge(target, left_index=True, right_index=True).sort_values(by=column_name)
        # get number of rows to create iterative sub frames to perform comnbined entropy over
        for j in range(len(duo_column_df)):
            # this call gets the number of rows starting at the beginning corresponding to interative index
            sub_df_1 = duo_column_df.head(j)
            # this call gets the remainder of the dataframe excluded from the previous call
            sub_df_2 = duo_column_df.tail(len(duo_column_df)-j)
            # sense check if either sub df is empty then pass as errors will occur
            if sub_df_1.empty or sub_df_2.empty:
                pass
            else:
                # find the mid point value between the two subsets (max of lower subset, min of upper subset)
                mp_val = ((sub_df_1.max(axis = 0)[0] + sub_df_2.min(axis = 0)[0]) / 2)  
                # calculate entropy for both sub frames
                sub_df_1_ent = -sum(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1) * np.log2(sub_df_1.iloc[:,1].value_counts()/len(sub_df_1)))
                sub_df_2_ent = -sum(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2) * np.log2(sub_df_2.iloc[:,1].value_counts()/len(sub_df_2)))
                # calculate information gain by from each of the sub frame splits
                gain = target_ent - ((len(sub_df_1)/len(duo_column_df)) * sub_df_1_ent) - ((len(sub_df_2)/len(duo_column_df)) * sub_df_2_ent)
                # calculate split info to normalise gain (remove bias)
                splitinfo = -( (len(sub_df_1)/len(duo_column_df)) * np.log2( (len(sub_df_1)/len(duo_column_df)) )) - \
                             ( (len(sub_df_2)/len(duo_column_df)) * np.log2( (len(sub_df_2)/len(duo_column_df)) ))
                # gain ratio calculation 
                gainratio = np.nan_to_num(gain / splitinfo)
                gainratios.append((column_name, mp_val, gainratio))
    # get feature, feature value and IG to perform split on by getting max gain ratio across all features
    try:
        out = max(gainratios,key=itemgetter(2))
    except:
        out = (None, None, None)

    return out

并且实际的决策和树构建函数已更新为设置深度 = 深度，我也更改为如上所述在中点值上拆分。

def dec_t(features, target, depth = 0, stop = 6):
    stop = stop
    target = target
    depth = depth  
    data = features.merge(target, left_index = True, right_index = True)
    n_imp = node_impurity(data)

    # check to see if we are at final recursion or if node is pure, if so classify node
    if (n_imp == True) or (depth == stop) or (features.empty):
        return leaf_classifier(data)
    # If the data is not pure or we are still within the max number of recursions:
    else:
        # call split function to get feature and value to split on
        column_name, mp_val, gainratio = gain_ratio_split(features, target)
        if column_name is None:
            return leaf_classifier(data)
        else:
            # create empy list to add feature & split values to design tree
            split = []
            split.append((column_name, mp_val))
            depth += 1
            # get child datasets after split
            # there is a question as to whether or not to remove the feature that the data is being split
            # on so it will not be available for the next iteration. If I do not drop them my algorthim
            # continuously uses the same feature but splits at different values, if I exclude my algorithm
            # choses a new feature each time. 
            # I am choosing to drop a feature at each recursion: https://machinelearningmastery.com/rfe-feature-selection-in-python/
            child1 = features[features[column_name] >= mp_val].drop(columns = column_name)
            child2 = features[features[column_name] < mp_val].drop(columns = column_name)
            # get apprpritate target values for new child datasets
            tar1 = pd.DataFrame(child1.merge(target, left_index=True, right_index=True).iloc[:,-1])
            tar2 = pd.DataFrame(child2.merge(target, left_index=True, right_index=True).iloc[:,-1])
            # begin recursion and append recursive output to edges dictionary to print tree
            left = dec_t(child1, tar1, depth = depth, stop = stop)
            right = dec_t(child2, tar2, depth = depth, stop = stop)
            if left != right:
                split.append(left)
                split.append(right)
            
            return split

Python 递归问题（决策树）

Python Recursion Issue (Decision Tree)

python

recursion

decision-tree