在 sklearn DecisionTreeClassifier 中修剪不必要的叶子
Prune unnecessary leaves in sklearn DecisionTreeClassifier
我使用 sklearn.tree.DecisionTreeClassifier 构建决策树。在最优参数设置下,我得到了一棵带有不必要叶子的树(参见下面的示例图片——我不需要概率,所以用红色标记的叶子节点是不必要的分裂)。
有没有第三方库可以剪掉这些不需要的节点?还是代码片段?我可以写一个,但我真的无法想象我是第一个遇到这个问题的人...
要复制的代码:
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
# Load the classic iris dataset (150 samples, 4 features, 3 classes).
iris = datasets.load_iris()
X = iris.data
y = iris.target
# max_leaf_nodes=8 caps the tree at 8 leaves, so the builder grows until
# that limit — which can leave sibling leaves sharing the same majority
# class (the "unnecessary" splits the question is about).
mdl = DecisionTreeClassifier(max_leaf_nodes=8)
mdl.fit(X,y)
PS:我尝试了多个关键字搜索,但我很惊讶什么也没找到 - sklearn 中真的没有 post- 剪枝吗?
PPS:针对可能的重复项:虽然那个问题可能会在我自己编写修剪算法时有所帮助,但它回答的是另一个问题——我想去掉的是那些不改变最终决定的叶子,而另一个问题想要的是分裂节点的最小样本阈值。
PPPS:显示的树只是用来说明我的问题的示例。我知道创建这棵树的参数设置并非最优。我不是在问如何优化这棵特定的树,而是需要进行后剪枝(post-pruning)来去掉那些叶子——如果需要类别概率,这些叶子可能有用;但如果只关心最可能的类别,它们就毫无帮助。
使用 ncfirth 的 link,我能够修改那里的代码以使其适合我的问题:
from sklearn.tree._tree import TREE_LEAF
def is_leaf(inner_tree, index):
    """Return True when node `index` of a sklearn Tree has no children,
    i.e. both child pointers hold the TREE_LEAF sentinel."""
    no_left_child = inner_tree.children_left[index] == TREE_LEAF
    no_right_child = inner_tree.children_right[index] == TREE_LEAF
    return no_left_child and no_right_child
def prune_index(inner_tree, decisions, index=0):
    """Recursively collapse splits whose two leaf children predict the
    same class as the parent node.

    Works bottom-up (post-order): pruning a subtree can turn an inner
    node into a leaf, so a top-down pass would miss newly formed leaf
    pairs.  Do not use this directly - use prune_duplicate_leaves instead.

    inner_tree: the sklearn Tree object (mdl.tree_), modified in place.
    decisions:  majority-class index of every node, as a flat list.
    index:      node to start from (0 = root).
    """
    if not is_leaf(inner_tree, inner_tree.children_left[index]):
        prune_index(inner_tree, decisions, inner_tree.children_left[index])
    if not is_leaf(inner_tree, inner_tree.children_right[index]):
        prune_index(inner_tree, decisions, inner_tree.children_right[index])
    # Prune children if both children are leaves now and make the same decision:
    if (is_leaf(inner_tree, inner_tree.children_left[index]) and
            is_leaf(inner_tree, inner_tree.children_right[index]) and
            (decisions[index] == decisions[inner_tree.children_left[index]]) and
            (decisions[index] == decisions[inner_tree.children_right[index]])):
        # Turn the node into a leaf by "unlinking" its children.
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
        # Bug fix (matches the later revision of this answer): also clear
        # the split feature, otherwise exporters such as plot_tree or
        # export_text still render the now-dead comparison on this node.
        from sklearn.tree._tree import TREE_UNDEFINED
        inner_tree.feature[index] = TREE_UNDEFINED
def prune_duplicate_leaves(mdl):
    """Collapse sibling leaves that predict the same class as their parent.

    mdl is a fitted DecisionTreeClassifier; its tree_ is modified in place.
    """
    tree = mdl.tree_
    # argmax over the class axis of tree.value gives each node's
    # predicted (majority) class.
    node_classes = tree.value.argmax(axis=2).flatten().tolist()
    prune_index(tree, node_classes)
在 DecisionTreeClassifier clf 上使用它:
prune_duplicate_leaves(clf)
编辑:修复了更复杂树的错误
DecisionTreeClassifier(max_leaf_nodes=8)
指定(最大)8 个叶子,因此除非树构建器有其他原因停止,否则它将达到最大值。
在所示示例中,与其他 3 个叶子 (>50) 相比,8 个叶子中的 5 个具有非常少量的样本 (<=3),这可能是过度拟合的迹象。
可以指定 min_samples_leaf
或 min_samples_split
来更好地指导训练,而不是在训练后修剪树,这很可能会去除有问题的叶子。例如,对至少 5% 的样本使用值 0.05
。
我这里贴的代码有问题所以我修改了它并不得不添加一小部分(它处理了双方相同但仍然存在比较的情况):
from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED
def is_leaf(inner_tree, index):
    """Return True when node `index` of a sklearn Tree has no children,
    i.e. both child pointers hold the TREE_LEAF sentinel."""
    no_left_child = inner_tree.children_left[index] == TREE_LEAF
    no_right_child = inner_tree.children_right[index] == TREE_LEAF
    return no_left_child and no_right_child
def prune_index(inner_tree, decisions, index=0):
    """Bottom-up pass that merges sibling leaves agreeing with their parent.

    Helper for prune_duplicate_leaves - do not call directly.  Mutates
    inner_tree (a sklearn Tree, i.e. mdl.tree_) in place.  The traversal
    is post-order on purpose: a prune further down can create a fresh
    pair of leaves that this node must then re-check.
    """
    left = inner_tree.children_left[index]
    right = inner_tree.children_right[index]
    # Descend into any child that is still an internal node.
    if not is_leaf(inner_tree, left):
        prune_index(inner_tree, decisions, left)
    if not is_leaf(inner_tree, right):
        prune_index(inner_tree, decisions, right)
    # Re-check leaf status: the recursive calls may have collapsed a child.
    both_leaves = is_leaf(inner_tree, left) and is_leaf(inner_tree, right)
    if both_leaves and decisions[index] == decisions[left] == decisions[right]:
        # Detach both children and wipe the split feature, turning this
        # node into a true leaf that no longer carries a comparison.
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
        inner_tree.feature[index] = TREE_UNDEFINED
def prune_duplicate_leaves(mdl):
    """Collapse sibling leaves that predict the same class as their parent.

    mdl is a fitted DecisionTreeClassifier; its tree_ is modified in place.
    """
    tree = mdl.tree_
    # argmax over the class axis of tree.value gives each node's
    # predicted (majority) class.
    node_classes = tree.value.argmax(axis=2).flatten().tolist()
    prune_index(tree, node_classes)
我使用 sklearn.tree.DecisionTreeClassifier 构建决策树。在最优参数设置下,我得到了一棵带有不必要叶子的树(参见下面的示例图片——我不需要概率,所以用红色标记的叶子节点是不必要的分裂)。
有没有第三方库可以剪掉这些不需要的节点?还是代码片段?我可以写一个,但我真的无法想象我是第一个遇到这个问题的人...
要复制的代码:
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
# Load the classic iris dataset (150 samples, 4 features, 3 classes).
iris = datasets.load_iris()
X = iris.data
y = iris.target
# max_leaf_nodes=8 caps the tree at 8 leaves, so the builder grows until
# that limit — which can leave sibling leaves sharing the same majority
# class (the "unnecessary" splits the question is about).
mdl = DecisionTreeClassifier(max_leaf_nodes=8)
mdl.fit(X,y)
PS:我尝试了多个关键字搜索,但我很惊讶什么也没找到 - sklearn 中真的没有 post- 剪枝吗?
PPS:针对可能的重复项:虽然那个问题可能会在我自己编写修剪算法时有所帮助,但它回答的是另一个问题——我想去掉的是那些不改变最终决定的叶子,而另一个问题想要的是分裂节点的最小样本阈值。
PPPS:显示的树只是用来说明我的问题的示例。我知道创建这棵树的参数设置并非最优。我不是在问如何优化这棵特定的树,而是需要进行后剪枝(post-pruning)来去掉那些叶子——如果需要类别概率,这些叶子可能有用;但如果只关心最可能的类别,它们就毫无帮助。
使用 ncfirth 的 link,我能够修改那里的代码以使其适合我的问题:
from sklearn.tree._tree import TREE_LEAF
def is_leaf(inner_tree, index):
    """Return True when node `index` of a sklearn Tree has no children,
    i.e. both child pointers hold the TREE_LEAF sentinel."""
    no_left_child = inner_tree.children_left[index] == TREE_LEAF
    no_right_child = inner_tree.children_right[index] == TREE_LEAF
    return no_left_child and no_right_child
def prune_index(inner_tree, decisions, index=0):
    """Recursively collapse splits whose two leaf children predict the
    same class as the parent node.

    Works bottom-up (post-order): pruning a subtree can turn an inner
    node into a leaf, so a top-down pass would miss newly formed leaf
    pairs.  Do not use this directly - use prune_duplicate_leaves instead.

    inner_tree: the sklearn Tree object (mdl.tree_), modified in place.
    decisions:  majority-class index of every node, as a flat list.
    index:      node to start from (0 = root).
    """
    if not is_leaf(inner_tree, inner_tree.children_left[index]):
        prune_index(inner_tree, decisions, inner_tree.children_left[index])
    if not is_leaf(inner_tree, inner_tree.children_right[index]):
        prune_index(inner_tree, decisions, inner_tree.children_right[index])
    # Prune children if both children are leaves now and make the same decision:
    if (is_leaf(inner_tree, inner_tree.children_left[index]) and
            is_leaf(inner_tree, inner_tree.children_right[index]) and
            (decisions[index] == decisions[inner_tree.children_left[index]]) and
            (decisions[index] == decisions[inner_tree.children_right[index]])):
        # Turn the node into a leaf by "unlinking" its children.
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
        # Bug fix (matches the later revision of this answer): also clear
        # the split feature, otherwise exporters such as plot_tree or
        # export_text still render the now-dead comparison on this node.
        from sklearn.tree._tree import TREE_UNDEFINED
        inner_tree.feature[index] = TREE_UNDEFINED
def prune_duplicate_leaves(mdl):
    """Collapse sibling leaves that predict the same class as their parent.

    mdl is a fitted DecisionTreeClassifier; its tree_ is modified in place.
    """
    tree = mdl.tree_
    # argmax over the class axis of tree.value gives each node's
    # predicted (majority) class.
    node_classes = tree.value.argmax(axis=2).flatten().tolist()
    prune_index(tree, node_classes)
在 DecisionTreeClassifier clf 上使用它:
prune_duplicate_leaves(clf)
编辑:修复了更复杂树的错误
DecisionTreeClassifier(max_leaf_nodes=8)
指定(最大)8 个叶子,因此除非树构建器有其他原因停止,否则它将达到最大值。
在所示示例中,与其他 3 个叶子 (>50) 相比,8 个叶子中的 5 个具有非常少量的样本 (<=3),这可能是过度拟合的迹象。
可以指定 min_samples_leaf
或 min_samples_split
来更好地指导训练,而不是在训练后修剪树,这很可能会去除有问题的叶子。例如,对至少 5% 的样本使用值 0.05
。
我这里贴的代码有问题所以我修改了它并不得不添加一小部分(它处理了双方相同但仍然存在比较的情况):
from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED
def is_leaf(inner_tree, index):
    """Return True when node `index` of a sklearn Tree has no children,
    i.e. both child pointers hold the TREE_LEAF sentinel."""
    no_left_child = inner_tree.children_left[index] == TREE_LEAF
    no_right_child = inner_tree.children_right[index] == TREE_LEAF
    return no_left_child and no_right_child
def prune_index(inner_tree, decisions, index=0):
    """Bottom-up pass that merges sibling leaves agreeing with their parent.

    Helper for prune_duplicate_leaves - do not call directly.  Mutates
    inner_tree (a sklearn Tree, i.e. mdl.tree_) in place.  The traversal
    is post-order on purpose: a prune further down can create a fresh
    pair of leaves that this node must then re-check.
    """
    left = inner_tree.children_left[index]
    right = inner_tree.children_right[index]
    # Descend into any child that is still an internal node.
    if not is_leaf(inner_tree, left):
        prune_index(inner_tree, decisions, left)
    if not is_leaf(inner_tree, right):
        prune_index(inner_tree, decisions, right)
    # Re-check leaf status: the recursive calls may have collapsed a child.
    both_leaves = is_leaf(inner_tree, left) and is_leaf(inner_tree, right)
    if both_leaves and decisions[index] == decisions[left] == decisions[right]:
        # Detach both children and wipe the split feature, turning this
        # node into a true leaf that no longer carries a comparison.
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
        inner_tree.feature[index] = TREE_UNDEFINED
def prune_duplicate_leaves(mdl):
    """Collapse sibling leaves that predict the same class as their parent.

    mdl is a fitted DecisionTreeClassifier; its tree_ is modified in place.
    """
    tree = mdl.tree_
    # argmax over the class axis of tree.value gives each node's
    # predicted (majority) class.
    node_classes = tree.value.argmax(axis=2).flatten().tolist()
    prune_index(tree, node_classes)