Python3: shap tree explainer: Exception ignored in: 'array_dealloc'
I'm running xgboost for machine learning, and after successfully completing my machine learning with XGBClassifier I want to plot the results.
A minimal working example of the input data in JSON format:
[{"age":58,"Deceased":"False","sex":"False"},{"Deceased":"False","age":59,"sex":"False"},{"sex":"False","age":"68","Deceased":"False"},{"Deceased":"False","age":"26","sex":"False"},{"Deceased":"False","age":87,"sex":"False"},{"sex":"True","age":31,"Deceased":"False"},{"Deceased":"False","age":"35","sex":"False"},{"sex":"False","Deceased":"False","age":41},{"age":"78","Deceased":"False","sex":"True"},{"Deceased":"False","age":"45","sex":"True"},{"sex":"False","age":56,"Deceased":"False"},{"sex":"False","Deceased":"False","age":"26"},{"sex":"True","age":"64","Deceased":"False"},{"sex":"False","age":"37","Deceased":"False"},{"age":"86","Deceased":"True","sex":"False"},{"age":76,"Deceased":"True","sex":"True"},{"Deceased":"True","age":69,"sex":"False"},{"Deceased":"True","age":79,"sex":"True"}]
Following the advice at https://evgenypogorelov.com/multiclass-xgb-shap.html, my script:
import mlflow
import sys
import os
import json
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import xgboost
import shap
from sklearn.metrics import accuracy_score, precision_score, plot_roc_curve

def ref_to_json_file(data, filename):
    json1 = json.dumps(data)
    f = open(filename, "w+")
    print(json1, file=f)

def xgbclassifier_wrapper(json_file, dependent_var, output_stem):
    # https://xgboost.readthedocs.io/en/latest/parameter.html
    pandasDF = pd.read_json(json_file)
    bool_cols = ["Deceased", "sex"]  # , 'Hospitalized', 'Respiratory_Support', 'sex'
    for col in bool_cols:
        pandasDF[col] = pandasDF[col] == 'True'
    Y = pandasDF[dependent_var]
    X = pandasDF.drop([dependent_var], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    mlflow.sklearn.autolog()
    # With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged.
    with mlflow.start_run():
        # Set the model parameters.
        n_estimators = 200
        colsample_bytree = 0.3
        learning_rate = 0.05
        # default 6; max. depth of a tree. Increasing this value will make the model more complex and
        # more likely to overfit. 0 is only accepted in the lossguided growing policy when tree_method
        # is set to hist or gpu_hist, and it indicates no limit on depth. Beware that XGBoost
        # aggressively consumes memory when training a deep tree.
        max_depth = 6
        #min_child_rate = 0
        # default 0; minimum loss reduction required to make a further partition on a leaf node of the
        # tree. The larger gamma is, the more conservative the algorithm will be.
        gamma = 0
        # Create and train model.
        xg_clf = xgboost.XGBClassifier(n_estimators=n_estimators, colsample_bytree=colsample_bytree,
                                       learning_rate=learning_rate, max_depth=max_depth)
        xg_clf.fit(X_train, y_train)
        # Use the model to make predictions on the test dataset.
        predictions = xg_clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        pre_score = precision_score(y_test, predictions)
        feature_importances = pd.DataFrame(xg_clf.feature_importances_, index=X.columns, columns=['importance'])
        feature_importances.to_json("data/" + output_stem + '.feature_importances.json')
        kfold = KFold(n_splits=10)
        results = cross_val_score(xg_clf, X, Y, cv=kfold)
        accuracy = results.mean() * 100
        roc = plot_roc_curve(xg_clf, X_test, y_test, name=dependent_var)
        return accuracy

json_file = 'debug.json'  # "/home/con/covid_study2065/data/pat.data.array.json"
if not os.path.isfile(json_file):
    sys.exit("json file doesn't exist.")
deceased = xgbclassifier_wrapper(json_file, "Deceased", 'debug')

explainer = shap.TreeExplainer(deceased.xg_clf, model_output="raw", feature_perturbation="interventional", data=deceased.X)
explainer = shap.TreeExplainer(deceased.xg_clf, model_output="probability", feature_perturbation="interventional", data=deceased.X)
The error:
Exception ignored in: 'array_dealloc'
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 1353, in __init__
_cext.dense_tree_update_weights(
SystemError: <class 'DeprecationWarning'> returned a result with an error set
Found a NULL input array in _cext_dense_tree_update_weights!
Traceback (most recent call last):
File "debug.py", line 97, in <module>
explainer = shap.TreeExplainer(deceased.xg_clf, model_output = "probability", feature_perturbation="interventional", data = deceased.X)
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 147, in __init__
self.model = TreeEnsemble(model, self.data, self.data_missing, model_output)
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 827, in __init__
self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 1522, in get_trees
trees.append(SingleTree({
File "/usr/local/lib/python3.8/dist-packages/shap/explainers/_tree.py", line 1353, in __init__
_cext.dense_tree_update_weights(
SystemError: <built-in function dense_tree_update_weights> returned NULL without setting an error
When I look at deceased.xg_clf, which is what gets fed into shap.TreeExplainer:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Changing the inputs of the XGBClassifier to the same parameters the tutorial uses, i.e.

xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
                      gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
                      missing=None, n_jobs=-1, objective='binary:logistic', random_state=42,
                      reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto')

gives the same error as my parameters did.
I have no idea what is causing this error, and the message isn't helpful: I never did anything like array_dealloc, which I assume is a C-level thing. The error also occurs when doing a parameter grid_search.
I'm running Python 3.8.0 on Ubuntu 18.04 in a virtual machine, with shap 0.38.1. The error also occurs with Python 3.8.5 on Ubuntu 20.04.2 LTS (Focal Fossa), 64-bit, Linux kernel 5.8.0-44-generic x86_64.
Updating to shap version 0.39.0 didn't help. I tried updating to Python 3.8.8, but that made the situation worse, as one of shap's dependencies isn't compatible with that version:
Collecting slicer==0.0.7 (from shap)
Could not find a version that satisfies the requirement slicer==0.0.7 (from shap) (from versions: )
No matching distribution found for slicer==0.0.7 (from shap)
I opened an issue on their GitHub page: https://github.com/slundberg/shap/issues/1844
Also, my xgboost, numpy, and scipy versions are all up to date:
Requirement already up-to-date: xgboost in /usr/local/lib/python3.8/dist-packages (1.3.3)
Requirement already satisfied, skipping upgrade: numpy in /usr/local/lib/python3.8/dist-packages (from xgboost) (1.19.5)
Requirement already satisfied, skipping upgrade: scipy in /usr/local/lib/python3.8/dist-packages (from xgboost) (1.6.1)
How can I get the shap library to run? Or... is there a competitor to shap that I could use?
The solution was that the call to TreeExplainer was wrong; the problem is that the error message was less than awesome. The fix is to return the fitted model itself (not just an accuracy score) from the wrapper, and to call TreeExplainer on it without the data= and model_output= arguments:
import mlflow
import sys, os
import json
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import xgboost
import shap
from sklearn.metrics import accuracy_score, precision_score, plot_roc_curve

def ref_to_json_file(data, filename):
    json1 = json.dumps(data)
    f = open(filename, "w+")
    print(json1, file=f)

class xgb_result:
    def __init__(self, xgb_result, X_test):
        self.xgb_result = xgb_result
        self.X_test = X_test

def xgbclassifier_wrapper(json_file, dependent_var, output_stem):
    # https://xgboost.readthedocs.io/en/latest/parameter.html
    pandasDF = pd.read_json(json_file)
    bool_cols = ["Deceased", "sex"]  # , 'Hospitalized', 'Respiratory_Support', 'sex'
    for col in bool_cols:
        pandasDF[col] = pandasDF[col] == 'True'
    Y = pandasDF[dependent_var]
    X = pandasDF.drop([dependent_var], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    mlflow.sklearn.autolog()
    # With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged.
    with mlflow.start_run():
        # Set the model parameters.
        n_estimators = 200
        colsample_bytree = 0.3
        learning_rate = 0.05
        # default 6; max. depth of a tree. Increasing this value will make the model more complex and
        # more likely to overfit. 0 is only accepted in the lossguided growing policy when tree_method
        # is set to hist or gpu_hist, and it indicates no limit on depth. Beware that XGBoost
        # aggressively consumes memory when training a deep tree.
        max_depth = 6
        #min_child_rate = 0
        # default 0; minimum loss reduction required to make a further partition on a leaf node of the
        # tree. The larger gamma is, the more conservative the algorithm will be.
        gamma = 0
        # Create and train model.
        xg_clf = xgboost.XGBClassifier(n_estimators=n_estimators, colsample_bytree=colsample_bytree,
                                       learning_rate=learning_rate, max_depth=max_depth)
        xg_clf.fit(X_train, y_train)
        # Use the model to make predictions on the test dataset.
        predictions = xg_clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        pre_score = precision_score(y_test, predictions)
        feature_importances = pd.DataFrame(xg_clf.feature_importances_, index=X.columns, columns=['importance'])
        feature_importances.to_json("data/" + output_stem + '.feature_importances.json')
        kfold = KFold(n_splits=10)
        results = cross_val_score(xg_clf, X, Y, cv=kfold)
        accuracy = results.mean() * 100
        roc = plot_roc_curve(xg_clf, X_test, y_test, name=dependent_var)
        return_object = xgb_result(xg_clf, X_test)
        return return_object

json_file = 'debug.json'  # "/home/con/covid_study2065/data/pat.data.array.json"
if not os.path.isfile(json_file):
    sys.exit("json file doesn't exist.")
deceased = xgbclassifier_wrapper(json_file, "Deceased", 'debug')

shap_values = shap.TreeExplainer(deceased.xgb_result).shap_values(deceased.X_test)
shap_interaction_values = shap.TreeExplainer(deceased.xgb_result).shap_interaction_values(deceased.X_test)
#explainer = shap.TreeExplainer(deceased, model_output = "raw", feature_perturbation="interventional", data = deceased.X)
#explainer = shap.TreeExplainer(deceased.xg_clf, model_output = "probability", feature_perturbation="interventional", data = deceased.X)
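With the explainer working, the SHAP values can go straight into shap's plotting helpers. A minimal sketch of the plotting step, using the standard shap.summary_plot API (the output filename is just an example):

# Beeswarm-style overview of how age and sex drive the Deceased prediction.
shap.summary_plot(shap_values, deceased.X_test, show=False)
plt.savefig('debug.shap_summary.png', bbox_inches='tight')
plt.close()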