如何根据 TreeExplainer 的 SHAP 图表的 y 轴标签创建列表?
How to create a list with the y-axis labels of a TreeExplainer shap chart?
如何根据 TreeExplainer 的 SHAP 图表的 y 轴标签创建列表?
您好,
我已经能够生成一张图表,它在 y 轴上按重要性顺序排列我的变量。这是一种很好的图形化展示方式,但现在我需要按照变量在图表 y 轴上的排列顺序,提取出这个有序的变量列表。有谁知道如何做到这一点?我在这里放了一张示例图片。
备注:抱歉,我无法添加最小可重现示例。我不知道如何在此处粘贴 Jupyter Notebook 单元格,所以我在下方贴出了通过 GitHub 共享的代码的链接。
在此示例中,列表将是“vB0、mB1、vB1、mB2、mB0、vB2”。
## SHAP GRAPHIC
import pandas as pd
import seaborn as sns
import numpy as np # for sample data
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
import shap
from matplotlib import pyplot as plt
# Seed NumPy's global RNG so the synthetic data below is reproducible.
np.random.seed(1)
# Draw the raw sample data: one column of class labels in 1..5 and six
# feature columns of uniform random values in [0, 1).
cl = np.random.choice(range(1, 6), size=(100, 1))
d = np.random.random_sample(size=(100, 6))
# Place labels and features side by side and wrap them in a labelled DataFrame.
data = pd.DataFrame(
    np.hstack([cl, d]),
    columns=['classe', 'mB0', 'mB1', 'mB2', 'vB0', 'vB1', 'vB2'],
)
# Add an 'id' column that mirrors the sequential row index.
# source: https://pythonexamples.org/pandas-set-column-as-index/#:~:text=Pandas%20%E2%80%93%20Set%20Column%20as%20Index&text=To%20set%20a%20column%20as,index%2C%20to%20set_index()%20method.
data['id'] = data.index
# Target variable (y) is the class label; predictors (X) are every column
# except the label and the id.
y = data['classe']
X = data.drop(columns=['classe', 'id'])
# Hold out 30% of the rows for testing; stratify on y so that both splits keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
X_test.shape, y_test.shape  # notebook-style shape check; no effect when run as a script
##--------------------------- Random Forests Classification ---------------------------
# Baseline Random Forest classifier with 100 trees.
# The model is created and fitted exactly once: fitting a second, identical
# model under the same name would only overwrite this one and double the
# training time.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
# Predictions of the baseline model on the held-out test set.
y_rf_pred = rf.predict(X_test)
## Finding optimal parameters for this model
from sklearn.model_selection import RandomizedSearchCV
# Candidate numbers of trees: 10 evenly spaced values from 100 to 500.
n_estimators = [int(x) for x in np.linspace(start=100, stop=500, num=10)]
# Candidate settings for the number of features considered at every split.
# 'auto' is not listed: scikit-learn deprecated it in 1.1 and removed it in
# 1.3, and for classifiers it was only an alias of 'sqrt'. 'log2' keeps a
# second, genuinely different option in the grid.
max_features = ['sqrt', 'log2']
# Candidate tree depths: 11 evenly spaced values from 1 to 50, plus None.
max_depth = [int(x) for x in np.linspace(1, 50, num=11)]
max_depth.append(None)
# Parameter distributions sampled by the randomized search.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
}
# Randomized search: 100 sampled parameter combinations, each evaluated with
# 10-fold cross-validation, using all available cores.
rfc_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    return_train_score=True,
)
# Fit the search on the training split and keep the per-candidate results.
rfc_random.fit(X_train, y_train)
cross_val = rfc_random.cv_results_
### Running the model with optimized hyperparameters
# NOTE(review): the hyperparameters are hard-coded here rather than read from
# rfc_random.best_params_ — presumably copied from an earlier search run; verify.
rf_tuned = RandomForestClassifier(n_estimators=144, max_depth=10, max_features='sqrt')
rf_tuned.fit(X_train,y_train)
# Predictions of the tuned model on the held-out test set.
y_pred_rftuned = rf_tuned.predict(X_test)
# 10-fold cross-validation scores of the tuned model on the full data set (X, y).
rf_cv_score = cross_val_score(rf_tuned, X, y, cv=10)
# #Summarized classification report
# print("=== Classification Report ===")
# from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
# print(classification_report(y_test, y_pred_rftuned, digits=4))
# print('\n')
# print("=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_pred_rftuned))
# cm = confusion_matrix(y_test, y_pred_rftuned)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_tuned.classes_)
# ##Saving the tuned model to a external file to be used again without having to train all over again
# from joblib import dump, load
# dump(rf_tuned, 'test3_rf_classifier.joblib')
## Feature Importance Computed with SHAP Values
explainer = shap.TreeExplainer(rf_tuned)
shap_values = explainer.shap_values(X_test, approximate=False, check_additivity=False)
shap.summary_plot(shap_values, X_test, max_display=1000)
# Print the columns ordered by the sum over classes of the mean absolute SHAP
# value, most important first (the ranking shown on the plot's y-axis).
# Normalise the SHAP output to (n_samples, n_features, n_classes) first:
# older shap versions return a list with one (n_samples, n_features) array per
# class, while shap >= 0.45 returns a single 3-D array in that layout already.
if isinstance(shap_values, list):
    sv = np.stack(shap_values, axis=-1)
else:
    sv = np.asarray(shap_values)
    if sv.ndim == 2:  # single-output model: add a length-1 output axis
        sv = sv[:, :, np.newaxis]
# Mean |SHAP| over samples, then summed over classes -> one score per feature.
sv_mean = np.abs(sv).mean(axis=0).sum(axis=-1)
order = np.argsort(sv_mean)[::-1]
ordered_cols = X_test.columns[order]
print(ordered_cols)
TL;DR
# Normalise the SHAP output to (n_samples, n_features, n_classes): older shap
# versions return a list with one (n_samples, n_features) array per class,
# while shap >= 0.45 returns a single 3-D array in that layout already.
if isinstance(shap_values, list):
    sv = np.stack(shap_values, axis=-1)
else:
    sv = np.asarray(shap_values)
    if sv.ndim == 2:  # single-output model: add a length-1 output axis
        sv = sv[:, :, np.newaxis]
# Mean |SHAP| over samples, then summed over classes -> one score per feature.
sv_mean = np.abs(sv).mean(axis=0).sum(axis=-1)
# Feature indices from most to least important.
order = np.argsort(sv_mean)[::-1]
ordered_cols = X_test.columns[order]
print(ordered_cols)
将上面的代码块附加到您链接的笔记本中,您将获得所需的内容。
完整答案
summary_plot 显示的列是按各类别 SHAP 值绝对值的平均值之和排序的。你无法直接从图中提取这些列,但可以自行计算出这一顺序。
完全可重现的示例:
# numpy is required by the ordering code below; without this import the
# example fails with a NameError on `np`.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from shap import TreeExplainer, summary_plot
# Synthetic 5-class problem with 30 features, 10 of them informative.
X, y = make_classification(n_samples=1000, n_features=30,
                           n_classes=5, n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
explainer = TreeExplainer(rf_clf)
shap_values = explainer.shap_values(X_test)
summary_plot(shap_values, max_display=10)
# Normalise the SHAP output to (n_samples, n_features, n_classes): older shap
# versions return a list with one (n_samples, n_features) array per class,
# while shap >= 0.45 returns a single 3-D array in that layout already.
if isinstance(shap_values, list):
    sv = np.stack(shap_values, axis=-1)
else:
    sv = np.asarray(shap_values)
    if sv.ndim == 2:  # single-output model: add a length-1 output axis
        sv = sv[:, :, np.newaxis]
# Mean |SHAP| over samples, then summed over classes -> one score per feature.
sv_mean = np.abs(sv).mean(axis=0).sum(axis=-1)
# Feature indices from most to least important.
order = np.argsort(sv_mean)[::-1]
print(order)
array([17, 15, 26, 4, 24, 3, 20, 1, 2, 27, 10, 12, 22, 11, 21, 23, 9,
28, 19, 7, 16, 8, 5, 14, 25, 0, 13, 6, 29, 18])
有什么不明白的请追问
如何根据 TreeExplainer 的 SHAP 图表的 y 轴标签创建列表?
您好,
我已经能够生成一张图表,它在 y 轴上按重要性顺序排列我的变量。这是一种很好的图形化展示方式,但现在我需要按照变量在图表 y 轴上的排列顺序,提取出这个有序的变量列表。有谁知道如何做到这一点?我在这里放了一张示例图片。
备注:抱歉,我无法添加最小可重现示例。我不知道如何在此处粘贴 Jupyter Notebook 单元格,所以我在下方贴出了通过 GitHub 共享的代码的链接。
在此示例中,列表将是“vB0、mB1、vB1、mB2、mB0、vB2”。
## SHAP GRAPHIC
import pandas as pd
import seaborn as sns
import numpy as np # for sample data
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
import shap
from matplotlib import pyplot as plt
# Seed NumPy's global RNG so the synthetic data below is reproducible.
np.random.seed(1)
# Draw the raw sample data: one column of class labels in 1..5 and six
# feature columns of uniform random values in [0, 1).
cl = np.random.choice(range(1, 6), size=(100, 1))
d = np.random.random_sample(size=(100, 6))
# Place labels and features side by side and wrap them in a labelled DataFrame.
data = pd.DataFrame(
    np.hstack([cl, d]),
    columns=['classe', 'mB0', 'mB1', 'mB2', 'vB0', 'vB1', 'vB2'],
)
# Add an 'id' column that mirrors the sequential row index.
# source: https://pythonexamples.org/pandas-set-column-as-index/#:~:text=Pandas%20%E2%80%93%20Set%20Column%20as%20Index&text=To%20set%20a%20column%20as,index%2C%20to%20set_index()%20method.
data['id'] = data.index
# Target variable (y) is the class label; predictors (X) are every column
# except the label and the id.
y = data['classe']
X = data.drop(columns=['classe', 'id'])
# Hold out 30% of the rows for testing; stratify on y so that both splits keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
X_test.shape, y_test.shape  # notebook-style shape check; no effect when run as a script
##--------------------------- Random Forests Classification ---------------------------
# Baseline Random Forest classifier with 100 trees.
# The model is created and fitted exactly once: fitting a second, identical
# model under the same name would only overwrite this one and double the
# training time.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
# Predictions of the baseline model on the held-out test set.
y_rf_pred = rf.predict(X_test)
## Finding optimal parameters for this model
from sklearn.model_selection import RandomizedSearchCV
# Candidate numbers of trees: 10 evenly spaced values from 100 to 500.
n_estimators = [int(x) for x in np.linspace(start=100, stop=500, num=10)]
# Candidate settings for the number of features considered at every split.
# 'auto' is not listed: scikit-learn deprecated it in 1.1 and removed it in
# 1.3, and for classifiers it was only an alias of 'sqrt'. 'log2' keeps a
# second, genuinely different option in the grid.
max_features = ['sqrt', 'log2']
# Candidate tree depths: 11 evenly spaced values from 1 to 50, plus None.
max_depth = [int(x) for x in np.linspace(1, 50, num=11)]
max_depth.append(None)
# Parameter distributions sampled by the randomized search.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
}
# Randomized search: 100 sampled parameter combinations, each evaluated with
# 10-fold cross-validation, using all available cores.
rfc_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    return_train_score=True,
)
# Fit the search on the training split and keep the per-candidate results.
rfc_random.fit(X_train, y_train)
cross_val = rfc_random.cv_results_
### Running the model with optimized hyperparameters
# NOTE(review): the hyperparameters are hard-coded here rather than read from
# rfc_random.best_params_ — presumably copied from an earlier search run; verify.
rf_tuned = RandomForestClassifier(n_estimators=144, max_depth=10, max_features='sqrt')
rf_tuned.fit(X_train,y_train)
# Predictions of the tuned model on the held-out test set.
y_pred_rftuned = rf_tuned.predict(X_test)
# 10-fold cross-validation scores of the tuned model on the full data set (X, y).
rf_cv_score = cross_val_score(rf_tuned, X, y, cv=10)
# #Summarized classification report
# print("=== Classification Report ===")
# from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
# print(classification_report(y_test, y_pred_rftuned, digits=4))
# print('\n')
# print("=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_pred_rftuned))
# cm = confusion_matrix(y_test, y_pred_rftuned)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_tuned.classes_)
# ##Saving the tuned model to a external file to be used again without having to train all over again
# from joblib import dump, load
# dump(rf_tuned, 'test3_rf_classifier.joblib')
## Feature Importance Computed with SHAP Values
explainer = shap.TreeExplainer(rf_tuned)
shap_values = explainer.shap_values(X_test, approximate=False, check_additivity=False)
shap.summary_plot(shap_values, X_test, max_display=1000)
# Print the columns ordered by the sum over classes of the mean absolute SHAP
# value, most important first (the ranking shown on the plot's y-axis).
# Normalise the SHAP output to (n_samples, n_features, n_classes) first:
# older shap versions return a list with one (n_samples, n_features) array per
# class, while shap >= 0.45 returns a single 3-D array in that layout already.
if isinstance(shap_values, list):
    sv = np.stack(shap_values, axis=-1)
else:
    sv = np.asarray(shap_values)
    if sv.ndim == 2:  # single-output model: add a length-1 output axis
        sv = sv[:, :, np.newaxis]
# Mean |SHAP| over samples, then summed over classes -> one score per feature.
sv_mean = np.abs(sv).mean(axis=0).sum(axis=-1)
order = np.argsort(sv_mean)[::-1]
ordered_cols = X_test.columns[order]
print(ordered_cols)
TL;DR
# Normalise the SHAP output to (n_samples, n_features, n_classes): older shap
# versions return a list with one (n_samples, n_features) array per class,
# while shap >= 0.45 returns a single 3-D array in that layout already.
if isinstance(shap_values, list):
    sv = np.stack(shap_values, axis=-1)
else:
    sv = np.asarray(shap_values)
    if sv.ndim == 2:  # single-output model: add a length-1 output axis
        sv = sv[:, :, np.newaxis]
# Mean |SHAP| over samples, then summed over classes -> one score per feature.
sv_mean = np.abs(sv).mean(axis=0).sum(axis=-1)
# Feature indices from most to least important.
order = np.argsort(sv_mean)[::-1]
ordered_cols = X_test.columns[order]
print(ordered_cols)
将上面的代码块附加到您链接的笔记本中,您将获得所需的内容。
完整答案
summary_plot 显示的列是按各类别 SHAP 值绝对值的平均值之和排序的。你无法直接从图中提取这些列,但可以自行计算出这一顺序。
完全可重现的示例:
# numpy is required by the ordering code below; without this import the
# example fails with a NameError on `np`.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from shap import TreeExplainer, summary_plot
# Synthetic 5-class problem with 30 features, 10 of them informative.
X, y = make_classification(n_samples=1000, n_features=30,
                           n_classes=5, n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
explainer = TreeExplainer(rf_clf)
shap_values = explainer.shap_values(X_test)
summary_plot(shap_values, max_display=10)
# Normalise the SHAP output to (n_samples, n_features, n_classes): older shap
# versions return a list with one (n_samples, n_features) array per class,
# while shap >= 0.45 returns a single 3-D array in that layout already.
if isinstance(shap_values, list):
    sv = np.stack(shap_values, axis=-1)
else:
    sv = np.asarray(shap_values)
    if sv.ndim == 2:  # single-output model: add a length-1 output axis
        sv = sv[:, :, np.newaxis]
# Mean |SHAP| over samples, then summed over classes -> one score per feature.
sv_mean = np.abs(sv).mean(axis=0).sum(axis=-1)
# Feature indices from most to least important.
order = np.argsort(sv_mean)[::-1]
print(order)
array([17, 15, 26, 4, 24, 3, 20, 1, 2, 27, 10, 12, 22, 11, 21, 23, 9,
28, 19, 7, 16, 8, 5, 14, 25, 0, 13, 6, 29, 18])
有什么不明白的请追问