如何使用 TreeExplainer SHAP 图表的 y 轴标签创建列表?

How to create a list with the y-axis labels of a TreeExplainer shap chart?

如何使用 TreeExplainer SHAP 图表的 y 轴标签创建列表?

您好,

我能够生成一个图表,在 y 轴上按重要性顺序对我的变量进行排序。这是一种很好的图形化可视化方案,但现在我需要提取有序的变量列表,其顺序要与变量在图形 y 轴上的排列一致。有谁知道如何做到这一点?我在这里放了一张示例图片。

注:抱歉,我无法添加最小的可重现示例。我不知道如何在此处粘贴 Jupyter Notebook 单元格,所以我在下方粘贴了链接:

通过 GitHub 共享的代码

在此示例中,列表将是“vB0、mB1、vB1、mB2、mB0、vB2”。

minimal reproducible example

    ## SHAP GRAPHIC

import pandas as pd
import seaborn as sns
import numpy as np  # for sample data

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
import shap
from matplotlib import pyplot as plt

# Seed NumPy's global generator so the synthetic data below is reproducible.
np.random.seed(1)

# 100 class labels drawn from 1..5 (as a column vector), followed by
# 100 rows of six uniform [0, 1) feature values. The draw order matters
# for reproducibility, so labels are sampled before features.
cl = np.random.choice(range(1, 6), size=(100, 1))
d = np.random.random_sample(size=(100, 6))

# Assemble one frame: the label column first, then the six features.
feature_names = ['mB0', 'mB1', 'mB2', 'vB0', 'vB1', 'vB2']
data = pd.DataFrame(
    np.concatenate([cl, d], axis=1),
    columns=['classe'] + feature_names,
)

# Keep the row number as an explicit 'id' column.
# source: https://pythonexamples.org/pandas-set-column-as-index/#:~:text=Pandas%20%E2%80%93%20Set%20Column%20as%20Index&text=To%20set%20a%20column%20as,index%2C%20to%20set_index()%20method.
data = data.assign(id=data.index)

# Target (y) is the class label; predictors (X) are everything except
# the label and the bookkeeping 'id' column.
y = data['classe']
X = data.drop(columns=['classe', 'id'])



# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Notebook leftover: this only displays the hold-out shapes when it is
# the last expression of a cell; it has no effect when run as a script.
X_test.shape, y_test.shape

##--------------------------- Random Forests Classification ---------------------------

# Baseline random forest with 100 trees, trained once. This estimator is
# also the one handed to the hyper-parameter search further down.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

# Baseline predictions on the hold-out set
y_rf_pred = rf.predict(X_test)

## Finding optimal parameters for this model
from sklearn.model_selection import RandomizedSearchCV

# Candidate numbers of trees: 10 evenly spaced values in [100, 500],
# truncated to integers.
n_estimators = [int(x) for x in np.linspace(start=100, stop=500, num=10)]

# Candidate numbers of features considered at every split.
# 'auto' is not listed: it was removed in scikit-learn 1.3 and, for
# classifiers, was only an alias of 'sqrt'.
max_features = ['sqrt', 'log2']

# Candidate tree depths: 11 evenly spaced values in [1, 50], plus None
# (no depth limit).
max_depth = [int(x) for x in np.linspace(1, 50, num=11)]
max_depth.append(None)

# Search space sampled by the randomized search
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
}

# Sample 100 parameter combinations, scoring each with 10-fold
# cross-validation on all available cores.
rfc_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    return_train_score=True,
)
# Fit the model
rfc_random.fit(X_train, y_train)

# Per-candidate scores and timings collected during the search
cross_val = rfc_random.cv_results_


### Refit a forest with fixed hyper-parameters
# NOTE(review): these values are hard-coded; presumably they were copied
# from the randomized search results -- confirm.
tuned_params = {'n_estimators': 144, 'max_depth': 10, 'max_features': 'sqrt'}
rf_tuned = RandomForestClassifier(**tuned_params)
rf_tuned.fit(X_train, y_train)

# Hold-out predictions of the tuned model
y_pred_rftuned = rf_tuned.predict(X_test)

# 10-fold cross-validated score of the tuned model on the full data set
rf_cv_score = cross_val_score(rf_tuned, X, y, cv=10)

# #Summarized classification report
# print("=== Classification Report ===")
# from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
# print(classification_report(y_test, y_pred_rftuned, digits=4))

# print('\n')

# print("=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_pred_rftuned))

# cm = confusion_matrix(y_test, y_pred_rftuned)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_tuned.classes_)


# ##Saving the tuned model to a external file to be used again without having to train all over again
# from joblib import dump, load
# dump(rf_tuned, 'test3_rf_classifier.joblib')

## Feature Importance Computed with SHAP Values

explainer = shap.TreeExplainer(rf_tuned)
shap_values = explainer.shap_values(X_test, approximate=False, check_additivity=False)

shap.summary_plot(shap_values, X_test, max_display=1000)

# Print the columns ordered by mean |SHAP value| (most important first).
# The reduction depends on how this shap version lays out its result:
#   * list of per-class (n_samples, n_features) arrays (older releases)
#   * one (n_samples, n_features, n_outputs) array (shap >= 0.45)
#   * one (n_samples, n_features) array (single-output models)
sv = np.abs(np.array(shap_values))
if isinstance(shap_values, list):
    # stacked to (n_classes, n_samples, n_features)
    sv_mean = sv.mean(1).sum(0)
elif sv.ndim == 3:
    # average over samples, then add up the outputs/classes
    sv_mean = sv.mean(0).sum(-1)
else:
    sv_mean = sv.mean(0)
order = np.argsort(sv_mean)[::-1]
ordered_cols = X_test.columns[order]
print(ordered_cols)

TL;DR

# Order the columns by mean |SHAP value|, summed over classes (most
# important first). Handles both layouts shap_values can come in:
# a list of per-class (n_samples, n_features) arrays (older shap), or a
# single (n_samples, n_features[, n_outputs]) array (shap >= 0.45).
sv = np.abs(np.array(shap_values))
if isinstance(shap_values, list):
    # stacked to (n_classes, n_samples, n_features)
    sv_mean = sv.mean(1).sum(0)
elif sv.ndim == 3:
    # average over samples, then add up the outputs/classes
    sv_mean = sv.mean(0).sum(-1)
else:
    sv_mean = sv.mean(0)
order = np.argsort(sv_mean)[::-1]
ordered_cols = X_test.columns[order]
print(ordered_cols)

将上面的代码块附加到您链接的笔记本中,您将获得所需的内容。

完整答案

summary_plot 按每个特征的 SHAP 绝对值的均值(对所有类别求和)对列进行排序。你无法直接从图中提取这个顺序,但可以用同样的方法自行计算。

完全可重现的示例:

import numpy as np  # needed for the ordering step; makes the example standalone
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from shap import TreeExplainer, summary_plot

# Synthetic 5-class problem: 1000 samples, 30 features, 10 of them informative
X, y = make_classification(n_samples=1000, n_features=30,
          n_classes=5, n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

explainer = TreeExplainer(rf_clf)
shap_values = explainer.shap_values(X_test)
summary_plot(shap_values, max_display=10)

# Feature indices ordered by mean |SHAP value|, summed over classes
# (most important first). Handles both layouts shap_values can come in:
# a list of per-class (n_samples, n_features) arrays (older shap), or a
# single (n_samples, n_features[, n_outputs]) array (shap >= 0.45).
sv = np.abs(np.array(shap_values))
if isinstance(shap_values, list):
    # stacked to (n_classes, n_samples, n_features)
    sv_mean = sv.mean(1).sum(0)
elif sv.ndim == 3:
    # average over samples, then add up the outputs/classes
    sv_mean = sv.mean(0).sum(-1)
else:
    sv_mean = sv.mean(0)
order = np.argsort(sv_mean)[::-1]
print(order)

array([17, 15, 26,  4, 24,  3, 20,  1,  2, 27, 10, 12, 22, 11, 21, 23,  9,
       28, 19,  7, 16,  8,  5, 14, 25,  0, 13,  6, 29, 18])

有什么不明白的请追问