绘制决策树分类器时的交互错误,得到一个值数组..使树很难可视化
Interactive error when plotting the decision tree classifier, get an array of values.. makes the tree very hard to visualize
这是重现决策树分类器树所需的代码,它给出了太多的值来解释图形,我想尽可能避免使用明显的值数组,而使用更简单的值数组。在尝试绘制树之前,需要大部分代码来处理数据集。
import numpy as np
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/Joseph-Villegas/CST383/main/Building_and_Safety_Customer_Service_Request__Closed_.csv")
df.drop([ 'CSR Number', 'Address House Number', 'Address Street Direction',
'Address Street Name', 'Address Street Suffix',
'Parcel Identification Number (PIN)','Address House Fraction Number',
'Address Street Suffix Direction', 'Case Number Related to CSR'],
axis=1, inplace=True)
# Drop any row found with an NA value
df.dropna(axis=0, how='any', inplace=True)
# Observed date columns
date_columns = ['Date Received', 'Date Closed', 'Due Date']
# Function to reformat date string
str_2_date = lambda date: f"{date[:6]}2{re.split('/', date)[2][1:]}"
# Apply said function to the date columns in the dataframe
df = df.apply(lambda column: df[column.name].apply(str_2_date) if column.name in date_columns else column)
for column in date_columns:
original_dtype = str(df[column].dtypes)
df[column] = pd.to_datetime(df[column])
new_dtype = str(df[column].dtypes)
print("{:<20} {:<20} {:<20}".format(column, original_dtype, new_dtype))
for column in date_columns:
df[f"{column} Day of Week"] = df[column].dt.dayofweek # Monday=0, Sunday=6.
df[f"{column} Month"] = df[column].dt.month
df[f"{column} Year"] = df[column].dt.year
# Remove original date columns
df.drop(date_columns, axis=1, inplace=True)
df['Lat.'] = [literal_eval(x)[0] for x in df['Latitude/Longitude']]
df['Lon.'] = [literal_eval(x)[1] for x in df['Latitude/Longitude']]
df.drop('Latitude/Longitude', axis=1, inplace=True)
# Encode the rest of the columns having dtype 'object' using ordinal encoding
object_columns = df.dtypes[(df.dtypes == "object")].index.tolist()
for column in object_columns:
values_list = df[column].value_counts(ascending=True).index.tolist()
ordinal_map = {value:(index + 1) for index, value in enumerate(values_list)}
df[column] = df[column].map(ordinal_map)
def sincos(x, period):
radians = (2 * np.pi * x) / period
return np.column_stack((np.sin(radians), np.cos(radians)))
# Encode the day of week columns
day_of_week_columns = df.filter(like='Day of Week', axis=1).columns.tolist()
for column in day_of_week_columns:
day_sc = sincos(df[column], 7)
df[f"{column} Sin"] = day_sc[:,0]
df[f"{column} Cosine"] = day_sc[:,1]
# Encode the month columns
month_columns = df.filter(like='Month', axis=1).columns.tolist()
for column in month_columns:
month_sc = sincos(df[column], 12)
df[f"{column} Sin"] = day_sc[:,0]
df[f"{column} Cosine"] = day_sc[:,1]
date_info_columns = day_of_week_columns + month_columns
df.drop(date_info_columns, axis=1, inplace=True)
num_na = df.isna().sum().sum()
num_rows, num_cols = df.shape
# Below is the decision tree plot that gives unwanted array of values, is there a way to avoid this???
#-----------------------------------------------------------------------------------------
from sklearn.tree import export_graphviz
import graphviz # needed for the graph
predictors = ['LADBS Inspection District', 'Address Street Zip', 'Date Received Year',
'Date Closed Year', 'Due Date Year', 'Case Flag', 'CSR Priority',
'Lat.', 'Lon.'] # features to predict from
# we must pass np arrays into our decision tree
X = df[predictors].values # numpy array for predictor variables
y = df['Response Days'].values # numpy array for target variable
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state=0)
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)
# Using Decision Tree classifier model, and fit the model with the training data
dot_data = export_graphviz(clf, precision=3,
feature_names=predictors,
proportion=True,
class_names=predictors,
filled=False, rounded=True,
special_characters=True)
# plot it
graph = graphviz.Source(dot_data)
graph
您的目标变量 Response Days
有很多唯一值,因此使用分类器意味着每个叶子都会跟踪每个叶子有多少样本,因此列表很长。您可能更愿意使用回归模型,如果这样做,每个叶子的报告值就是样本中的(单个!)平均目标值。
这是重现决策树分类器树所需的代码,它给出了太多的值来解释图形,我想尽可能避免使用明显的值数组,而使用更简单的值数组。在尝试绘制树之前,需要大部分代码来处理数据集。
import numpy as np
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/Joseph-Villegas/CST383/main/Building_and_Safety_Customer_Service_Request__Closed_.csv")
df.drop([ 'CSR Number', 'Address House Number', 'Address Street Direction',
'Address Street Name', 'Address Street Suffix',
'Parcel Identification Number (PIN)','Address House Fraction Number',
'Address Street Suffix Direction', 'Case Number Related to CSR'],
axis=1, inplace=True)
# Drop any row found with an NA value
df.dropna(axis=0, how='any', inplace=True)
# Observed date columns
date_columns = ['Date Received', 'Date Closed', 'Due Date']
# Function to reformat date string
str_2_date = lambda date: f"{date[:6]}2{re.split('/', date)[2][1:]}"
# Apply said function to the date columns in the dataframe
df = df.apply(lambda column: df[column.name].apply(str_2_date) if column.name in date_columns else column)
for column in date_columns:
original_dtype = str(df[column].dtypes)
df[column] = pd.to_datetime(df[column])
new_dtype = str(df[column].dtypes)
print("{:<20} {:<20} {:<20}".format(column, original_dtype, new_dtype))
for column in date_columns:
df[f"{column} Day of Week"] = df[column].dt.dayofweek # Monday=0, Sunday=6.
df[f"{column} Month"] = df[column].dt.month
df[f"{column} Year"] = df[column].dt.year
# Remove original date columns
df.drop(date_columns, axis=1, inplace=True)
df['Lat.'] = [literal_eval(x)[0] for x in df['Latitude/Longitude']]
df['Lon.'] = [literal_eval(x)[1] for x in df['Latitude/Longitude']]
df.drop('Latitude/Longitude', axis=1, inplace=True)
# Encode the rest of the columns having dtype 'object' using ordinal encoding
object_columns = df.dtypes[(df.dtypes == "object")].index.tolist()
for column in object_columns:
values_list = df[column].value_counts(ascending=True).index.tolist()
ordinal_map = {value:(index + 1) for index, value in enumerate(values_list)}
df[column] = df[column].map(ordinal_map)
def sincos(x, period):
radians = (2 * np.pi * x) / period
return np.column_stack((np.sin(radians), np.cos(radians)))
# Encode the day of week columns
day_of_week_columns = df.filter(like='Day of Week', axis=1).columns.tolist()
for column in day_of_week_columns:
day_sc = sincos(df[column], 7)
df[f"{column} Sin"] = day_sc[:,0]
df[f"{column} Cosine"] = day_sc[:,1]
# Encode the month columns
month_columns = df.filter(like='Month', axis=1).columns.tolist()
for column in month_columns:
month_sc = sincos(df[column], 12)
df[f"{column} Sin"] = day_sc[:,0]
df[f"{column} Cosine"] = day_sc[:,1]
date_info_columns = day_of_week_columns + month_columns
df.drop(date_info_columns, axis=1, inplace=True)
num_na = df.isna().sum().sum()
num_rows, num_cols = df.shape
# Below is the decision tree plot that gives unwanted array of values, is there a way to avoid this???
#-----------------------------------------------------------------------------------------
from sklearn.tree import export_graphviz
import graphviz # needed for the graph
predictors = ['LADBS Inspection District', 'Address Street Zip', 'Date Received Year',
'Date Closed Year', 'Due Date Year', 'Case Flag', 'CSR Priority',
'Lat.', 'Lon.'] # features to predict from
# we must pass np arrays into our decision tree
X = df[predictors].values # numpy array for predictor variables
y = df['Response Days'].values # numpy array for target variable
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state=0)
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)
# Using Decision Tree classifier model, and fit the model with the training data
dot_data = export_graphviz(clf, precision=3,
feature_names=predictors,
proportion=True,
class_names=predictors,
filled=False, rounded=True,
special_characters=True)
# plot it
graph = graphviz.Source(dot_data)
graph
您的目标变量 Response Days
有很多唯一值,因此使用分类器意味着每个叶子都会跟踪每个叶子有多少样本,因此列表很长。您可能更愿意使用回归模型,如果这样做,每个叶子的报告值就是样本中的(单个!)平均目标值。