如何创建单个箱线图?
How can I create a single box plot?
数据集:https://github.com/rashida048/Datasets/blob/master/StudentsPerformance.csv
from bokeh.models import Range1d #used to set x and y limits #p.y_range=Range1d(120, 230)
def box_plot(df, vals, label, ylabel=None, xlabel=None, title=None):
    """Build a box plot of ``df[vals]`` with one box per ``label`` group.

    Args:
        df: Data to plot; it is grouped with ``df.groupby(label)``.
        vals: Name of the column whose distribution is drawn.
        label: Column (or other ``groupby`` key) that defines the categories
            shown on the x axis.
        ylabel: Optional y-axis label.
        xlabel: Optional x-axis label.
        title: Optional plot title; when ``None`` the figure keeps its
            default title.

    Returns:
        The figure holding the stems, boxes, median marks, whisker caps and
        outlier points.
    """
    # Group the data frame and collect the category names.
    df_gb = df.groupby(label)
    cats = list(df_gb.groups.keys())

    # Quartiles of `vals` within each group.
    q1 = df_gb[vals].quantile(q=0.25)
    q2 = df_gb[vals].quantile(q=0.5)
    q3 = df_gb[vals].quantile(q=0.75)

    # Interquartile range and the 1.5 * IQR cutoffs beyond which a value
    # counts as an outlier.
    iqr = q3 - q1
    upper_cutoff = q3 + 1.5*iqr
    lower_cutoff = q1 - 1.5*iqr

    def outliers(group):
        """Return the `vals` entries of one group that lie outside its cutoffs."""
        cat = group.name
        outlier_inds = (group[vals] > upper_cutoff[cat]) \
            | (group[vals] < lower_cutoff[cat])
        return group[vals][outlier_inds]

    # Apply the outlier finder to every group.
    out = df_gb.apply(outliers).dropna()

    # Flatten the outliers into parallel x (category) / y (value) lists.
    outx = []
    outy = []
    for cat in cats:
        # Only add outliers if they exist for this category.
        if cat in out and not out[cat].empty:
            for value in out[cat]:
                outx.append(cat)
                outy.append(value)

    # Whisker ends: the group's extreme value, clamped to the cutoff when
    # values lie beyond it.
    qmin = df_gb[vals].min()
    qmax = df_gb[vals].max()
    upper = [min([x, y]) for (x, y) in zip(qmax, upper_cutoff)]
    lower = [max([x, y]) for (x, y) in zip(qmin, lower_cutoff)]

    # The categorical x range is built from string factors.
    cats = [str(i) for i in cats]

    # Build figure
    p = figure(sizing_mode='stretch_width', x_range=cats, height=300, toolbar_location=None)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_width = 2
    p.yaxis.axis_label = ylabel
    p.xaxis.axis_label = xlabel
    # Assigning None would leave no title object for the alignment below to
    # act on, so the default title is kept when no title is given.
    if title is not None:
        p.title = title
    p.y_range.start = 0
    p.title.align = 'center'

    # stems
    p.segment(cats, upper, cats, q3, line_width=2, line_color="black")
    p.segment(cats, lower, cats, q1, line_width=2, line_color="black")

    # boxes
    p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1,
           fill_color=['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9'],
           alpha=0.7, line_width=2, line_color="black")

    # median (almost-0 height rects simpler than segments)
    p.rect(cats, q2, 0.5, 0.01, line_color="black", line_width=2)

    # whiskers (almost-0 height rects simpler than segments)
    p.rect(cats, lower, 0.2, 0.01, line_color="black")
    p.rect(cats, upper, 0.2, 0.01, line_color="black")

    # outliers
    p.circle(outx, outy, size=6, color="black")

    return p
# Draw the spread of 'Total' for each 'race/ethnicity' group and display it.
p = box_plot(
    df,
    'Total',
    'race/ethnicity',
    ylabel='Total spread',
    xlabel='',
    title='BoxPlot',
)
show(p)
你好,只要传入一个分类变量,我就可以用上面的代码和数据集生成箱线图。但是,当我尝试只为单个列生成箱线图时,却什么也生成不出来。例如,我只想查看数学分数的分布。我试过
cats = df['math score']
但是没用。有什么建议吗?
我认为把这两种情况放在同一个函数里实现并不是最好的做法,但如果这是你的目标,一种解决方案是添加一些 if-else 条件。
更改说明如下:
先给 label 一个默认值。
# old
# def box_plot(df, vals, label, ylabel=None,xlabel=None,title=None):
# new
def box_plot(df, vals, label=None, ylabel=None,xlabel=None,title=None):
然后为 groupby 部分添加一个 if-else 分支。
# old
# # Group Data frame
# df_gb = df.groupby(label)
# # Get the categories
# cats = list(df_gb.groups.keys())
# new
if label is not None:
    # Grouped case: one box per category of `label`.
    # Group Data frame
    df_gb = df.groupby(label)
    # Get the categories
    cats = list(df_gb.groups.keys())
else:
    # Single-column case: keep only `vals` and use its name as the one
    # category on the x axis.
    df_gb = df[[vals]]
    cats = [vals]
现在离群值的计算有点不同,因为我们不必遍历多个列,只剩下一列了。
if label is not None:
    # Grouped case: find the outliers of every group separately.
    out = df_gb.apply(outliers).dropna()
else:
    # Single-column case: select the rows whose value lies outside the
    # cutoffs. `out` keeps the `vals` column, so the later `out[cat]`
    # lookup (with cat == vals) yields the outlier values.
    out = df[(df[vals] > upper_cutoff) | (df[vals] < lower_cutoff)]
upper 和 lower 现在是 float,而不是 list。
if label is not None:
    # Grouped case: one whisker end per category.
    upper = [min([x, y]) for (x, y) in zip(qmax, upper_cutoff)]
    lower = [max([x, y]) for (x, y) in zip(qmin, lower_cutoff)]
else:
    # Single-column case: there is only one pair of values to compare.
    upper = min(qmax, upper_cutoff)
    lower = max(qmin, lower_cutoff)
我还添加(更改)了下面的行,以避免警告。
# Give every box a fill color. The palette is repeated when there are more
# categories than palette entries, so the color list always has one entry
# per category (for up to five categories this equals the plain slice).
palette = ['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9']
colors = [palette[i % len(palette)] for i in range(len(cats))]
p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=colors, alpha=0.7, line_width=2, line_color="black")
做了这些更改之后,下面这个调用的输出
p = box_plot(df, 'math score', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')
还是一样,但是
p = box_plot(df, 'math score', ylabel='Total spread',xlabel='',title='BoxPlot')
现在给我们一个箱线图。
数据集:https://github.com/rashida048/Datasets/blob/master/StudentsPerformance.csv
from bokeh.models import Range1d #used to set x and y limits #p.y_range=Range1d(120, 230)
def box_plot(df, vals, label, ylabel=None, xlabel=None, title=None):
    """Build a box plot of ``df[vals]`` with one box per ``label`` group.

    Args:
        df: Data to plot; it is grouped with ``df.groupby(label)``.
        vals: Name of the column whose distribution is drawn.
        label: Column (or other ``groupby`` key) that defines the categories
            shown on the x axis.
        ylabel: Optional y-axis label.
        xlabel: Optional x-axis label.
        title: Optional plot title; when ``None`` the figure keeps its
            default title.

    Returns:
        The figure holding the stems, boxes, median marks, whisker caps and
        outlier points.
    """
    # Group the data frame and collect the category names.
    df_gb = df.groupby(label)
    cats = list(df_gb.groups.keys())

    # Quartiles of `vals` within each group.
    q1 = df_gb[vals].quantile(q=0.25)
    q2 = df_gb[vals].quantile(q=0.5)
    q3 = df_gb[vals].quantile(q=0.75)

    # Interquartile range and the 1.5 * IQR cutoffs beyond which a value
    # counts as an outlier.
    iqr = q3 - q1
    upper_cutoff = q3 + 1.5*iqr
    lower_cutoff = q1 - 1.5*iqr

    def outliers(group):
        """Return the `vals` entries of one group that lie outside its cutoffs."""
        cat = group.name
        outlier_inds = (group[vals] > upper_cutoff[cat]) \
            | (group[vals] < lower_cutoff[cat])
        return group[vals][outlier_inds]

    # Apply the outlier finder to every group.
    out = df_gb.apply(outliers).dropna()

    # Flatten the outliers into parallel x (category) / y (value) lists.
    outx = []
    outy = []
    for cat in cats:
        # Only add outliers if they exist for this category.
        if cat in out and not out[cat].empty:
            for value in out[cat]:
                outx.append(cat)
                outy.append(value)

    # Whisker ends: the group's extreme value, clamped to the cutoff when
    # values lie beyond it.
    qmin = df_gb[vals].min()
    qmax = df_gb[vals].max()
    upper = [min([x, y]) for (x, y) in zip(qmax, upper_cutoff)]
    lower = [max([x, y]) for (x, y) in zip(qmin, lower_cutoff)]

    # The categorical x range is built from string factors.
    cats = [str(i) for i in cats]

    # Build figure
    p = figure(sizing_mode='stretch_width', x_range=cats, height=300, toolbar_location=None)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_width = 2
    p.yaxis.axis_label = ylabel
    p.xaxis.axis_label = xlabel
    # Assigning None would leave no title object for the alignment below to
    # act on, so the default title is kept when no title is given.
    if title is not None:
        p.title = title
    p.y_range.start = 0
    p.title.align = 'center'

    # stems
    p.segment(cats, upper, cats, q3, line_width=2, line_color="black")
    p.segment(cats, lower, cats, q1, line_width=2, line_color="black")

    # boxes
    p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1,
           fill_color=['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9'],
           alpha=0.7, line_width=2, line_color="black")

    # median (almost-0 height rects simpler than segments)
    p.rect(cats, q2, 0.5, 0.01, line_color="black", line_width=2)

    # whiskers (almost-0 height rects simpler than segments)
    p.rect(cats, lower, 0.2, 0.01, line_color="black")
    p.rect(cats, upper, 0.2, 0.01, line_color="black")

    # outliers
    p.circle(outx, outy, size=6, color="black")

    return p
# Draw the spread of 'Total' for each 'race/ethnicity' group and display it.
p = box_plot(
    df,
    'Total',
    'race/ethnicity',
    ylabel='Total spread',
    xlabel='',
    title='BoxPlot',
)
show(p)
你好,只要传入一个分类变量,我就可以用上面的代码和数据集生成箱线图。但是,当我尝试只为单个列生成箱线图时,却什么也生成不出来。例如,我只想查看数学分数的分布。我试过
cats = df['math score']
但是没用。有什么建议吗?
我认为把这两种情况放在同一个函数里实现并不是最好的做法,但如果这是你的目标,一种解决方案是添加一些 if-else 条件。
更改说明如下:
先给 label 一个默认值。
# old
# def box_plot(df, vals, label, ylabel=None,xlabel=None,title=None):
# new
def box_plot(df, vals, label=None, ylabel=None,xlabel=None,title=None):
然后为 groupby 部分添加一个 if-else 分支。
# old
# # Group Data frame
# df_gb = df.groupby(label)
# # Get the categories
# cats = list(df_gb.groups.keys())
# new
if label is not None:
    # Grouped case: one box per category of `label`.
    # Group Data frame
    df_gb = df.groupby(label)
    # Get the categories
    cats = list(df_gb.groups.keys())
else:
    # Single-column case: keep only `vals` and use its name as the one
    # category on the x axis.
    df_gb = df[[vals]]
    cats = [vals]
现在离群值的计算有点不同,因为我们不必遍历多个列,只剩下一列了。
if label is not None:
    # Grouped case: find the outliers of every group separately.
    out = df_gb.apply(outliers).dropna()
else:
    # Single-column case: select the rows whose value lies outside the
    # cutoffs. `out` keeps the `vals` column, so the later `out[cat]`
    # lookup (with cat == vals) yields the outlier values.
    out = df[(df[vals] > upper_cutoff) | (df[vals] < lower_cutoff)]
upper 和 lower 现在是 float,而不是 list。
if label is not None:
    # Grouped case: one whisker end per category.
    upper = [min([x, y]) for (x, y) in zip(qmax, upper_cutoff)]
    lower = [max([x, y]) for (x, y) in zip(qmin, lower_cutoff)]
else:
    # Single-column case: there is only one pair of values to compare.
    upper = min(qmax, upper_cutoff)
    lower = max(qmin, lower_cutoff)
我还添加(更改)了下面的行,以避免警告。
# Give every box a fill color. The palette is repeated when there are more
# categories than palette entries, so the color list always has one entry
# per category (for up to five categories this equals the plain slice).
palette = ['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9']
colors = [palette[i % len(palette)] for i in range(len(cats))]
p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=colors, alpha=0.7, line_width=2, line_color="black")
做了这些更改之后,下面这个调用的输出
p = box_plot(df, 'math score', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')
还是一样,但是
p = box_plot(df, 'math score', ylabel='Total spread',xlabel='',title='BoxPlot')
现在给我们一个箱线图。