如何创建单个箱线图?

how can I create a single box plot?

数据集:https://github.com/rashida048/Datasets/blob/master/StudentsPerformance.csv

from bokeh.models import Range1d #used to set x and y limits #p.y_range=Range1d(120, 230)

def box_plot(df, vals, label=None, ylabel=None, xlabel=None, title=None):
    """Build a Bokeh box plot of column ``vals``, optionally grouped by ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    vals : str
        Name of the numeric column whose distribution is drawn.
    label : column name, optional
        Column to group by; one box is drawn per group.  When omitted, a
        single box is drawn for the whole column and labelled with ``vals``.
    ylabel, xlabel : str, optional
        Axis labels.
    title : str, optional
        Plot title (centred).  When omitted, the figure's default title is
        left untouched.

    Returns
    -------
    The Bokeh figure holding the stems, boxes, medians, whisker caps and
    outlier markers.
    """
    if label is None:
        # Put every row into one pseudo-group named after the column, so the
        # grouped code path below also serves the single-box case.
        grouped = df[vals].groupby(lambda _row: vals)
    else:
        grouped = df.groupby(label)[vals]

    # Quartiles per group
    q1 = grouped.quantile(q=0.25)
    q2 = grouped.quantile(q=0.5)
    q3 = grouped.quantile(q=0.75)

    # Interquartile range and the 1.5*IQR fences beyond which a point is
    # treated as an outlier
    iqr = q3 - q1
    upper_cutoff = q3 + 1.5 * iqr
    lower_cutoff = q1 - 1.5 * iqr

    # Collect outlier coordinates.  The x value is the *string* form of the
    # category because the categorical x_range below is built from strings.
    outx = []
    outy = []
    for cat, values in grouped:
        flagged = values[(values > upper_cutoff[cat]) | (values < lower_cutoff[cat])]
        outx.extend([str(cat)] * len(flagged))
        outy.extend(flagged.tolist())

    # Whisker ends: the group's extreme value, clamped to the fence when the
    # extreme lies beyond it
    upper = [min(hi, cut) for hi, cut in zip(grouped.max(), upper_cutoff)]
    lower = [max(lo, cut) for lo, cut in zip(grouped.min(), lower_cutoff)]

    # Take the categories from the statistics' index so they line up
    # positionally with q1/q2/q3 and the whisker lists
    cats = [str(cat) for cat in q1.index]

    # One fill colour per category; cycle the palette so the list length
    # always matches the number of boxes
    palette = ['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9']
    colors = [palette[i % len(palette)] for i in range(len(cats))]

    # Build figure
    p = figure(sizing_mode='stretch_width', x_range=cats, height=300, toolbar_location=None)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_width = 2
    p.yaxis.axis_label = ylabel
    p.xaxis.axis_label = xlabel
    p.y_range.start = 0
    if title is not None:
        # Assigning None would clear the title object and make the
        # alignment assignment below fail
        p.title = title
        p.title.align = 'center'

    # stems
    p.segment(cats, upper, cats, q3, line_width=2, line_color="black")
    p.segment(cats, lower, cats, q1, line_width=2, line_color="black")

    # boxes
    p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=colors,
           alpha=0.7, line_width=2, line_color="black")

    # median (almost-0 height rects simpler than segments)
    p.rect(cats, q2, 0.5, 0.01, line_color="black", line_width=2)

    # whisker caps (almost-0 height rects simpler than segments)
    p.rect(cats, lower, 0.2, 0.01, line_color="black")
    p.rect(cats, upper, 0.2, 0.01, line_color="black")

    # outliers
    p.circle(outx, outy, size=6, color="black")

    return p

# One box per 'race/ethnicity' group for the 'Total' column, then hand the
# resulting figure to show().
p = box_plot(
    df,
    'Total',
    'race/ethnicity',
    ylabel='Total spread',
    xlabel='',
    title='BoxPlot',
)
show(p)

你好,只要我传入分类变量,就可以用上面的代码和数据集生成按类别分组的箱线图。但是,当我尝试只为单个列生成箱线图时(例如只想查看数学分数的分布),却得不到任何图形。我试过:

cats = df['math score'] 

但是没用。有什么建议吗?

我认为把这两种情况放在同一个函数里实现并不是最好的做法,但如果这就是你的目标,一种解决办法是添加一些 if-else 条件。

更改说明如下:

先给label一个默认值。

# old
# def box_plot(df, vals, label, ylabel=None,xlabel=None,title=None):
# new
def box_plot(df, vals, label=None, ylabel=None,xlabel=None,title=None):

然后为groupby部分添加一个if-else部分。

# old
# # Group Data frame
# df_gb = df.groupby(label)
# # Get the categories
# cats = list(df_gb.groups.keys())

# new
if label is None:
    # No grouping column: keep only the value column and treat its name as
    # the single category
    df_gb = df[[vals]]
    cats = [vals]
else:
    # Group the data frame and collect the group keys as categories
    df_gb = df.groupby(label)
    cats = list(df_gb.groups)

现在离群值(outliers)的计算略有不同,因为不必再遍历多个分组,只剩下一列(one column)了。

if label is None:
    # Single column: the cutoffs are plain scalars, so one boolean mask over
    # the whole frame selects the outlier rows
    is_outlier = (df[vals] < lower_cutoff) | (df[vals] > upper_cutoff)
    out = df[is_outlier]
else:
    # Grouped: run the per-group outlier finder
    out = df_gb.apply(outliers).dropna()

`upper` 和 `lower` 现在是浮点数(float),而不是列表(list)。

if label is None:
    # Single column: qmax/qmin and the cutoffs are scalars
    upper = min(qmax, upper_cutoff)
    lower = max(qmin, lower_cutoff)
else:
    # Grouped: clamp each group's extreme value to its cutoff
    upper = [min(pair) for pair in zip(qmax, upper_cutoff)]
    lower = [max(pair) for pair in zip(qmin, lower_cutoff)]

我还修改了下面这两行,使颜色列表的长度与类别数一致,以避免警告。

# Cycle through the palette so there is exactly one fill colour per category,
# whether there are fewer or more categories than palette entries (a plain
# slice would still come up short beyond five categories).
palette = ['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9']
colors = [palette[i % len(palette)] for i in range(len(cats))]
p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=colors, alpha=0.7, line_width=2, line_color="black")

做了这些更改之后,下面这个调用的输出
p = box_plot(df, 'math score', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')

仍与之前相同,而下面这个不传分组列的调用

p = box_plot(df, 'math score', ylabel='Total spread',xlabel='',title='BoxPlot')

现在也能生成一个(单列的)箱线图了。