突出显示 CDF 图中的异常值区域

Question

我试图在我的可视化中突出显示 "outliers" 所在的 CDF 区域（可能是浅红色阴影以区分该区域）。

你能协助根据上面的定义对 "outlier" 指向的区域进行着色吗？出于某种原因，当我尝试查看异常值定义时，我得到一个空输出，无论是 print(outliers_iqr(days)) 还是 print(str(outliers_iqr(days)[1:-1])。它只是打印 array([], dtype=int64),

这是我当前的代码：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105, 
 50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
 95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
 368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
 192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
 178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
 118, 105, 92, 133, 77, 54, 72, 34]

#create CDF definition
def ecdf(data):
    n = len(data)
    x = np.sort(data)
    y = np.arange(1.0, n+1) / n
    return x, y

#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys):
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((ys > upper_bound) | (ys < lower_bound))

days = pd.DataFrame({"days" : a})

x, y = ecdf(days)

plt.plot(x, y, marker='.', linestyle='none') 
plt.axvline(x.mean(), color='gray', linestyle='dashed', linewidth=2) #Add mean

x_m = int(x.mean())
y_m = stats.percentileofscore(days.as_matrix(), x.mean())/100.0

ax=plt.gca()
ax.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), 
            xytext=(10,-5), textcoords='offset points')

outliers= outliers_iqr(days) 
print(outliers_iqr(days)) #print outliers- doesn't print   
print(str(outliers_iqr(days))[1:-1]) #same

#highlight the outliers area in the CDF plot
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) #between 0 and 1st quartile
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) #between 3rd quartile and 1

percentiles= np.array([25,50,75])
x_p = np.percentile(days, percentiles)
y_p = percentiles/100.0

plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # Overlay quartiles

for x,y in zip(x_p, y_p):                                        
    ax.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')

plt.xlabel('Days')
plt.ylabel('ECDF')
plt.legend(('Days', "Mean", 'Quartiles'), loc='lower right')

plt.show()

Answer 1

如果您的异常值数组有时可能为空，则您必须使用 if 语句来处理这种情况。此外，由于您只想遮蔽情节的区域，因此您实际上可以使用 Axes.axvspan 。这是一个对原始示例进行了一些修改的示例（函数内的所有绘图命令并添加了第二个子图，其中包含实际上具有异常值的数据）：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105, 
 50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133, 
 95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105, 
 368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214, 
 192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50, 
 178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19, 
 118, 105, 92, 133, 77, 54, 72, 34]


#create CDF definition
def ecdf(data):
    n = len(data)
    x = np.sort(data)
    y = np.arange(1.0, n+1) / n
    return x, y

#Using +-1.5x IQR method for defining outliers
def outliers_iqr(ys):
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)

    return  np.where((ys < lower_bound)), np.where((ys > upper_bound))



def generate_plot(ax, df):

    x, y = ecdf(df)

    ax.plot(x, y, marker='.', linestyle='none') 
    ax.axvline(x.mean(), color='gray', linestyle='dashed', linewidth=2) #Add mean

    x_m = int(x.mean())
    y_m = stats.percentileofscore(df.as_matrix(), x.mean())/100.0

    ax.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), 
                xytext=(10,-5), textcoords='offset points')

    outliers= outliers_iqr(df.values) 

    #highlight the outliers area in the CDF plot
    for outl in outliers:
        vals = df.values[outl]
        if vals.size>0:
            ax.axvspan(np.min(vals),np.max(vals),alpha=0.5,color='red')


    percentiles= np.array([25,50,75])
    x_p = np.percentile(df, percentiles)
    y_p = percentiles/100.0

    ax.plot(x_p, y_p, marker='D', color='red', linestyle='none') # Overlay quartiles

    for x,y in zip(x_p, y_p):                                        
        ax.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')

    ax.set_xlabel('Days')
    ax.set_ylabel('ECDF')
    ax.legend(('Days', "Mean", 'Quartiles'), loc='lower right')


fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize=(10,5))

##original data
days = pd.DataFrame({"days" : a})
generate_plot(axes[0],days)

##fake data with outliers
b = np.concatenate([
    np.random.normal(200,50,300),
    np.random.normal(25,10,20),
    np.random.normal(375,10,20),
])
np.random.shuffle(b)
generate_plot(axes[1],pd.DataFrame({"days" : b}))

##naming the subplots
axes[0].set_title('original data')
axes[1].set_title('fake data with outliers')

plt.show()

结果如下所示：

希望对您有所帮助。

突出显示 CDF 图中的异常值区域

Highlight the outliers area in CDF plot

python

matplotlib

python-2.7

ecdf