突出显示 CDF 图中的异常值区域
Highlight the outliers area in CDF plot
我试图在我的可视化中突出显示 "outliers" 所在的 CDF 区域(可能是浅红色阴影以区分该区域)。
你能协助根据上面的定义对 "outlier" 所指的区域进行着色吗?出于某种原因,当我尝试查看异常值时,得到的是空输出:无论是 print(outliers_iqr(days)) 还是 print(str(outliers_iqr(days))[1:-1]),都只是打印 array([], dtype=int64)。
这是我当前的代码:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Sample observations; wrapped further down in a DataFrame column named "days".
a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105,
50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133,
95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105,
368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214,
192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50,
178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19,
118, 105, 92, 133, 77, 54, 72, 34]
# Empirical CDF helper
def ecdf(data):
    """Return the points of the empirical CDF of ``data``.

    ``data`` may be any numeric array-like (list, ndarray, single-column
    DataFrame, ...).  It is flattened to 1-D before sorting because
    ``np.sort`` sorts along the last axis, which would leave an ``(n, 1)``
    column completely unsorted.

    Returns:
        A tuple ``(x, y)`` of 1-D arrays: ``x`` holds the values in
        ascending order and ``y`` the cumulative fractions
        ``1/n, 2/n, ..., 1``.
    """
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y
# Outlier detection with the +-1.5 x IQR rule
def outliers_iqr(ys):
    """Locate the entries of ``ys`` that fall outside the 1.5 x IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``ys``.

    Args:
        ys: numeric array-like; converted with ``np.asarray`` so that plain
            lists can be compared against the fences as well.

    Returns:
        The result of ``np.where`` on the outlier mask: a tuple with one
        index array per dimension of ``ys``.  The arrays are empty when no
        value lies beyond either fence.
    """
    ys = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((ys > upper_bound) | (ys < lower_bound))
# Wrap the sample in a single-column DataFrame and compute its ECDF points.
days = pd.DataFrame({"days" : a})
x, y = ecdf(days)
plt.plot(x, y, marker='.', linestyle='none')
plt.axvline(x.mean(), color='gray', linestyle='dashed', linewidth=2) # dashed vertical line at the mean
x_m = int(x.mean())
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0; `.values` is the
# usual replacement - confirm against the pandas version in use.
y_m = stats.percentileofscore(days.as_matrix(), x.mean())/100.0
ax=plt.gca()
# Label the mean with "(value,percentile)".
ax.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m),
xytext=(10,-5), textcoords='offset points')
outliers= outliers_iqr(days)
print(outliers_iqr(days)) # print the outlier indices; the question reports this comes back empty
print(str(outliers_iqr(days))[1:-1]) # same result with the first and last characters of its string form stripped
# Highlight the outlier area in the CDF plot.
# The `?` arguments below are unfilled placeholders from the question, not valid Python.
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) # between 0 and 1st quartile
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) # between 3rd quartile and 1
# Quartile markers: x is the 25th/50th/75th percentile value, y the matching fraction.
percentiles= np.array([25,50,75])
x_p = np.percentile(days, percentiles)
y_p = percentiles/100.0
plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # overlay quartiles
# NOTE(review): the annotate call below is the loop body; its indentation was lost in this paste.
# The loop also rebinds the names x and y used above for the ECDF arrays.
for x,y in zip(x_p, y_p):
ax.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')
plt.xlabel('Days')
plt.ylabel('ECDF')
plt.legend(('Days', "Mean", 'Quartiles'), loc='lower right')
plt.show()
如果您的异常值数组有时可能为空,则必须使用 if 语句来处理这种情况。此外,由于您只是想给图中的某些区域加上阴影,实际上可以使用 Axes.axvspan。下面是对原始示例稍作修改后的版本(把所有绘图命令放进了一个函数,并添加了第二个子图,其中的数据确实包含异常值):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Sample observations; wrapped further down in a DataFrame column named "days".
a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105,
50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133,
95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105,
368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214,
192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50,
178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19,
118, 105, 92, 133, 77, 54, 72, 34]
# Empirical CDF helper
def ecdf(data):
    """Return the points of the empirical CDF of ``data``.

    ``data`` may be any numeric array-like (list, ndarray, single-column
    DataFrame, ...).  It is flattened to 1-D before sorting because
    ``np.sort`` sorts along the last axis, which would leave an ``(n, 1)``
    column completely unsorted.

    Returns:
        A tuple ``(x, y)`` of 1-D arrays: ``x`` holds the values in
        ascending order and ``y`` the cumulative fractions
        ``1/n, 2/n, ..., 1``.
    """
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y
# Outlier detection with the +-1.5 x IQR rule
def outliers_iqr(ys):
    """Locate low and high outliers of ``ys`` using the 1.5 x IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``ys``.

    Args:
        ys: numeric array-like; converted with ``np.asarray`` so that plain
            lists can be compared against the fences as well.

    Returns:
        A pair ``(low, high)``.  Each element is the result of ``np.where``
        (a tuple with one index array per dimension of ``ys``): ``low``
        for values below the lower fence, ``high`` for values above the
        upper fence.  The index arrays are empty when there is no outlier
        on that side.
    """
    ys = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where(ys < lower_bound), np.where(ys > upper_bound)
def generate_plot(ax, df):
    """Draw the ECDF of ``df`` on ``ax`` and shade the outlier ranges.

    The plot contains the ECDF points, a dashed line at the mean (annotated
    with its value and percentile), red diamonds at the quartiles, and a red
    vertical span over each group of outliers reported by ``outliers_iqr``
    (one for the low side, one for the high side).  A side without outliers
    is left unshaded.

    Args:
        ax: matplotlib Axes to draw on.
        df: DataFrame holding the sample; its ``.values`` are used.
    """
    values = df.values
    x, y = ecdf(df)
    ax.plot(x, y, marker='.', linestyle='none', label='Days')

    # Dashed vertical line at the mean, annotated as "(value,percentile)".
    mean = x.mean()
    ax.axvline(mean, color='gray', linestyle='dashed', linewidth=2,
               label='Mean')
    x_m = int(mean)
    # `.values` replaces DataFrame.as_matrix(), which current pandas no longer
    # provides.  The sample is flattened so percentileofscore gets a 1-D
    # array and yields a single number for the scalar score.
    y_m = stats.percentileofscore(values.ravel(), mean) / 100.0
    ax.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m),
                xytext=(10, -5), textcoords='offset points')

    # Highlight the outlier area in the CDF plot; an empty index set means
    # there is nothing to shade on that side.
    for outl in outliers_iqr(values):
        vals = values[outl]
        if vals.size > 0:
            ax.axvspan(np.min(vals), np.max(vals), alpha=0.5, color='red')

    # Overlay the quartiles.
    percentiles = np.array([25, 50, 75])
    x_p = np.percentile(df, percentiles)
    y_p = percentiles / 100.0
    ax.plot(x_p, y_p, marker='D', color='red', linestyle='none',
            label='Quartiles')
    # Distinct loop names so the ECDF arrays x and y are not rebound.
    for q_x, q_y in zip(x_p, y_p):
        ax.annotate('%s' % int(q_x), xy=(q_x, q_y), xytext=(10, -5),
                    textcoords='offset points')

    ax.set_xlabel('Days')
    ax.set_ylabel('ECDF')
    # Labels are attached to the artists above rather than passed here by
    # position, so the shaded spans cannot take over a legend entry.
    ax.legend(loc='lower right')
# Two side-by-side panels: the question's sample on the left, synthetic data on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Left panel: the original observations.
days = pd.DataFrame({"days": a})
generate_plot(axes[0], days)

# Right panel: a wide central cluster plus two tight clusters near the
# extremes, intended to produce values beyond the IQR fences.
central = np.random.normal(200, 50, 300)
low_cluster = np.random.normal(25, 10, 20)
high_cluster = np.random.normal(375, 10, 20)
b = np.concatenate([central, low_cluster, high_cluster])
np.random.shuffle(b)
generate_plot(axes[1], pd.DataFrame({"days": b}))

# Title each panel.
for panel, title in zip(axes, ('original data', 'fake data with outliers')):
    panel.set_title(title)

plt.show()
结果如下所示:
希望对您有所帮助。
我试图在我的可视化中突出显示 "outliers" 所在的 CDF 区域(可能是浅红色阴影以区分该区域)。
你能协助根据上面的定义对 "outlier" 所指的区域进行着色吗?出于某种原因,当我尝试查看异常值时,得到的是空输出:无论是 print(outliers_iqr(days)) 还是 print(str(outliers_iqr(days))[1:-1]),都只是打印 array([], dtype=int64)。
这是我当前的代码:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Sample observations; wrapped further down in a DataFrame column named "days".
a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105,
50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133,
95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105,
368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214,
192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50,
178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19,
118, 105, 92, 133, 77, 54, 72, 34]
# Empirical CDF helper
def ecdf(data):
    """Return the points of the empirical CDF of ``data``.

    ``data`` may be any numeric array-like (list, ndarray, single-column
    DataFrame, ...).  It is flattened to 1-D before sorting because
    ``np.sort`` sorts along the last axis, which would leave an ``(n, 1)``
    column completely unsorted.

    Returns:
        A tuple ``(x, y)`` of 1-D arrays: ``x`` holds the values in
        ascending order and ``y`` the cumulative fractions
        ``1/n, 2/n, ..., 1``.
    """
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y
# Outlier detection with the +-1.5 x IQR rule
def outliers_iqr(ys):
    """Locate the entries of ``ys`` that fall outside the 1.5 x IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``ys``.

    Args:
        ys: numeric array-like; converted with ``np.asarray`` so that plain
            lists can be compared against the fences as well.

    Returns:
        The result of ``np.where`` on the outlier mask: a tuple with one
        index array per dimension of ``ys``.  The arrays are empty when no
        value lies beyond either fence.
    """
    ys = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((ys > upper_bound) | (ys < lower_bound))
# Wrap the sample in a single-column DataFrame and compute its ECDF points.
days = pd.DataFrame({"days" : a})
x, y = ecdf(days)
plt.plot(x, y, marker='.', linestyle='none')
plt.axvline(x.mean(), color='gray', linestyle='dashed', linewidth=2) # dashed vertical line at the mean
x_m = int(x.mean())
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0; `.values` is the
# usual replacement - confirm against the pandas version in use.
y_m = stats.percentileofscore(days.as_matrix(), x.mean())/100.0
ax=plt.gca()
# Label the mean with "(value,percentile)".
ax.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m),
xytext=(10,-5), textcoords='offset points')
outliers= outliers_iqr(days)
print(outliers_iqr(days)) # print the outlier indices; the question reports this comes back empty
print(str(outliers_iqr(days))[1:-1]) # same result with the first and last characters of its string form stripped
# Highlight the outlier area in the CDF plot.
# The `?` arguments below are unfilled placeholders from the question, not valid Python.
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) # between 0 and 1st quartile
ax.fill_between(?, ?, ?, where=?, facecolor='red', alpha=0.3) # between 3rd quartile and 1
# Quartile markers: x is the 25th/50th/75th percentile value, y the matching fraction.
percentiles= np.array([25,50,75])
x_p = np.percentile(days, percentiles)
y_p = percentiles/100.0
plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # overlay quartiles
# NOTE(review): the annotate call below is the loop body; its indentation was lost in this paste.
# The loop also rebinds the names x and y used above for the ECDF arrays.
for x,y in zip(x_p, y_p):
ax.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')
plt.xlabel('Days')
plt.ylabel('ECDF')
plt.legend(('Days', "Mean", 'Quartiles'), loc='lower right')
plt.show()
如果您的异常值数组有时可能为空,则必须使用 if 语句来处理这种情况。此外,由于您只是想给图中的某些区域加上阴影,实际上可以使用 Axes.axvspan。下面是对原始示例稍作修改后的版本(把所有绘图命令放进了一个函数,并添加了第二个子图,其中的数据确实包含异常值):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Sample observations; wrapped further down in a DataFrame column named "days".
a = [389, 350, 130, 344, 392, 92, 51, 28, 309, 357, 64, 380, 332, 109, 284, 105,
50, 66, 156, 116, 75, 315, 155, 34, 155, 241, 320, 50, 97, 41, 274, 99, 133,
95, 306, 62, 187, 56, 110, 338, 102, 285, 386, 231, 238, 145, 216, 148, 105,
368, 176, 155, 106, 107, 36, 16, 28, 6, 322, 95, 122, 82, 64, 35, 72, 214,
192, 91, 117, 277, 101, 159, 96, 325, 79, 154, 314, 142, 147, 138, 48, 50,
178, 146, 224, 282, 141, 75, 151, 93, 135, 82, 125, 111, 49, 113, 165, 19,
118, 105, 92, 133, 77, 54, 72, 34]
# Empirical CDF helper
def ecdf(data):
    """Return the points of the empirical CDF of ``data``.

    ``data`` may be any numeric array-like (list, ndarray, single-column
    DataFrame, ...).  It is flattened to 1-D before sorting because
    ``np.sort`` sorts along the last axis, which would leave an ``(n, 1)``
    column completely unsorted.

    Returns:
        A tuple ``(x, y)`` of 1-D arrays: ``x`` holds the values in
        ascending order and ``y`` the cumulative fractions
        ``1/n, 2/n, ..., 1``.
    """
    x = np.sort(np.asarray(data).ravel())
    n = x.size
    y = np.arange(1.0, n + 1) / n
    return x, y
# Outlier detection with the +-1.5 x IQR rule
def outliers_iqr(ys):
    """Locate low and high outliers of ``ys`` using the 1.5 x IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``ys``.

    Args:
        ys: numeric array-like; converted with ``np.asarray`` so that plain
            lists can be compared against the fences as well.

    Returns:
        A pair ``(low, high)``.  Each element is the result of ``np.where``
        (a tuple with one index array per dimension of ``ys``): ``low``
        for values below the lower fence, ``high`` for values above the
        upper fence.  The index arrays are empty when there is no outlier
        on that side.
    """
    ys = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where(ys < lower_bound), np.where(ys > upper_bound)
def generate_plot(ax, df):
    """Draw the ECDF of ``df`` on ``ax`` and shade the outlier ranges.

    The plot contains the ECDF points, a dashed line at the mean (annotated
    with its value and percentile), red diamonds at the quartiles, and a red
    vertical span over each group of outliers reported by ``outliers_iqr``
    (one for the low side, one for the high side).  A side without outliers
    is left unshaded.

    Args:
        ax: matplotlib Axes to draw on.
        df: DataFrame holding the sample; its ``.values`` are used.
    """
    values = df.values
    x, y = ecdf(df)
    ax.plot(x, y, marker='.', linestyle='none', label='Days')

    # Dashed vertical line at the mean, annotated as "(value,percentile)".
    mean = x.mean()
    ax.axvline(mean, color='gray', linestyle='dashed', linewidth=2,
               label='Mean')
    x_m = int(mean)
    # `.values` replaces DataFrame.as_matrix(), which current pandas no longer
    # provides.  The sample is flattened so percentileofscore gets a 1-D
    # array and yields a single number for the scalar score.
    y_m = stats.percentileofscore(values.ravel(), mean) / 100.0
    ax.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m),
                xytext=(10, -5), textcoords='offset points')

    # Highlight the outlier area in the CDF plot; an empty index set means
    # there is nothing to shade on that side.
    for outl in outliers_iqr(values):
        vals = values[outl]
        if vals.size > 0:
            ax.axvspan(np.min(vals), np.max(vals), alpha=0.5, color='red')

    # Overlay the quartiles.
    percentiles = np.array([25, 50, 75])
    x_p = np.percentile(df, percentiles)
    y_p = percentiles / 100.0
    ax.plot(x_p, y_p, marker='D', color='red', linestyle='none',
            label='Quartiles')
    # Distinct loop names so the ECDF arrays x and y are not rebound.
    for q_x, q_y in zip(x_p, y_p):
        ax.annotate('%s' % int(q_x), xy=(q_x, q_y), xytext=(10, -5),
                    textcoords='offset points')

    ax.set_xlabel('Days')
    ax.set_ylabel('ECDF')
    # Labels are attached to the artists above rather than passed here by
    # position, so the shaded spans cannot take over a legend entry.
    ax.legend(loc='lower right')
# Two side-by-side panels: the question's sample on the left, synthetic data on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Left panel: the original observations.
days = pd.DataFrame({"days": a})
generate_plot(axes[0], days)

# Right panel: a wide central cluster plus two tight clusters near the
# extremes, intended to produce values beyond the IQR fences.
central = np.random.normal(200, 50, 300)
low_cluster = np.random.normal(25, 10, 20)
high_cluster = np.random.normal(375, 10, 20)
b = np.concatenate([central, low_cluster, high_cluster])
np.random.shuffle(b)
generate_plot(axes[1], pd.DataFrame({"days": b}))

# Title each panel.
for panel, title in zip(axes, ('original data', 'fake data with outliers')):
    panel.set_title(title)

plt.show()
结果如下所示:
希望对您有所帮助。