如何删除线性散点图上的异常值

How to delete outliers on linear scatter plot

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

我想从数据中删除落在两条绿线之外的异常值。我的实际数据存放在一个数据框(DataFrame)中,并且包含很多异常值。另外,如果两条绿线之间的宽度能够随 x 值的增大而按比例变宽,那就更好了。

# Sample data: most points lie on the line y = x, a few lie far away from it.
x = [1, 1.1, 2, 3, 4, 5, 5.5, 6, 7, 8, 9, 10, 10.10]
y = [1, 0.1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 16]

df = pd.DataFrame(list(zip(x, y)),
                  columns=['x_vals', 'y_vals'])

plt.scatter(df.x_vals, df.y_vals)

# Degree-1 least-squares fit through all points (outliers included).
# The coefficients get their own name so they are not overwritten by the
# plot handles further down.
coeffs = np.polyfit(df.x_vals, df.y_vals, 1)
p = np.poly1d(coeffs)

# Evaluate the fitted line once and reuse it for all three curves.
fitted = p(df.x_vals)

# Half-width of the acceptance band drawn around the fitted line.
delta = 1

plt.plot(df.x_vals, fitted, "r--")          # fitted line
plt.plot(df.x_vals, fitted + delta, "g--")  # upper bound
plt.plot(df.x_vals, fitted - delta, "g--")  # lower bound

plt.show()

您可以把 y 值大于上限或小于下限的点筛选出来,作为异常值。

以下代码示例用紫色圆圈标出保留下来的点,并用红色叉号标记异常值。

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

###import math

# Sample data: most points lie on the line y = x, a few lie far away from it.
x = [1, 1.1, 2, 3, 4, 5, 5.5, 6, 7, 8, 9, 10, 10.10]
y = [1, 0.1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 16]

df = pd.DataFrame({'x_vals': x, 'y_vals': y})
xs, ys = df.x_vals, df.y_vals

# All points, in the default colour.
plt.scatter(xs, ys)

# Degree-1 least-squares fit, evaluated once at every x position.
coeffs = np.polyfit(xs, ys, 1)
p = np.poly1d(coeffs)
fitted = p(xs)

# Acceptance band: the fitted line shifted down and up by `delta`.
delta = 1
lower = fitted - delta
upper = fitted + delta

z = plt.plot(xs, fitted, "r--")
z_1 = plt.plot(xs, upper, "g--")
z_2 = plt.plot(xs, lower, "g--")
plt.fill_between(xs, lower, upper, color='g', alpha=0.1)

# A point is an outlier when it lies strictly below or strictly above the
# band; points exactly on either edge are kept.
too_low = ys < lower
too_high = ys > upper
outliers = df[too_low | too_high]
cleaned = df[(ys >= lower) & (ys <= upper)]

# Kept points get a purple ring, outliers get a large cross.
plt.scatter(cleaned.x_vals, cleaned.y_vals, s=100, fc='none', ec='purple')
plt.scatter(outliers.x_vals, outliers.y_vals, marker='x', s=200, fc='none', ec='r')

plt.show()