如何删除线性散点图上的异常值
How to delete outliers on a linear scatter plot
import matplotlib.pyplot as plt
import pandas as pd
import math
我想从数据中删除落在绿线之外的异常值。我的实际数据存放在一个 DataFrame 中,并且包含很多异常值。另外,如果绿线之间的宽度能随 x 值增大而相应放宽,就更好了。
# Minimal reproduction: scatter the sample points, fit a straight line through
# them, and draw a fixed band (fit +/- 1); points outside the band are the
# outliers the question wants to remove.
# numpy is imported here because the snippet calls np.polyfit / np.poly1d but
# its import header never brings numpy into scope (NameError otherwise).
import numpy as np

x = [1, 1.1, 2, 3, 4, 5, 5.5, 6, 7, 8, 9, 10, 10.10]
y = [1, 0.1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 16]
df = pd.DataFrame(list(zip(x, y)),
                  columns=['x_vals', 'y_vals'])
plt.scatter(df.x_vals, df.y_vals)

# Degree-1 polynomial fit; p evaluates the fitted line at any x.
z = np.polyfit(df.x_vals, df.y_vals, 1)
p = np.poly1d(z)

# NOTE: z is rebound here from the fit coefficients to the plotted line handles.
z = plt.plot(df.x_vals, p(df.x_vals), "r--")
# Green dashed lines one unit above and one unit below the fitted line.
z_1 = plt.plot(df.x_vals, p(df.x_vals) + 1, "g--")
z_2 = plt.plot(df.x_vals, p(df.x_vals) - 1, "g--")
plt.show()
您可以将 y 值大于上限或小于下限的点选取为异常值。
以下代码示例用紫色圆圈标出保留下来的点,并用叉号标记异常值。
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
###import math
# Worked example: fit a line, shade a band of half-width `delta` around it,
# then split the points into those inside the band and those outside it.
x = [1, 1.1, 2, 3, 4, 5, 5.5, 6, 7, 8, 9, 10, 10.10]
y = [1, 0.1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 16]
df = pd.DataFrame({'x_vals': x, 'y_vals': y})
plt.scatter(df.x_vals, df.y_vals)

z = np.polyfit(df.x_vals, df.y_vals, 1)
p = np.poly1d(z)
delta = 1

# Evaluate the fitted line once and derive both band edges from it.
fitted = p(df.x_vals)
lower = fitted - delta
upper = fitted + delta

# NOTE: z is rebound here from the fit coefficients to the plotted line handles.
z = plt.plot(df.x_vals, fitted, "r--")
z_1 = plt.plot(df.x_vals, upper, "g--")
z_2 = plt.plot(df.x_vals, lower, "g--")
plt.fill_between(df.x_vals, lower, upper, color='g', alpha=0.1)

# Outliers lie strictly outside the band; points on the band edge are kept.
below_band = df.y_vals < lower
above_band = df.y_vals > upper
outliers = df[below_band | above_band]
cleaned = df[df.y_vals.between(lower, upper)]

plt.scatter(cleaned.x_vals, cleaned.y_vals, s=100, fc='none', ec='purple')
plt.scatter(outliers.x_vals, outliers.y_vals, marker='x', s=200, fc='none', ec='r')
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
import math
我想从数据中删除落在绿线之外的异常值。我的实际数据存放在一个 DataFrame 中,并且包含很多异常值。另外,如果绿线之间的宽度能随 x 值增大而相应放宽,就更好了。
# Minimal reproduction: scatter the sample points, fit a straight line through
# them, and draw a fixed band (fit +/- 1); points outside the band are the
# outliers the question wants to remove.
# numpy is imported here because the snippet calls np.polyfit / np.poly1d but
# its import header never brings numpy into scope (NameError otherwise).
import numpy as np

x = [1, 1.1, 2, 3, 4, 5, 5.5, 6, 7, 8, 9, 10, 10.10]
y = [1, 0.1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 16]
df = pd.DataFrame(list(zip(x, y)),
                  columns=['x_vals', 'y_vals'])
plt.scatter(df.x_vals, df.y_vals)

# Degree-1 polynomial fit; p evaluates the fitted line at any x.
z = np.polyfit(df.x_vals, df.y_vals, 1)
p = np.poly1d(z)

# NOTE: z is rebound here from the fit coefficients to the plotted line handles.
z = plt.plot(df.x_vals, p(df.x_vals), "r--")
# Green dashed lines one unit above and one unit below the fitted line.
z_1 = plt.plot(df.x_vals, p(df.x_vals) + 1, "g--")
z_2 = plt.plot(df.x_vals, p(df.x_vals) - 1, "g--")
plt.show()
您可以将 y 值大于上限或小于下限的点选取为异常值。
以下代码示例用紫色圆圈标出保留下来的点,并用叉号标记异常值。
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
###import math
# Worked example: fit a line, shade a band of half-width `delta` around it,
# then split the points into those inside the band and those outside it.
x = [1, 1.1, 2, 3, 4, 5, 5.5, 6, 7, 8, 9, 10, 10.10]
y = [1, 0.1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 16]
df = pd.DataFrame({'x_vals': x, 'y_vals': y})
plt.scatter(df.x_vals, df.y_vals)

z = np.polyfit(df.x_vals, df.y_vals, 1)
p = np.poly1d(z)
delta = 1

# Evaluate the fitted line once and derive both band edges from it.
fitted = p(df.x_vals)
lower = fitted - delta
upper = fitted + delta

# NOTE: z is rebound here from the fit coefficients to the plotted line handles.
z = plt.plot(df.x_vals, fitted, "r--")
z_1 = plt.plot(df.x_vals, upper, "g--")
z_2 = plt.plot(df.x_vals, lower, "g--")
plt.fill_between(df.x_vals, lower, upper, color='g', alpha=0.1)

# Outliers lie strictly outside the band; points on the band edge are kept.
below_band = df.y_vals < lower
above_band = df.y_vals > upper
outliers = df[below_band | above_band]
cleaned = df[df.y_vals.between(lower, upper)]

plt.scatter(cleaned.x_vals, cleaned.y_vals, s=100, fc='none', ec='purple')
plt.scatter(outliers.x_vals, outliers.y_vals, marker='x', s=200, fc='none', ec='r')
plt.show()