如何用最接近的值替换离群值,如Matlab中的filloutlier函数?
How to replace outlier values with nearest value, like the filloutlier function in Matlab?
我需要复制 Matlab 中存在的 filloutliers(someList,'nearest','mean') 函数。
我有以下代码,大部分工作正常。但是,当我给它数据集时,它会替换错误的值。它将 453.675231 替换为 0 而不是 -211.71818100000002。我曾尝试以多种不同的方式更改 compareNeighbors
函数,但老实说,此时我完全不知道该怎么做。
我将添加数据,这样您只需复制并粘贴它就可以了。如果我将 compareNeighbors
函数中的 < 切换为 > 适用于此示例,但不适用于其他示例。
import numpy as np
from math import sqrt
from statistics import stdev as std
def compareNeighbors(before, current, after):
valBefore = (before - current)
valAfter = (after - current)
print(valBefore)
print(valAfter)
return(valBefore < valAfter)
def findNearestValue(data, before, current, after):
before = before if before > -1 else 0
after = after if after < len(data) else len(data) - 1
valBefore = data[before] if before != current else 10000000000
valAfter = data[after] if after != current else 10000000000
return valBefore if compareNeighbors(valBefore, valAfter, data[current]) else valAfter
def getOutlierLists(data, distance):
outlierList = []
outlierList.extend(data[data > distance].tolist())
outlierList.extend(data[data < -distance].tolist())
outlierListIndecies = [i for i, j in enumerate(data) if j in outlierList]
return(outlierList, outlierListIndecies)
def filloutliers(data):
stad = std(data)
mean = np.mean(data)
distance = 3*stad + mean
(outlierList, outlierListIndecies) = getOutlierLists(data, distance)
print(outlierList, " | ", outlierListIndecies, " | ", distance, " | ", mean)
for i in range(len(outlierList)):
data[outlierListIndecies[i]] = findNearestValue(data, outlierListIndecies[i] - 1, outlierListIndecies[i], outlierListIndecies[i] + 1)
(outlierList, outlierListIndecies) = getOutlierLists(data, distance)
if(len(outlierList) != 0):
for i in reversed(range(len(outlierList))):
data[outlierListIndecies[i]] = findNearestValue(data, outlierListIndecies[i] - 1, outlierListIndecies[i], outlierListIndecies[i] + 1)
return data
Outlier value: [453.675231]
Position in array: [46]
Max value
for after which a value is an outlier: +/-415.67922821410116
Mean: 99.86239028000001
Input data:
[0.0, 195.47146400000003, 0.0, 143.1795457,
19.7727047, 0.0, 37.9259413, 67.4346233, 175.714837, 140.72522700000002, 42.116339999999994, 0.0, 11.829232000000005, 0.0, 225.20435399999997, 25.939856999999996, 9.875561000000005, 0.0, 30.22819100000001, 141.658386, 191.42069600000002, 182.451406, 188.27667599999998, 0.0, 192.48585400000002, 0.0, 79.817566, 94.469158, 97.0669257, 153.0584423, 87.5491337, 0.0, 87.5491337, 0.0, 377.6008777, 176.6662877, 397.683778, 82.18773, 136.917358, 79.201378, 57.71598, 1.795560000000009, 1.795560000000009, 19.405960000000007, 135.51628, 0.0, 453.675231, 211.71818100000002, 109.460083, 13.761809999999997, 0.0, 114.462883, 7.609375, 159.630814, 9.943822999999998, 0.0, 93.460329, 55.87061700000001, 46.083324000000005, 58.686195999999995, 18.636627, 0.0, 22.810349000000002, 144.659505, 0.0, 267.669085, 290.303405, 110.52316300000001, 52.656178, 110.52316300000001, 52.656178, 123.26508600000001, 61.89890700000001, 158.23855600000002, 194.428161, 181.365445, 264.36523, 0.0, 274.60668, 48.543030000000016, 308.51727600000004, 357.209626, 24.18412, 46.621155, 70.805275, 181.781889, 364.741453, 0.0, 143.62354900000003, 0.0, 4.201691000000004, 0.0, 0.0, 0.0, 135.2808976, 87.3988186, 216.920091, 84.215256, 161.518512, 0.0]
Output data:
[0.0, 195.47146400000003, 0.0, 143.1795457,
19.7727047, 0.0, 37.9259413, 67.4346233, 175.714837, 140.72522700000002, 42.116339999999994, 0.0, 11.829232000000005, 0.0, 225.20435399999997, 25.939856999999996, 9.875561000000005, 0.0, 30.22819100000001, 141.658386, 191.42069600000002, 182.451406, 188.27667599999998, 0.0, 192.48585400000002, 0.0, 79.817566, 94.469158, 97.0669257, 153.0584423, 87.5491337, 0.0, 87.5491337, 0.0, 377.6008777, 176.6662877, 397.683778, 82.18773, 136.917358, 79.201378, 57.71598, 1.795560000000009, 1.795560000000009, 19.405960000000007, 135.51628, 0.0, 0.0, 211.71818100000002, 109.460083, 13.761809999999997, 0.0, 114.462883, 7.609375, 159.630814, 9.943822999999998, 0.0, 93.460329, 55.87061700000001, 46.083324000000005, 58.686195999999995, 18.636627, 0.0, 22.810349000000002, 144.659505, 0.0, 267.669085, 290.303405, 110.52316300000001, 52.656178, 110.52316300000001, 52.656178, 123.26508600000001, 61.89890700000001, 158.23855600000002, 194.428161, 181.365445, 264.36523, 0.0, 274.60668, 48.543030000000016, 308.51727600000004, 357.209626, 24.18412, 46.621155, 70.805275, 181.781889, 364.741453, 0.0, 143.62354900000003, 0.0, 4.201691000000004, 0.0, 0.0, 0.0, 135.2808976, 87.3988186, 216.920091, 84.215256, 161.518512, 0.0]
这仅适用于您需要用距均值 3 个标准差的最接近值填充异常值的特定用例。
import numpy as np
from math import sqrt
from statistics import stdev as std
def isNotOutlier(point, upper, lower):
return (point < upper and point > lower)
def findNearestValue(data, before, current, after, threshAbove, threshBelow):
before = before if before > -1 else 0
after = after if after < len(data) else len(data) - 1
while(True):
if(after < len(data) and isNotOutlier(data[after],threshAbove,threshBelow)):
return data[after]
after += 1
if(before >= 0 and isNotOutlier(data[before],threshAbove,threshBelow)):
return data[before]
before -= 1
def getOutlierLists(data, distancePos, distanceNeg):
outlierList = []
outlierList.extend(data[data > distancePos].tolist())
outlierList.extend(data[data < distanceNeg].tolist())
outlierListIndecies = [i for i, j in enumerate(data) if j in outlierList]
return(outlierList, outlierListIndecies)
def filloutliers(data):
stad = std(data)
mean = np.mean(data)
distancePos = 3*stad + mean
distanceNeg = (-3*stad) + mean
(outlierList, outlierListIndecies) = getOutlierLists(data, distancePos, distanceNeg)
toReplace =[]
for i in range(len(outlierList)):
toReplace.append(findNearestValue(data, outlierListIndecies[i] - 1, outlierListIndecies[i], outlierListIndecies[i] + 1, distancePos, distanceNeg))
for i in range(len(outlierListIndecies)):
data[outlierListIndecies[i]] = toReplace[i]
return data
我需要复制 Matlab 中存在的 filloutliers(someList,'nearest','mean') 函数。
我有以下代码,大部分工作正常。但是,当我给它数据集时,它会替换错误的值。它将 453.675231 替换为 0 而不是 -211.71818100000002。我曾尝试以多种不同的方式更改 compareNeighbors
函数,但老实说,此时我完全不知道该怎么做。
我将添加数据,这样您只需复制并粘贴它就可以了。如果我将 compareNeighbors
函数中的 < 切换为 > 适用于此示例,但不适用于其他示例。
import numpy as np
from math import sqrt
from statistics import stdev as std
def compareNeighbors(before, current, after):
valBefore = (before - current)
valAfter = (after - current)
print(valBefore)
print(valAfter)
return(valBefore < valAfter)
def findNearestValue(data, before, current, after):
before = before if before > -1 else 0
after = after if after < len(data) else len(data) - 1
valBefore = data[before] if before != current else 10000000000
valAfter = data[after] if after != current else 10000000000
return valBefore if compareNeighbors(valBefore, valAfter, data[current]) else valAfter
def getOutlierLists(data, distance):
outlierList = []
outlierList.extend(data[data > distance].tolist())
outlierList.extend(data[data < -distance].tolist())
outlierListIndecies = [i for i, j in enumerate(data) if j in outlierList]
return(outlierList, outlierListIndecies)
def filloutliers(data):
stad = std(data)
mean = np.mean(data)
distance = 3*stad + mean
(outlierList, outlierListIndecies) = getOutlierLists(data, distance)
print(outlierList, " | ", outlierListIndecies, " | ", distance, " | ", mean)
for i in range(len(outlierList)):
data[outlierListIndecies[i]] = findNearestValue(data, outlierListIndecies[i] - 1, outlierListIndecies[i], outlierListIndecies[i] + 1)
(outlierList, outlierListIndecies) = getOutlierLists(data, distance)
if(len(outlierList) != 0):
for i in reversed(range(len(outlierList))):
data[outlierListIndecies[i]] = findNearestValue(data, outlierListIndecies[i] - 1, outlierListIndecies[i], outlierListIndecies[i] + 1)
return data
Outlier value: [453.675231]
Position in array: [46]
Max value for after which a value is an outlier: +/-415.67922821410116
Mean: 99.86239028000001Input data:
[0.0, 195.47146400000003, 0.0, 143.1795457, 19.7727047, 0.0, 37.9259413, 67.4346233, 175.714837, 140.72522700000002, 42.116339999999994, 0.0, 11.829232000000005, 0.0, 225.20435399999997, 25.939856999999996, 9.875561000000005, 0.0, 30.22819100000001, 141.658386, 191.42069600000002, 182.451406, 188.27667599999998, 0.0, 192.48585400000002, 0.0, 79.817566, 94.469158, 97.0669257, 153.0584423, 87.5491337, 0.0, 87.5491337, 0.0, 377.6008777, 176.6662877, 397.683778, 82.18773, 136.917358, 79.201378, 57.71598, 1.795560000000009, 1.795560000000009, 19.405960000000007, 135.51628, 0.0, 453.675231, 211.71818100000002, 109.460083, 13.761809999999997, 0.0, 114.462883, 7.609375, 159.630814, 9.943822999999998, 0.0, 93.460329, 55.87061700000001, 46.083324000000005, 58.686195999999995, 18.636627, 0.0, 22.810349000000002, 144.659505, 0.0, 267.669085, 290.303405, 110.52316300000001, 52.656178, 110.52316300000001, 52.656178, 123.26508600000001, 61.89890700000001, 158.23855600000002, 194.428161, 181.365445, 264.36523, 0.0, 274.60668, 48.543030000000016, 308.51727600000004, 357.209626, 24.18412, 46.621155, 70.805275, 181.781889, 364.741453, 0.0, 143.62354900000003, 0.0, 4.201691000000004, 0.0, 0.0, 0.0, 135.2808976, 87.3988186, 216.920091, 84.215256, 161.518512, 0.0]Output data:
[0.0, 195.47146400000003, 0.0, 143.1795457, 19.7727047, 0.0, 37.9259413, 67.4346233, 175.714837, 140.72522700000002, 42.116339999999994, 0.0, 11.829232000000005, 0.0, 225.20435399999997, 25.939856999999996, 9.875561000000005, 0.0, 30.22819100000001, 141.658386, 191.42069600000002, 182.451406, 188.27667599999998, 0.0, 192.48585400000002, 0.0, 79.817566, 94.469158, 97.0669257, 153.0584423, 87.5491337, 0.0, 87.5491337, 0.0, 377.6008777, 176.6662877, 397.683778, 82.18773, 136.917358, 79.201378, 57.71598, 1.795560000000009, 1.795560000000009, 19.405960000000007, 135.51628, 0.0, 0.0, 211.71818100000002, 109.460083, 13.761809999999997, 0.0, 114.462883, 7.609375, 159.630814, 9.943822999999998, 0.0, 93.460329, 55.87061700000001, 46.083324000000005, 58.686195999999995, 18.636627, 0.0, 22.810349000000002, 144.659505, 0.0, 267.669085, 290.303405, 110.52316300000001, 52.656178, 110.52316300000001, 52.656178, 123.26508600000001, 61.89890700000001, 158.23855600000002, 194.428161, 181.365445, 264.36523, 0.0, 274.60668, 48.543030000000016, 308.51727600000004, 357.209626, 24.18412, 46.621155, 70.805275, 181.781889, 364.741453, 0.0, 143.62354900000003, 0.0, 4.201691000000004, 0.0, 0.0, 0.0, 135.2808976, 87.3988186, 216.920091, 84.215256, 161.518512, 0.0]
这仅适用于您需要用距均值 3 个标准差的最接近值填充异常值的特定用例。
import numpy as np
from math import sqrt
from statistics import stdev as std
def isNotOutlier(point, upper, lower):
return (point < upper and point > lower)
def findNearestValue(data, before, current, after, threshAbove, threshBelow):
before = before if before > -1 else 0
after = after if after < len(data) else len(data) - 1
while(True):
if(after < len(data) and isNotOutlier(data[after],threshAbove,threshBelow)):
return data[after]
after += 1
if(before >= 0 and isNotOutlier(data[before],threshAbove,threshBelow)):
return data[before]
before -= 1
def getOutlierLists(data, distancePos, distanceNeg):
outlierList = []
outlierList.extend(data[data > distancePos].tolist())
outlierList.extend(data[data < distanceNeg].tolist())
outlierListIndecies = [i for i, j in enumerate(data) if j in outlierList]
return(outlierList, outlierListIndecies)
def filloutliers(data):
stad = std(data)
mean = np.mean(data)
distancePos = 3*stad + mean
distanceNeg = (-3*stad) + mean
(outlierList, outlierListIndecies) = getOutlierLists(data, distancePos, distanceNeg)
toReplace =[]
for i in range(len(outlierList)):
toReplace.append(findNearestValue(data, outlierListIndecies[i] - 1, outlierListIndecies[i], outlierListIndecies[i] + 1, distancePos, distanceNeg))
for i in range(len(outlierListIndecies)):
data[outlierListIndecies[i]] = toReplace[i]
return data