功能:apply in apply,去除异常值

Function: sapply in apply, removing outliers

我正在开发一个函数,该函数将根据 3 西格玛规则消除给定数据集中的异常值。我的代码如下所示。 "data"是待处理的数据集。

rm.outlier <- function(data){

  apply(data, 2, function(var) {
      sigma3.plus <- mean(var) + 3 * sd(var) 
      sigma3.min <- mean(var) - 3 * sd(var)
      sapply(var, function(y) {
        if (y > sigma3.plus){
          y <- sigma3.plus
        } else if (y < sigma3.min){
          y <- sigma3.min
        } else {y <- y}
      })
    })
    as.data.frame(data)
}

为了检查函数是否有效我写了一个简短的测试:

set.seed(123)
a <- data.frame("var1" = rnorm(10000, 0, 1))
b <- a
sum(a$var1 > mean(a$var1) + 3 * sd(a$var1)) # number of outliers in a

结果,我得到:

[1] 12

所以数据框a中的变量var1有12个异常值。接下来,我尝试在这个对象上应用我的函数:

a2 <- rm.outlier(a)
sum(b$var1 - a2$var1)

不幸的是,它给出了 0,这清楚地表明某些东西不起作用。我已经算出sapply的实现是正确的,所以肯定是我的apply出错了。任何帮助,将不胜感激。

您似乎只是忘记将 apply 函数的结果分配给新的数据框。 (将第 3 行与您的代码进行比较)

rm.outlier <- function(data){

  # Assign the result to a new dataframe
  data_new <- apply(data, 2, function(var) {
    sigma3.plus <- mean(var) + 3 * sd(var) 
    sigma3.min <- mean(var) - 3 * sd(var)
    sapply(var, function(y) {
      if (y > sigma3.plus){
        y <- sigma3.plus
      } else if (y < sigma3.min){
        y <- sigma3.min
      } else {y <- y}
    })
  })

  # Print the new dataframe
  as.data.frame(data_new)
}

set.seed(123)
a <- data.frame("var1" = rnorm(10000, 0, 1))
sum(a$var1 > mean(a$var1) + 3 * sd(a$var1)) # number of too big outliers
# 15
sum(a$var1 < mean(a$var1) - 3 * sd(a$var1)) # number of too small outliers
# 13
# Overall 28 outliers

# Check the function for the number of outliers
a2 <- rm.outlier(a)
sum(a2$var1 == a$var1) - length(a$var1)

如果运行时间对您很重要,您可以考虑另一种方法。您可以矢量化此过滤,例如通过使用 pminpmax,它们同样具有可读性并且速度提高了 15 倍以上。如果你喜欢它更复杂一点,你可以使用 findInterval 并获得更快的速度:

rm.outlier2 <- function(x) {
  ## calculate -3/3 * sigma borders
  s <- mean(x) + c(-3, 3) * sd(x)
  pmin(pmax(x, s[1]), s[2])
}

rm.outlier3 <- function(x) {
  ## calculate -3/3 * sigma borders
  s <- mean(x) + c(-3, 3) * sd(x)
  ## sorts x into intervals 0 == left of s[1], 2 == right of s[2], 1
  ## between both s
  i <- findInterval(x, s)
  ## which values are left/right of the interval
  j <- which(i != 1L)
  ## add a value between s to directly use output of findInterval for subsetting
  s2 <- c(s[1], 0, s[2])
  ## replace all values that are left/right of the interval
  x[j] <- s2[i[j] + 1L]
  x
}

基准测试:

## slightly modified OP version
rm.outlier <- function(x) {
  sigma3 <- mean(x) + c(-3,3) * sd(x)
  sapply(x, function(y) {
    if (y > sigma3[2]){
      y <- sigma3[2]
    } else if (y < sigma3[1]){
      y <- sigma3[1]
    } else {y <- y}
  })
}

set.seed(123)
a <- rnorm(10000, 0, 1)

# check output
all.equal(rm.outlier(a), rm.outlier2(a))
all.equal(rm.outlier2(a), rm.outlier3(a))

library("rbenchmark")

benchmark(rm.outlier(a), rm.outlier2(a), rm.outlier3(a),
          order = "relative",
          columns = c("test", "replications", "elapsed", "relative"))
#            test replications elapsed relative
#3 rm.outlier3(a)          100   0.028    1.000
#2 rm.outlier2(a)          100   0.102    3.643
#1  rm.outlier(a)          100   1.825   65.179