如何找到异常值最多的一天

How to find a day with maximum outliers

我有包含几天数据的时间序列。我需要找到异常值最多的一天并仅绘制这一天的数据。

我是怎么做的:

# Generate sample data --------------------------------------------------------
# NOTE(review): tibble() here and ggplot() further down are called without a
# visible library() call -- presumably tibble/ggplot2 (or tidyverse) are
# attached before this script runs; confirm.

# Set every locale category (LC_ALL) to "English".
Sys.setlocale("LC_ALL","English")

# Timestamps: one reading every `interval` seconds, from `start` until
# 4 days and 5 hours later.
start <- as.POSIXct("2012-01-15 06:10:00")
interval <- 15
end <- start + as.difftime(4, units = "days") + as.difftime(5, units = "hours")

DateTimes <- seq(from = start, by = interval, to = end)

# One Poisson-distributed reading per timestamp. The length is taken from
# DateTimes instead of a hard-coded count so both columns always line up.
# (An earlier uniform draw via sample() was removed: it was overwritten
# immediately and never used.)
Values <- rpois(length(DateTimes), lambda = 75)

cpu_df <- tibble(datetime = DateTimes, Value = Values)


# ---- Flag and plot the outliers of the whole series ----

# A reading counts as an outlier when it lies strictly above the 0.975
# quantile of all readings.
upper_bound <- quantile(cpu_df[["Value"]], probs = 0.975)
outlier_ind <- which(cpu_df[["Value"]] > upper_bound)
cpu_df_susp <- cpu_df[outlier_ind, ]

# All readings in green, outliers over-plotted in red, dashed red line at the
# threshold.
alldays_title <- paste0(
  "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
  round(upper_bound, 2)
)

alldays_plot <- ggplot(cpu_df, aes(x = datetime, y = Value)) +
  geom_point(color = "darkgreen", size = 0.9) +
  geom_point(data = cpu_df_susp, color = "red", size = 1) +
  geom_hline(yintercept = upper_bound, color = "red", linetype = "dashed") +
  theme_bw() +
  labs(x = "", title = alldays_title)


# ---- Convert to xts and cut the series into days ----
suppressMessages(library(xts))

# Values become the series data, timestamps become the xts index.
cpu_df_xts <- xts(cpu_df[["Value"]], order.by = cpu_df[["datetime"]])

# `days` is a list of xts pieces produced by splitting on "days".
days <- split(cpu_df_xts, f = "days")


# ---- Find the worst day: the day with the largest number of outliers ----
# Within each day, an outlier is a reading strictly above that day's own
# 0.975 quantile.

stopifnot(length(days) > 0)

# Threshold of every day.
day_upper_bounds <- vapply(
  days,
  function(day) unname(quantile(coredata(day), 0.975)),
  numeric(1)
)

# Positions of the outliers inside every day's piece.
day_outlier_inds <- Map(
  function(day, bound) which(coredata(day) > bound),
  days,
  day_upper_bounds
)
day_outlier_counts <- lengths(day_outlier_inds)

# which.max() returns the first maximum, so ties go to the earliest day; it
# also yields a valid index when no day has any outlier at all.
worstday_index <- which.max(day_outlier_counts)
outliers_number <- day_outlier_counts[[worstday_index]]
worst_day_outliers_ind <- day_outlier_inds[[worstday_index]]

# Leave `upper_bound` holding the worst day's threshold, so that code further
# down that reads `upper_bound` describes the selected day.
upper_bound <- day_upper_bounds[[worstday_index]]

WorstDay <- days[[worstday_index]]

# ---- Outliers of the worst day, converted back to tibbles for ggplot2 ----

# worst_day_outliers_ind holds row positions within WorstDay.
worst_day_outliers <- WorstDay[worst_day_outliers_ind, ]

# coredata() hands back the series data as a one-column matrix; as.vector()
# flattens it so that `Value` is a plain vector column rather than a matrix
# column.
WorstDayTibble <- tibble(
  datetime = index(WorstDay),
  Value = as.vector(coredata(WorstDay))
)

outliersTibble <- tibble(
  datetime = index(worst_day_outliers),
  Value = as.vector(coredata(worst_day_outliers))
)

# ---- Plot the worst day ----

# Threshold for this plot: the 0.975 quantile of the worst day's own readings.
# It is recomputed from the plotted data so the dashed line and the title
# always describe this day, regardless of what `upper_bound` currently holds.
worst_day_upper_bound <- quantile(WorstDayTibble$Value, 0.975)

# All readings of the day in green, its outliers over-plotted in red, dashed
# red line at the day's threshold.
worstDay_Plot <- ggplot(data = WorstDayTibble, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = outliersTibble, color = "red", size = 1) +
  geom_hline(
    yintercept = worst_day_upper_bound,
    linetype = "dashed",
    color = "red"
  ) +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(worst_day_upper_bound, 2)
    )
  )


# Show the all-days plot and the worst-day plot together.
library(ggpubr)
ggpubr::ggarrange(alldays_plot, worstDay_Plot)

结果如下:

我对自己的代码不满意:为了把数据按天拆分并在每一天中查找异常值,我必须先将其转换为 xts;而要用 ggplot2 绘图,又必须把数据转换回 tibble。是否可以避免这种双重转换,让代码更简单?

您不需要将数据转换为 xts 再转换回来。将数据保留在 data frame/tibble 中,您可以使用 dplyr:

获得最糟糕的一天
library(dplyr)

# Add a calendar-date column. format() renders each timestamp in the
# timestamp's own time zone, whereas as.Date() on a POSIXct defaults to UTC
# and can therefore assign readings near midnight to the neighbouring day.
cpu_df <- cpu_df %>%
  mutate(date = as.Date(format(datetime, "%Y-%m-%d")))

# For each date, count the values above that date's 0.975 quantile, keep the
# date with the highest count (which.max(): the first one on ties), then join
# the original rows back so WorstDay holds every reading of that date.
WorstDay <- cpu_df %>%
  group_by(date) %>%
  summarise(n = sum(Value > quantile(Value, 0.975))) %>%
  slice(which.max(n)) %>%
  left_join(cpu_df, by = "date")

您可以使用此数据进行绘图。