如何找到异常值最多的一天
How to find a day with maximum outliers
我有包含几天数据的时间序列。我需要找到异常值最多的一天并仅绘制这一天的数据。
我是怎么做的:
# Generate sample data ---------------------------------------------------------
# tibble() is used below, so load the package that provides it.
library(tibble)

# NOTE(review): "English" looks like a Windows-style locale name; on other
# platforms this call may only emit a warning - confirm for your OS.
Sys.setlocale("LC_ALL", "English")

# One timestamp every 15 seconds, covering 4 days and 5 hours.
start <- as.POSIXct("2012-01-15 06:10:00")
interval <- 15
end <- start + as.difftime(4, units = "days") + as.difftime(5, units = "hours")
DateTimes <- seq(from = start, by = interval, to = end)

# Poisson-distributed readings, one per timestamp. The sample size is taken
# from the timestamp vector (24241 elements for the range above) so the two
# columns can never get out of sync. The earlier uniform sample() draw was
# overwritten immediately and has been dropped.
Values <- rpois(length(DateTimes), lambda = 75)

cpu_df <- tibble(datetime = DateTimes, Value = Values)
# All days: flag and plot outliers ---------------------------------------------
# The threshold is the 0.975 quantile of every reading, across all days.
upper_bound <- quantile(cpu_df$Value, probs = 0.975)

# Row positions above the threshold, and the matching rows.
outlier_ind <- which(cpu_df$Value > upper_bound)
cpu_df_susp <- cpu_df[outlier_ind, ]

# All readings in green, outliers over-plotted in red, threshold as a dashed
# horizontal line.
alldays_plot <- ggplot(cpu_df, aes(x = datetime, y = Value)) +
  geom_point(color = "darkgreen", size = 0.9) +
  geom_point(data = cpu_df_susp, size = 1, color = "red") +
  geom_hline(yintercept = upper_bound, color = "red", linetype = "dashed") +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(upper_bound, 2)
    )
  )
# Convert to xts and split into one series per day -----------------------------
suppressMessages(library(xts))
cpu_df_xts <- xts(x = cpu_df$Value, order.by = cpu_df$datetime)
days <- split(cpu_df_xts, f = "days")
stopifnot(length(days) > 0)

# Find the worst day: the one with the largest number of outliers --------------
# For every day, the row positions of readings above that day's own 0.975
# quantile.
day_outlier_ind <- lapply(days, function(day) {
  day_values <- coredata(day)
  which(day_values > quantile(day_values, 0.975))
})
day_outlier_counts <- lengths(day_outlier_ind)

# which.max() returns the first maximum, so ties go to the earliest day. If no
# day has any outlier, the first day is selected with an empty outlier set
# instead of failing on days[[0]].
worstday_index <- which.max(day_outlier_counts)
outliers_number <- day_outlier_counts[[worstday_index]]
worst_day_outliers_ind <- day_outlier_ind[[worstday_index]]
WorstDay <- days[[worstday_index]]

# Leave `upper_bound` holding the worst day's threshold. Code further down
# reads this variable for the worst-day plot; previously it held whatever the
# last loop iteration (the last day) had computed.
upper_bound <- quantile(coredata(WorstDay), 0.975)
# Outliers of the worst day ----------------------------------------------------
worst_day_outliers <- WorstDay[worst_day_outliers_ind, ]

# Threshold of the worst day itself. It is recomputed here rather than read
# from the shared `upper_bound` variable, which other parts of the script
# overwrite and which does not necessarily belong to this day.
worst_day_bound <- quantile(coredata(WorstDay), 0.975)

# Convert xts back to tibble ---------------------------------------------------
# coredata() returns a one-column matrix; as.numeric() flattens it so `Value`
# is an ordinary numeric column.
WorstDayTibble <- tibble(
  datetime = index(WorstDay),
  Value = as.numeric(coredata(WorstDay))
)
outliersTibble <- tibble(
  datetime = index(worst_day_outliers),
  Value = as.numeric(coredata(worst_day_outliers))
)

# Plot the worst day -----------------------------------------------------------
worstDay_Plot <- ggplot(data = WorstDayTibble, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = outliersTibble, color = "red", size = 1) +
  geom_hline(yintercept = worst_day_bound, linetype = "dashed", color = "red") +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(worst_day_bound, 2)
    )
  )

# Show the all-days plot and the worst-day plot together.
library(ggpubr)
ggpubr::ggarrange(alldays_plot, worstDay_Plot)
结果如下:
我不喜欢我的代码:为了把数据按天拆分并逐天搜索,我需要先将其转换为 xts;而要用 ggplot2 绘图,又必须把数据转换回 tibble。是否可以避免这种双重转换并使代码更简单?
您不需要将数据转换为 xts 再转换回来。将数据保留在 data frame/tibble 中,您可以使用以下代码找出最糟糕的一天:
library(dplyr)

# Add a calendar-date column.
# format() renders each timestamp in its own time zone (or the local one when
# none is set), whereas as.Date() on a POSIXct defaults to UTC and can assign
# readings near midnight to a neighbouring day.
cpu_df <- cpu_df %>%
  mutate(date = as.Date(format(datetime, "%Y-%m-%d")))

# For each date, count the values above that date's own 0.975 quantile, keep
# the date with the most outliers (first one on ties), then join back to get
# every reading of that day.
WorstDay <- cpu_df %>%
  group_by(date) %>%
  summarise(n = sum(Value > quantile(Value, 0.975))) %>%
  slice(which.max(n)) %>%
  left_join(cpu_df, by = "date")
您可以使用此数据进行绘图。
我有包含几天数据的时间序列。我需要找到异常值最多的一天并仅绘制这一天的数据。
我是怎么做的:
# Generate sample data ---------------------------------------------------------
# tibble() is used below, so load the package that provides it.
library(tibble)

# NOTE(review): "English" looks like a Windows-style locale name; on other
# platforms this call may only emit a warning - confirm for your OS.
Sys.setlocale("LC_ALL", "English")

# One timestamp every 15 seconds, covering 4 days and 5 hours.
start <- as.POSIXct("2012-01-15 06:10:00")
interval <- 15
end <- start + as.difftime(4, units = "days") + as.difftime(5, units = "hours")
DateTimes <- seq(from = start, by = interval, to = end)

# Poisson-distributed readings, one per timestamp. The sample size is taken
# from the timestamp vector (24241 elements for the range above) so the two
# columns can never get out of sync. The earlier uniform sample() draw was
# overwritten immediately and has been dropped.
Values <- rpois(length(DateTimes), lambda = 75)

cpu_df <- tibble(datetime = DateTimes, Value = Values)
# All days: flag and plot outliers ---------------------------------------------
# The threshold is the 0.975 quantile of every reading, across all days.
upper_bound <- quantile(cpu_df$Value, probs = 0.975)

# Row positions above the threshold, and the matching rows.
outlier_ind <- which(cpu_df$Value > upper_bound)
cpu_df_susp <- cpu_df[outlier_ind, ]

# All readings in green, outliers over-plotted in red, threshold as a dashed
# horizontal line.
alldays_plot <- ggplot(cpu_df, aes(x = datetime, y = Value)) +
  geom_point(color = "darkgreen", size = 0.9) +
  geom_point(data = cpu_df_susp, size = 1, color = "red") +
  geom_hline(yintercept = upper_bound, color = "red", linetype = "dashed") +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(upper_bound, 2)
    )
  )
# Convert to xts and split into one series per day -----------------------------
suppressMessages(library(xts))
cpu_df_xts <- xts(x = cpu_df$Value, order.by = cpu_df$datetime)
days <- split(cpu_df_xts, f = "days")
stopifnot(length(days) > 0)

# Find the worst day: the one with the largest number of outliers --------------
# For every day, the row positions of readings above that day's own 0.975
# quantile.
day_outlier_ind <- lapply(days, function(day) {
  day_values <- coredata(day)
  which(day_values > quantile(day_values, 0.975))
})
day_outlier_counts <- lengths(day_outlier_ind)

# which.max() returns the first maximum, so ties go to the earliest day. If no
# day has any outlier, the first day is selected with an empty outlier set
# instead of failing on days[[0]].
worstday_index <- which.max(day_outlier_counts)
outliers_number <- day_outlier_counts[[worstday_index]]
worst_day_outliers_ind <- day_outlier_ind[[worstday_index]]
WorstDay <- days[[worstday_index]]

# Leave `upper_bound` holding the worst day's threshold. Code further down
# reads this variable for the worst-day plot; previously it held whatever the
# last loop iteration (the last day) had computed.
upper_bound <- quantile(coredata(WorstDay), 0.975)
# Outliers of the worst day ----------------------------------------------------
worst_day_outliers <- WorstDay[worst_day_outliers_ind, ]

# Threshold of the worst day itself. It is recomputed here rather than read
# from the shared `upper_bound` variable, which other parts of the script
# overwrite and which does not necessarily belong to this day.
worst_day_bound <- quantile(coredata(WorstDay), 0.975)

# Convert xts back to tibble ---------------------------------------------------
# coredata() returns a one-column matrix; as.numeric() flattens it so `Value`
# is an ordinary numeric column.
WorstDayTibble <- tibble(
  datetime = index(WorstDay),
  Value = as.numeric(coredata(WorstDay))
)
outliersTibble <- tibble(
  datetime = index(worst_day_outliers),
  Value = as.numeric(coredata(worst_day_outliers))
)

# Plot the worst day -----------------------------------------------------------
worstDay_Plot <- ggplot(data = WorstDayTibble, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = outliersTibble, color = "red", size = 1) +
  geom_hline(yintercept = worst_day_bound, linetype = "dashed", color = "red") +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(worst_day_bound, 2)
    )
  )

# Show the all-days plot and the worst-day plot together.
library(ggpubr)
ggpubr::ggarrange(alldays_plot, worstDay_Plot)
结果如下:
我不喜欢我的代码:为了把数据按天拆分并逐天搜索,我需要先将其转换为 xts;而要用 ggplot2 绘图,又必须把数据转换回 tibble。是否可以避免这种双重转换并使代码更简单?
您不需要将数据转换为 xts 再转换回来。将数据保留在 data frame/tibble 中,您可以使用以下代码找出最糟糕的一天:
library(dplyr)

# Add a calendar-date column.
# format() renders each timestamp in its own time zone (or the local one when
# none is set), whereas as.Date() on a POSIXct defaults to UTC and can assign
# readings near midnight to a neighbouring day.
cpu_df <- cpu_df %>%
  mutate(date = as.Date(format(datetime, "%Y-%m-%d")))

# For each date, count the values above that date's own 0.975 quantile, keep
# the date with the most outliers (first one on ties), then join back to get
# every reading of that day.
WorstDay <- cpu_df %>%
  group_by(date) %>%
  summarise(n = sum(Value > quantile(Value, 0.975))) %>%
  slice(which.max(n)) %>%
  left_join(cpu_df, by = "date")
您可以使用此数据进行绘图。