在 R 中使用 dplyr 循环获取异常值数据
Looping to get outliers data using dplyr in R
我的任务是查找异常值数据,这是我的数据:
# combination 1
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1")
product <- rep(0, 120)
detail <- rep(0, 120)
status <- rep(0, 120)
channel <- rep(0, 120)
transaction <-c(5664,4797,2515,1744,2166,2164,3513,6548,7620,8662,11295,11372,12094,14064,15412,13042,12779,14653,13586,12922,11321,9709,7899,5916,5791,5544,3567,1783,2900,4488,1830,4946,6735,16673,12024,8614,16545,11628,8856,13660,10913,11928,12359,9267,7672,6487,10677,4271,3351,4264,3764,3313,1492,4324,4277,4928,7752,8940,10545,10488,13766,11594,8317,12139,14274,11617,7513,8215,7687,4374,5465,4548,3419,2136,2679,2714,3072,2984,3203,6689,6113,8923,6755,6968,7711,5305,3827,4341,5915,6554,7376,6707,3685,4366,3086,1277,2218,1089,282 ,156 ,907,1691,2786,5229,6081,7133,8617,9759,12984,15060,11906,15909,21934,14993,9776,9721,8707,8080,2245,4702)
mycomb1 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# combination 2
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5")
product <- rep(0, 120)
detail <- rep(1, 120)
status <- rep(0, 120)
channel <- rep(1, 120)
transaction <-c(5564,4588,3256,1034,2479,3678,5454,6104,8199,9261,10115,13665,12030,11996,12610,15061,15957,19130,15086,11779,14274,10614,7442,10216,4937,9178,5871,6702,3150,6505,4855,4744,10661,10485,10805,9321,14260,9831,15602,10599,14739,14117,8549,9638,9161,8282,7877,2060,2492,2816,3983,2053,4758,5717,2816,6141,8322,9745,9677,14478,11905,9580,8742,11012,5775,6773,8583,9261,10890,11950,5248,3579,3176,7268,605 ,1642,1122,6046,3241,4189,6534,7445,8518,7585,9574,5453,5467,4302,6664,8297,6801,5637,4323,2963,1872,1466,1472,1129,581 ,275 ,716 ,1963,2333,5507,7601,7478,7760,16975,11986,15282,12122,10815,16060,21552,11587,11873,7778,7058,6153,3423)
mycomb2 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# combination 3
datex <- c(rep("07/01/2021", 22), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9")
product <- rep(1, 118)
detail <- rep(2, 118)
status <- rep(1, 118)
channel <- rep(2, 118)
transaction <- c(12,120 ,120 ,120 ,140 ,144 ,120 ,112 ,106 ,120 ,150 ,120 ,116 ,120 ,96,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,143 ,120 ,120 ,102 ,96,120 ,120 ,120 ,120 ,125 ,120 ,94,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,200 ,118 ,120 ,120 ,120 ,180 ,120 ,100 ,92,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,140 ,120 ,120 ,165 ,120 ,120 ,120 ,120 ,120 ,120 ,100 ,110 ,120 ,120 ,88,66,120 ,118 ,120 ,120 ,118 ,120 ,120 ,120 ,120 ,120 ,120 ,120 )
mycomb3 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# my data
mydata <- rbind(mycomb1, mycomb2, mycomb3)
mydata
# A tibble: 358 x 8
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2021-07-01 0 do1 0 0 0 0 5664
# 2 2021-07-01 1 do1 0 0 0 0 4797
# 3 2021-07-01 2 do1 0 0 0 0 2515
# 4 2021-07-01 3 do1 0 0 0 0 1744
# 5 2021-07-01 4 do1 0 0 0 0 2166
# 6 2021-07-01 5 do1 0 0 0 0 2164
# 7 2021-07-01 6 do1 0 0 0 0 3513
# 8 2021-07-01 7 do1 0 0 0 0 6548
# 9 2021-07-01 8 do1 0 0 0 0 7620
#10 2021-07-01 9 do1 0 0 0 0 8662
# … with 348 more rows
这是重新排序列的附加功能,使结果变得更好。
# Function
moveme <- function (invec, movecommand){
movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], ",|\s+"), function(x) x[x != ""])
movelist <- lapply(movecommand, function(x){
Where <- x[which(x %in% c("before", "after", "first", "last")):length(x)]
ToMove <- setdiff(x, Where)
list(ToMove, Where)
})
myVec <- invec
for (i in seq_along(movelist)){
temp <- setdiff(myVec, movelist[[i]][[1]])
A <- movelist[[i]][[2]][1]
if (A %in% c("before", "after")){
ba <- movelist[[i]][[2]][2]
if (A == "before"){
after <- match(ba, temp)-1
}
else if (A == "after"){
after <- match(ba, temp)
}
}
else if (A == "first"){
after <- 0
}
else if (A == "last"){
after <- length(myVec)
}
myVec <- append(temp, values = movelist[[i]][[1]], after = after)
}
myVec
}
我想使用循环从 mydata 中的许多组合数据中获取异常值数据,这是获取异常值数据的手动过程。
- 首先,我从 mydata 中提取数据。
- 其次,我检查数据的季节性。
- 第三,如果为真,那么我们使用 R 中的“timetk”包来获取异常值。但是,如果它是 FALSE,那么我们在 R 中使用“qcc”包。
- 循环完成后收集异常值数据。
这是我手动为第一次和第二次循环获取异常值。
第一次循环
- 首先,我从 mydata 中提取数据。
## Looping 1
mydata.comb1 <- subset(mydata, seller == "do1" & product == 0 & detail == 0 & status == 0 & channel == 0)
- 其次,我检查数据的季节性。
# Checking Seasonality
library(seastests)
isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)
#TRUE
- 第三,它是TRUE,那么我们使用 then 我们使用“timetk”
library(dplyr)
library(timetk)
mydata.comb1 %>%
group_by(across(seller:channel)) %>%
tk_anomaly_diagnostics(datex, transaction) %>%
ungroup -> model.anomaly.seasonal
model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
model.anomaly.seasonal.data3 %>%
left_join(mydata.comb1) -> model.anomaly.seasonal.data4
model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
looping1 <- model.anomaly.seasonal.data5
looping1
# A tibble: 6 x 8
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2021-07-01 14 do1 0 0 0 0 15412
#2 2021-07-02 9 do1 0 0 0 0 16673
#3 2021-07-02 12 do1 0 0 0 0 16545
#4 2021-07-02 22 do1 0 0 0 0 10677
#5 2021-07-05 16 do1 0 0 0 0 21934
#6 2021-07-05 22 do1 0 0 0 0 2245
第二次循环
- 首先,我从 mydata 中提取数据。
mydata.comb2 <- subset(mydata, seller == "do9" & product == 1 & detail == 2 & status == 1 & channel == 2)
- 其次,我检查数据的季节性。
# Checking Seasonality
library(seastests)
y <- mydata.comb2$transaction
isSeasonal(as.ts(y), test = "wo", freq = 24)
#FALSE
- 第三,是FALSE,那我们用then 我们用"qcc"
library(dplyr)
library(qcc)
model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
model.anomaly.non.seasonal.result <- mydata.comb2[model.anomaly.non.seasonal.indeks,]
looping2 <- model.anomaly.non.seasonal.result
looping2
# datex hourx seller product detail status channel transaction
#6 2021-07-01 7 do9 1 2 1 2 144
#11 2021-07-01 12 do9 1 2 1 2 150
#48 2021-07-03 1 do9 1 2 1 2 200
#53 2021-07-03 6 do9 1 2 1 2 180
#94 2021-07-04 23 do9 1 2 1 2 165
然后我们从循环中收集结果。这是结果:
myresult <- rbind(looping1, looping2)
myresult
# A tibble: 11 x 8
# datex hourx seller product detail status channel transaction
# * <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2021-07-01 14 do1 0 0 0 0 15412
# 2 2021-07-02 9 do1 0 0 0 0 16673
# 3 2021-07-02 12 do1 0 0 0 0 16545
# 4 2021-07-02 22 do1 0 0 0 0 10677
# 5 2021-07-05 16 do1 0 0 0 0 21934
# 6 2021-07-05 22 do1 0 0 0 0 2245
# 7 2021-07-01 7 do9 1 2 1 2 144
# 8 2021-07-01 12 do9 1 2 1 2 150
# 9 2021-07-03 1 do9 1 2 1 2 200
#10 2021-07-03 6 do9 1 2 1 2 180
#11 2021-07-04 23 do9 1 2 1 2 165
我有 120K 循环,如果我手动这样做会很痛苦。那么,我如何使用 dplyr 使这个循环过程更简单?非常感谢。
可能可以进一步优化代码,但将您已有的代码放入您可以执行的函数中 -
library(dplyr)
library(seastests)
library(timetk)
library(qcc)
library(purrr)
custom_fn <- function(mydata.comb1) {
if(isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)) {
mydata.comb1 %>%
group_by(across(seller:channel)) %>%
tk_anomaly_diagnostics(datex, transaction) %>%
ungroup -> model.anomaly.seasonal
model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
model.anomaly.seasonal.data3 %>%
left_join(mydata.comb1) -> model.anomaly.seasonal.data4
model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
looping1 <- model.anomaly.seasonal.data5
} else {
y <- mydata.comb1$transaction
model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
model.anomaly.non.seasonal.result <- mydata.comb1[model.anomaly.non.seasonal.indeks,]
looping1 <- model.anomaly.non.seasonal.result
}
return(looping1)
}
将数据分成几组并将此函数应用于每个子集。
mydata %>%
group_split(seller, product, detail, status, channel) %>%
map_df(custom_fn)
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2020-07-01 14 do1 0 0 0 0 15412
# 2 2020-07-02 9 do1 0 0 0 0 16673
# 3 2020-07-02 12 do1 0 0 0 0 16545
# 4 2020-07-02 22 do1 0 0 0 0 10677
# 5 2020-07-05 16 do1 0 0 0 0 21934
# 6 2020-07-05 22 do1 0 0 0 0 2245
# 7 2020-07-03 16 do5 0 1 0 1 5775
# 8 2020-07-05 11 do5 0 1 0 1 16975
# 9 2020-07-05 17 do5 0 1 0 1 21552
#10 2020-07-01 7 do9 1 2 1 2 144
#11 2020-07-01 12 do9 1 2 1 2 150
#12 2020-07-03 1 do9 1 2 1 2 200
#13 2020-07-03 6 do9 1 2 1 2 180
#14 2020-07-04 23 do9 1 2 1 2 165
我的任务是查找异常值数据,这是我的数据:
# combination 1
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1","do1")
product <- rep(0, 120)
detail <- rep(0, 120)
status <- rep(0, 120)
channel <- rep(0, 120)
transaction <-c(5664,4797,2515,1744,2166,2164,3513,6548,7620,8662,11295,11372,12094,14064,15412,13042,12779,14653,13586,12922,11321,9709,7899,5916,5791,5544,3567,1783,2900,4488,1830,4946,6735,16673,12024,8614,16545,11628,8856,13660,10913,11928,12359,9267,7672,6487,10677,4271,3351,4264,3764,3313,1492,4324,4277,4928,7752,8940,10545,10488,13766,11594,8317,12139,14274,11617,7513,8215,7687,4374,5465,4548,3419,2136,2679,2714,3072,2984,3203,6689,6113,8923,6755,6968,7711,5305,3827,4341,5915,6554,7376,6707,3685,4366,3086,1277,2218,1089,282 ,156 ,907,1691,2786,5229,6081,7133,8617,9759,12984,15060,11906,15909,21934,14993,9776,9721,8707,8080,2245,4702)
mycomb1 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# combination 2
datex <- c(rep("07/01/2021", 24), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5","do5")
product <- rep(0, 120)
detail <- rep(1, 120)
status <- rep(0, 120)
channel <- rep(1, 120)
transaction <-c(5564,4588,3256,1034,2479,3678,5454,6104,8199,9261,10115,13665,12030,11996,12610,15061,15957,19130,15086,11779,14274,10614,7442,10216,4937,9178,5871,6702,3150,6505,4855,4744,10661,10485,10805,9321,14260,9831,15602,10599,14739,14117,8549,9638,9161,8282,7877,2060,2492,2816,3983,2053,4758,5717,2816,6141,8322,9745,9677,14478,11905,9580,8742,11012,5775,6773,8583,9261,10890,11950,5248,3579,3176,7268,605 ,1642,1122,6046,3241,4189,6534,7445,8518,7585,9574,5453,5467,4302,6664,8297,6801,5637,4323,2963,1872,1466,1472,1129,581 ,275 ,716 ,1963,2333,5507,7601,7478,7760,16975,11986,15282,12122,10815,16060,21552,11587,11873,7778,7058,6153,3423)
mycomb2 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# combination 3
datex <- c(rep("07/01/2021", 22), rep("07/02/2021", 24), rep("07/03/2021", 24), rep("07/04/2021", 24), rep("07/05/2021", 24))
datex <- as.Date(datex, "%m/%d/%y")
hourx <- c (2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
seller <- c("do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9","do9")
product <- rep(1, 118)
detail <- rep(2, 118)
status <- rep(1, 118)
channel <- rep(2, 118)
transaction <- c(12,120 ,120 ,120 ,140 ,144 ,120 ,112 ,106 ,120 ,150 ,120 ,116 ,120 ,96,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,143 ,120 ,120 ,102 ,96,120 ,120 ,120 ,120 ,125 ,120 ,94,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,200 ,118 ,120 ,120 ,120 ,180 ,120 ,100 ,92,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,120 ,140 ,120 ,120 ,165 ,120 ,120 ,120 ,120 ,120 ,120 ,100 ,110 ,120 ,120 ,88,66,120 ,118 ,120 ,120 ,118 ,120 ,120 ,120 ,120 ,120 ,120 ,120 )
mycomb3 <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)
# my data
mydata <- rbind(mycomb1, mycomb2, mycomb3)
mydata
# A tibble: 358 x 8
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2021-07-01 0 do1 0 0 0 0 5664
# 2 2021-07-01 1 do1 0 0 0 0 4797
# 3 2021-07-01 2 do1 0 0 0 0 2515
# 4 2021-07-01 3 do1 0 0 0 0 1744
# 5 2021-07-01 4 do1 0 0 0 0 2166
# 6 2021-07-01 5 do1 0 0 0 0 2164
# 7 2021-07-01 6 do1 0 0 0 0 3513
# 8 2021-07-01 7 do1 0 0 0 0 6548
# 9 2021-07-01 8 do1 0 0 0 0 7620
#10 2021-07-01 9 do1 0 0 0 0 8662
# … with 348 more rows
这是重新排序列的附加功能,使结果变得更好。
# Function
moveme <- function (invec, movecommand){
movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], ",|\s+"), function(x) x[x != ""])
movelist <- lapply(movecommand, function(x){
Where <- x[which(x %in% c("before", "after", "first", "last")):length(x)]
ToMove <- setdiff(x, Where)
list(ToMove, Where)
})
myVec <- invec
for (i in seq_along(movelist)){
temp <- setdiff(myVec, movelist[[i]][[1]])
A <- movelist[[i]][[2]][1]
if (A %in% c("before", "after")){
ba <- movelist[[i]][[2]][2]
if (A == "before"){
after <- match(ba, temp)-1
}
else if (A == "after"){
after <- match(ba, temp)
}
}
else if (A == "first"){
after <- 0
}
else if (A == "last"){
after <- length(myVec)
}
myVec <- append(temp, values = movelist[[i]][[1]], after = after)
}
myVec
}
我想使用循环从 mydata 中的许多组合数据中获取异常值数据,这是获取异常值数据的手动过程。
- 首先,我从 mydata 中提取数据。
- 其次,我检查数据的季节性。
- 第三,如果为真,那么我们使用 R 中的“timetk”包来获取异常值。但是,如果它是 FALSE,那么我们在 R 中使用“qcc”包。
- 循环完成后收集异常值数据。
这是我手动为第一次和第二次循环获取异常值。
第一次循环
- 首先,我从 mydata 中提取数据。
## Looping 1
mydata.comb1 <- subset(mydata, seller == "do1" & product == 0 & detail == 0 & status == 0 & channel == 0)
- 其次,我检查数据的季节性。
# Checking Seasonality
library(seastests)
isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)
#TRUE
- 第三,它是TRUE,那么我们使用 then 我们使用“timetk”
library(dplyr)
library(timetk)
mydata.comb1 %>%
group_by(across(seller:channel)) %>%
tk_anomaly_diagnostics(datex, transaction) %>%
ungroup -> model.anomaly.seasonal
model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
model.anomaly.seasonal.data3 %>%
left_join(mydata.comb1) -> model.anomaly.seasonal.data4
model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
looping1 <- model.anomaly.seasonal.data5
looping1
# A tibble: 6 x 8
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2021-07-01 14 do1 0 0 0 0 15412
#2 2021-07-02 9 do1 0 0 0 0 16673
#3 2021-07-02 12 do1 0 0 0 0 16545
#4 2021-07-02 22 do1 0 0 0 0 10677
#5 2021-07-05 16 do1 0 0 0 0 21934
#6 2021-07-05 22 do1 0 0 0 0 2245
第二次循环
- 首先,我从 mydata 中提取数据。
mydata.comb2 <- subset(mydata, seller == "do9" & product == 1 & detail == 2 & status == 1 & channel == 2)
- 其次,我检查数据的季节性。
# Checking Seasonality
library(seastests)
y <- mydata.comb2$transaction
isSeasonal(as.ts(y), test = "wo", freq = 24)
#FALSE
- 第三,是FALSE,那我们用then 我们用"qcc"
library(dplyr)
library(qcc)
model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
model.anomaly.non.seasonal.result <- mydata.comb2[model.anomaly.non.seasonal.indeks,]
looping2 <- model.anomaly.non.seasonal.result
looping2
# datex hourx seller product detail status channel transaction
#6 2021-07-01 7 do9 1 2 1 2 144
#11 2021-07-01 12 do9 1 2 1 2 150
#48 2021-07-03 1 do9 1 2 1 2 200
#53 2021-07-03 6 do9 1 2 1 2 180
#94 2021-07-04 23 do9 1 2 1 2 165
然后我们从循环中收集结果。这是结果:
myresult <- rbind(looping1, looping2)
myresult
# A tibble: 11 x 8
# datex hourx seller product detail status channel transaction
# * <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2021-07-01 14 do1 0 0 0 0 15412
# 2 2021-07-02 9 do1 0 0 0 0 16673
# 3 2021-07-02 12 do1 0 0 0 0 16545
# 4 2021-07-02 22 do1 0 0 0 0 10677
# 5 2021-07-05 16 do1 0 0 0 0 21934
# 6 2021-07-05 22 do1 0 0 0 0 2245
# 7 2021-07-01 7 do9 1 2 1 2 144
# 8 2021-07-01 12 do9 1 2 1 2 150
# 9 2021-07-03 1 do9 1 2 1 2 200
#10 2021-07-03 6 do9 1 2 1 2 180
#11 2021-07-04 23 do9 1 2 1 2 165
我有 120K 循环,如果我手动这样做会很痛苦。那么,我如何使用 dplyr 使这个循环过程更简单?非常感谢。
可能可以进一步优化代码,但将您已有的代码放入您可以执行的函数中 -
library(dplyr)
library(seastests)
library(timetk)
library(qcc)
library(purrr)
custom_fn <- function(mydata.comb1) {
if(isSeasonal(as.ts(mydata.comb1$transaction), test = "wo", freq = 24)) {
mydata.comb1 %>%
group_by(across(seller:channel)) %>%
tk_anomaly_diagnostics(datex, transaction) %>%
ungroup -> model.anomaly.seasonal
model.anomaly.seasonal.data <- subset(model.anomaly.seasonal, anomaly == "Yes")
model.anomaly.seasonal.data2 <- model.anomaly.seasonal.data[moveme(names(model.anomaly.seasonal.data),"datex first")]
model.anomaly.seasonal.data3 <- model.anomaly.seasonal.data2[,c(1:7)]
colnames(model.anomaly.seasonal.data3)[7] <- "transaction"
model.anomaly.seasonal.data3 %>%
left_join(mydata.comb1) -> model.anomaly.seasonal.data4
model.anomaly.seasonal.data5 <- na.omit(model.anomaly.seasonal.data4[moveme(names(model.anomaly.seasonal.data4),"hourx before seller")])
looping1 <- model.anomaly.seasonal.data5
} else {
y <- mydata.comb1$transaction
model.anomaly.non.seasonal <- qcc(as.ts(y), type = "xbar.one", plot = F)
model.anomaly.non.seasonal.data <- data.frame(ind = model.anomaly.non.seasonal$violations$beyond.limits, transaction = y[model.anomaly.non.seasonal$violations$beyond.limits])
model.anomaly.non.seasonal.conf <- model.anomaly.non.seasonal$limits[2]
model.anomaly.non.seasonal.indeks <- subset(model.anomaly.non.seasonal.data, transaction > model.anomaly.non.seasonal.conf)$ind
model.anomaly.non.seasonal.result <- mydata.comb1[model.anomaly.non.seasonal.indeks,]
looping1 <- model.anomaly.non.seasonal.result
}
return(looping1)
}
将数据分成几组并将此函数应用于每个子集。
mydata %>%
group_split(seller, product, detail, status, channel) %>%
map_df(custom_fn)
# datex hourx seller product detail status channel transaction
# <date> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2020-07-01 14 do1 0 0 0 0 15412
# 2 2020-07-02 9 do1 0 0 0 0 16673
# 3 2020-07-02 12 do1 0 0 0 0 16545
# 4 2020-07-02 22 do1 0 0 0 0 10677
# 5 2020-07-05 16 do1 0 0 0 0 21934
# 6 2020-07-05 22 do1 0 0 0 0 2245
# 7 2020-07-03 16 do5 0 1 0 1 5775
# 8 2020-07-05 11 do5 0 1 0 1 16975
# 9 2020-07-05 17 do5 0 1 0 1 21552
#10 2020-07-01 7 do9 1 2 1 2 144
#11 2020-07-01 12 do9 1 2 1 2 150
#12 2020-07-03 1 do9 1 2 1 2 200
#13 2020-07-03 6 do9 1 2 1 2 180
#14 2020-07-04 23 do9 1 2 1 2 165