在 R 中使用 dplyr 循环获取所有组合

Looping to get all combinations using dplyr in R

这是我的数据

## Data
# Date column: 61 rows dated 2021-01-18 followed by 125 rows dated 2021-01-19
# (186 rows in total).
datex <- c(rep("2021-01-18", 61), rep("2021-01-19", 125))
hourx <- c(0,1,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,16,10,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,11,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,15,15,15,15,16,16,16,16)
seller <- c("dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2",
"dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2")
# NOTE(review): this assignment looks truncated -- the argument list of c(...)
# is missing, so `product` is bound to the function `c` itself rather than to a
# character vector. The printed results further down in this file show the
# product values "00021459" and "00021460"; presumably the original was a
# 186-element vector of those codes (to match `datex`) -- TODO restore it from
# the original post before running this script.
product <- c
detail <- c("E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","notEnoughBalance","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","notEnoughBalance","E99","success","success","success","E99","success","success","E99","success","E99","success","E99","E99","success","E99","E99","success","E99","success","E99","success","E99","success","E99","success","success","E99","E99","E99","success","success","E99","success","E99","success","E99","success","success","E99","E99","E99","success","E99","success","success","E99","E99","success","E99","success","E99","success","success","E99","E99","success","success","E99","E99","success","E99","success","success","E99","success","E99","success","E99","E99","success","success","E99","E99","success","E99","success","success","E99","E99","E99","success","success","notEnoughBalance","E99","success","success","E99","success","E99","success","notEnoughBalance","E99","success","E99","E99","success","E99","success","success","E99","success","E99","E99","success","E99","success","success","E99","success","success","E99","E99","success","notEnoughBalance","E99","E99","success","E99","success","success","E99","E99","success","success","E99")
status <- c("FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","OK01","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","OK01","FI04","OK00","OK00","OK00","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","FI04","OK00","FI04","FI04","OK00","FI04","OK00","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","OK00","OK01","FI04","OK00","OK00","FI04","OK00","FI04","OK00","OK01","FI04","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK01","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK00","FI04")
channel <- c("f2","f2","f2","f3","f2","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f2","f3","f2","f3","f3","f2","f3","f2","f2","f2","f3","f3","f2","f3","f3","f2","f2","f3","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f3","f2","f3","f2","f3","f3","f2","f3","f3","f3","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f2","f2","f3","f3","f2","f3","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f3","f2","f2","f2","f2","f3","f3","f2","f3","f3","f2","f2","f3","f3","f3","f2","f2","f3","f2","f3","f2","f3","f3","f3","f2","f2","f2","f2","f3","f3","f3","f3","f2","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f3","f2","f2","f3","f3","f2","f2","f2","f3","f2","f3","f3","f3","f2","f2","f3","f3","f2","f2","f2","f3","f3","f2","f3","f3","f3","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2")
transaction <- c(1,120,50,5,1,2,1,9,6,12,5,25,14,6,22,9,10,14,15,12,220,12,12,14,9,11,100,90,110,12,13,4,3,1,2,3,3,5,7,5,5,6,9,16,8,13,10,20,15,18,10,19,15,5,13,12,10,12,26,14,0,4,0,0,0,2,0,0,2,0,4,0,6,8,0,2,3,0,2,0,1,0,1,0,2,0,0,2,1,1,0,0,3,0,1,0,3,0,0,6,5,2,0,8,0,0,12,11,0,2,0,11,0,0,14,21,0,0,13,7,0,17,0,0,18,0,7,0,4,4,0,0,7,12,0,13,0,0,130,160,9,0,0,0,16,0,0,16,0,14,0,0,9,0,11,8,0,8,0,0,8,0,10,5,0,15,0,0,3,0,0,8,8,0,0,6,5,0,8,0,0,5,1,0,0,95)
# Combine the vectors defined above into one data frame, one column per vector.
mydatax <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

我的任务是使用 tsoutliers 包从我的数据中的任何组合中找出离群值。对于示例,我使用两种组合。第一个组合:

# Process 1
library(tsoutliers)

# Keep only the rows of the first seller/product/detail/status/channel
# combination. which() discards comparisons that evaluate to NA, exactly as
# subset() would.
combination1 <- mydatax[which(
  mydatax$seller == "dombsdpapp1" &
    mydatax$product == "00021460" &
    mydatax$detail == "E99" &
    mydatax$status == "FI04" &
    mydatax$channel == "f2"
), ]

# Fit the tsoutliers model on this combination's transaction column.
model.anomaly1 <- tso(as.ts(combination1[["transaction"]]))

# Second column of the outlier table, restricted to the outliers whose
# estimated coefficient is positive.
find.anomaly.index1 <- model.anomaly1$outliers[
  which(model.anomaly1$outliers$coefhat > 0), 2
]

# Look up the flagged observations in the filtered data and print them.
data.anomaly1 <- combination1[find.anomaly.index1, ]
data.anomaly1

#datex hourx      seller  product detail status channel transaction
#2   2021-01-18     1 dombsdpapp1 00021460    E99   FI04      f2         120
#27  2021-01-18    14 dombsdpapp1 00021460    E99   FI04      f2         100
#29  2021-01-18    16 dombsdpapp1 00021460    E99   FI04      f2         110
#139 2021-01-19    10 dombsdpapp1 00021460    E99   FI04      f2         130

第二个组合:

# Process 2
library(tsoutliers)

# Keep only the rows of the second seller/product/detail/status/channel
# combination. which() discards comparisons that evaluate to NA, exactly as
# subset() would.
combination2 <- mydatax[which(
  mydatax$seller == "dombsdpapp2" &
    mydatax$product == "00021460" &
    mydatax$detail == "E99" &
    mydatax$status == "FI04" &
    mydatax$channel == "f2"
), ]

# Fit the tsoutliers model on this combination's transaction column.
model.anomaly2 <- tso(as.ts(combination2[["transaction"]]))

# Second column of the outlier table, restricted to the outliers whose
# estimated coefficient is positive.
find.anomaly.index2 <- model.anomaly2$outliers[
  which(model.anomaly2$outliers$coefhat > 0), 2
]

# Look up the flagged observations in the filtered data and print them.
data.anomaly2 <- combination2[find.anomaly.index2, ]
data.anomaly2

#datex hourx      seller  product detail status channel transaction
#140 2021-01-19    10 dombsdpapp2 00021460    E99   FI04      f2         160
#186 2021-01-19    16 dombsdpapp2 00021460    E99   FI04      f2          95

之后,将所有循环得到的结果合并到一张表中:

# Stack the anomalies found for both combinations into one data frame and
# print it.
my.anomaly.result <- do.call(rbind, list(data.anomaly1, data.anomaly2))
my.anomaly.result

#         datex hourx      seller  product detail status channel transaction
#2   2021-01-18     1 dombsdpapp1 00021460    E99   FI04      f2         120
#27  2021-01-18    14 dombsdpapp1 00021460    E99   FI04      f2         100
#29  2021-01-18    16 dombsdpapp1 00021460    E99   FI04      f2         110
#139 2021-01-19    10 dombsdpapp1 00021460    E99   FI04      f2         130
#140 2021-01-19    10 dombsdpapp2 00021460    E99   FI04      f2         160
#186 2021-01-19    16 dombsdpapp2 00021460    E99   FI04      f2          95

难点在于:我该如何使用 dplyr 对所有组合循环执行上述过程,以得到全部结果?因为我有 10 万种组合。谢谢。

在数据中,某些组只有 1 行或 2 行。对于这样的组,tso 函数会返回错误。我编写了一个自定义函数,其中设置了 5 行的阈值:如果某个组的行数少于 5 行,则直接选择该组的所有行;对于其余的组,则应用 tso 函数。您可以根据您的数据将 5 调整为任意数字。

library(dplyr)
library(tsoutliers)

# Return the positions within one group that should be kept as anomalies.
#
# Args:
#   x:     Vector of observations for a single group; it is converted with
#          as.ts() before being passed to tso().
#   min_n: Minimum number of observations required before tso() is fitted.
#          Defaults to 5. Groups shorter than this are returned in full,
#          because tso() errors on groups with only one or two rows.
#
# Returns:
#   Integer positions into `x`: every position when length(x) < min_n,
#   otherwise the `ind` values of the outliers reported by tso() whose
#   estimated coefficient (`coefhat`) is positive.
get_outlier_index <- function(x, min_n = 5) {
  # Too few observations to fit a model: keep the whole group.
  if (length(x) < min_n) {
    return(seq_along(x))
  }
  model.anomaly <- tso(as.ts(x))
  outliers <- model.anomaly$outliers
  outliers$ind[outliers$coefhat > 0]
}

# For every seller/product/detail/status/channel group, keep only the rows
# whose within-group positions are returned by get_outlier_index(), then drop
# the grouping.
mydatax %>%
  group_by(seller, product, detail, status, channel) %>%
  slice(get_outlier_index(transaction)) %>%
  ungroup()

#   datex      hourx seller      product  detail           status channel transaction
#   <chr>      <dbl> <chr>       <chr>    <chr>            <chr>  <chr>         <dbl>
# 1 2021-01-18     7 dombsdpapp1 00021459 E99              FI04   f3               25
# 2 2021-01-18    11 dombsdpapp1 00021459 E99              FI04   f3              220
# 3 2021-01-19     5 dombsdpapp1 00021459 E99              FI04   f3                6
# 4 2021-01-18    10 dombsdpapp1 00021459 notEnoughBalance OK01   f3               12
# 5 2021-01-18     1 dombsdpapp1 00021460 E99              FI04   f2              120
# 6 2021-01-18    14 dombsdpapp1 00021460 E99              FI04   f2              100
# 7 2021-01-18    16 dombsdpapp1 00021460 E99              FI04   f2              110
# 8 2021-01-19    10 dombsdpapp1 00021460 E99              FI04   f2              130
# 9 2021-01-19    11 dombsdpapp1 00021460 notEnoughBalance OK01   f2                0
#10 2021-01-18    11 dombsdpapp2 00021459 notEnoughBalance OK01   f3                0
#11 2021-01-19    14 dombsdpapp2 00021459 notEnoughBalance OK01   f3                0
#12 2021-01-19    10 dombsdpapp2 00021460 E99              FI04   f2              160
#13 2021-01-19    16 dombsdpapp2 00021460 E99              FI04   f2               95
#14 2021-01-19    11 dombsdpapp2 00021460 notEnoughBalance OK01   f2                0