使用R中的dplyr循环删除数据框中的条件组合

Looping to delete conditional combination in dataframe using dplyr in R

这是我的数据

datex <- c(rep("2021-01-18", 61), rep("2021-01-19", 139))
hourx <- c(0,1,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,16,10,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,11,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,15,15,15,15,16,16,16,16,0,0,0,0,1,2,3,4,5,6,7,8,9,10)
seller <- c("dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2",
"dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp3","dombsdpapp3","dombsdpapp3","dombsdpapp4","dombsdpapp4","dombsdpapp4","dombsdpapp4","dombsdpapp4","dombsdpapp4","dombsdpapp4","dombsdpapp4","dombsdpapp4","dombsdpapp4","dombsdpapp4")
product <- c
detail <- c("E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","notEnoughBalance","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","notEnoughBalance","E99","success","success","success","E99","success","success","E99","success","E99","success","E99","E99","success","E99","E99","success","E99","success","E99","success","E99","success","E99","success","success","E99","E99","E99","success","success","E99","success","E99","success","E99","success","success","E99","E99","E99","success","E99","success","success","E99","E99","success","E99","success","E99","success","success","E99","E99","success","success","E99","E99","success","E99","success","success","E99","success","E99","success","E99","E99","success","success","E99","E99","success","E99","success","success","E99","E99","E99","success","success","notEnoughBalance","E99","success","success","E99","success","E99","success","notEnoughBalance","E99","success","E99","E99","success","E99","success","success","E99","success","E99","E99","success","E99","success","success","E99","success","success","E99","E99","success","notEnoughBalance","E99","E99","success","E99","success","success","E99","E99","success","success","E99","success","success","success","success","success","success","success","success","success","success","success","success","success","success")
status <- c("FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","OK01","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","OK01","FI04","OK00","OK00","OK00","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","FI04","OK00","FI04","FI04","OK00","FI04","OK00","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","OK00","OK01","FI04","OK00","OK00","FI04","OK00","FI04","OK00","OK01","FI04","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK01","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK00","FI04","OK00","OK00","OK00","OK00","OK00","OK00","OK00","OK00","OK00","OK00","OK00","OK00","OK00","OK00")
channel <- c("f2","f2","f2","f3","f2","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f2","f3","f2","f3","f3","f2","f3","f2","f2","f2","f3","f3","f2","f3","f3","f2","f2","f3","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f3","f2","f3","f2","f3","f3","f2","f3","f3","f3","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f2","f2","f3","f3","f2","f3","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f3","f2","f2","f2","f2","f3","f3","f2","f3","f3","f2","f2","f3","f3","f3","f2","f2","f3","f2","f3","f2","f3","f3","f3","f2","f2","f2","f2","f3","f3","f3","f3","f2","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f3","f2","f2","f3","f3","f2","f2","f2","f3","f2","f3","f3","f3","f2","f2","f3","f3","f2","f2","f2","f3","f3","f2","f3","f3","f3","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2")
transaction <- c(1,120,50,5,1,2,1,9,6,12,5,25,14,6,22,9,10,14,15,12,220,12,12,14,9,11,100,90,110,12,13,4,3,1,2,3,3,5,7,5,5,6,9,16,8,13,10,20,15,18,10,19,15,5,13,12,10,12,26,14,0,4,0,0,0,2,0,0,2,0,4,0,6,8,0,2,3,0,2,0,1,0,1,0,2,0,0,2,1,1,0,0,3,0,1,0,3,0,0,6,5,2,0,8,0,0,12,11,0,2,0,11,0,0,14,21,0,0,13,7,0,17,0,0,18,0,7,0,4,4,0,0,7,12,0,13,0,0,130,160,9,0,0,0,16,0,0,16,0,14,0,0,9,0,11,8,0,8,0,0,8,0,10,5,0,15,0,0,3,0,0,8,8,0,0,6,5,0,8,0,0,5,1,0,0,95,11,15,20,100,100,100,100,100,100,100,100,100,100,100)
# NOTE(review): the literal for `product` is missing from this listing (only
# `product <- c` remains, which binds the function `c` and makes data.frame()
# fail). The values are rebuilt here from the outputs printed in this post:
# dombsdpapp3/dombsdpapp4 rows carry "21459", channel "f2" pairs with
# "00021460" and channel "f3" pairs with "00021459". Rows of small
# combinations that never appear in those outputs (e.g. "notEnoughBalance")
# are assumed to follow the same pairing -- TODO confirm against the real data.
product <- ifelse(
  seller %in% c("dombsdpapp3", "dombsdpapp4"),
  "21459",
  ifelse(channel == "f2", "00021460", "00021459")
)
# Assemble the example data set, one row per date/hour/combination record.
mydatax <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

我有两个任务。第一个任务是删除行数少于 5 的组合(组合由 seller、product、detail、status、channel 共同确定)。示例组合:

我需要从我的数据中删除这种类型组合。

# Example of a combination with fewer than 5 rows: pick the matching rows by
# position. which() discards NA matches, mirroring what subset() does.
mycomb1 <- mydatax[
  which(with(
    mydatax,
    seller == "dombsdpapp3" & product == "21459" & detail == "success" &
      status == "OK00" & channel == "f2"
  )),
  ,
  drop = FALSE
]

mycomb1
#         datex hourx      seller product  detail status channel transaction
#187 2021-01-19     0 dombsdpapp3   21459 success   OK00      f2          11
#188 2021-01-19     0 dombsdpapp3   21459 success   OK00      f2          15
#189 2021-01-19     0 dombsdpapp3   21459 success   OK00      f2          20

我的第二个任务是删除所有行的 transaction 值都相同的组合。示例组合:

我需要从我的数据中删除这种类型组合。

# Example of a combination whose rows all share one transaction value: pick
# the matching rows by position. which() discards NA matches, mirroring what
# subset() does.
mycomb2 <- mydatax[
  which(with(
    mydatax,
    seller == "dombsdpapp4" & product == "21459" & detail == "success" &
      status == "OK00" & channel == "f2"
  )),
  ,
  drop = FALSE
]

mycomb2
#         datex hourx      seller product  detail status channel transaction
#190 2021-01-19     0 dombsdpapp4   21459 success   OK00      f2         100
#191 2021-01-19     1 dombsdpapp4   21459 success   OK00      f2         100
#192 2021-01-19     2 dombsdpapp4   21459 success   OK00      f2         100
#193 2021-01-19     3 dombsdpapp4   21459 success   OK00      f2         100
#194 2021-01-19     4 dombsdpapp4   21459 success   OK00      f2         100
#195 2021-01-19     5 dombsdpapp4   21459 success   OK00      f2         100
#196 2021-01-19     6 dombsdpapp4   21459 success   OK00      f2         100
#197 2021-01-19     7 dombsdpapp4   21459 success   OK00      f2         100
#198 2021-01-19     8 dombsdpapp4   21459 success   OK00      f2         100
#199 2021-01-19     9 dombsdpapp4   21459 success   OK00      f2         100
#200 2021-01-19    10 dombsdpapp4   21459 success   OK00      f2         100

我希望得到的结果是:

# Result
datex <- c(rep("2021-01-18", 59), rep("2021-01-19", 61))
hourx <- c(0,1,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,16,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,16,16)
seller <- c("dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp2","dombsdpapp1","dombsdpapp2")
product <- c
detail <- c
status <- c
channel <- c("f2","f2","f2","f3","f2","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f2","f3","f2","f3","f3","f2","f3","f2","f2","f2","f3","f2","f3","f3","f2","f2","f3","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f3","f2","f3","f2","f3","f3","f3","f2","f3","f2","f3","f2","f2","f3","f3","f2","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f2","f3","f3","f2","f3","f2","f2","f3","f3","f2","f3","f2","f3","f2","f3","f2","f2","f2","f2","f2")
transaction <- c(1,120,50,5,1,2,1,9,6,12,5,25,14,6,22,9,10,14,15,12,220,12,12,14,9,11,100,90,110,13,4,3,1,2,3,3,5,7,5,5,6,9,16,8,13,10,20,15,18,10,19,15,5,13,12,10,12,26,14,4,2,2,4,6,8,2,3,2,1,1,2,2,1,1,3,1,3,6,5,2,8,12,11,2,11,14,21,13,7,17,18,7,4,4,7,12,13,130,160,9,16,16,14,9,11,8,8,8,10,5,15,3,8,8,6,5,8,5,1,95)
# NOTE(review): the literals for `product`, `detail` and `status` are missing
# from this listing (each is just `<- c`, which binds the function `c` and
# makes data.frame() fail). They are rebuilt here from the rest of the post:
# the expected result has no zero transactions and no dombsdpapp3/4 rows, so
# only "E99"/"FI04" rows appear to survive, and the printed outputs pair
# channel "f2" with "00021460" and "f3" with "00021459" -- TODO confirm
# against the real data.
product <- ifelse(channel == "f2", "00021460", "00021459")
detail <- rep("E99", length(datex))
status <- rep("FI04", length(datex))
# Assemble the expected result after both kinds of combination are removed.
myresult <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

难点在于:我该如何用 dplyr 对所有组合自动执行上述两步处理并得到完整结果?因为我有 10 万种组合。谢谢。

也许,像这样?

library(dplyr)

# For every seller/product/detail/status/channel combination, keep its rows
# only when the combination has at least 5 rows and more than one distinct
# transaction value; then drop the grouping again.
mydatax %>%
  group_by(seller, product, detail, status, channel) %>%
  filter(
    n() >= 5,
    n_distinct(transaction) > 1
  ) %>%
  ungroup()
  

# A tibble: 120 x 8
#   datex      hourx seller      product  detail status channel transaction
#   <chr>      <dbl> <chr>       <chr>    <chr>  <chr>  <chr>         <dbl>
# 1 2021-01-18     0 dombsdpapp1 00021460 E99    FI04   f2                1
# 2 2021-01-18     1 dombsdpapp1 00021460 E99    FI04   f2              120
# 3 2021-01-18     2 dombsdpapp1 00021460 E99    FI04   f2               50
# 4 2021-01-18     3 dombsdpapp1 00021459 E99    FI04   f3                5
# 5 2021-01-18     3 dombsdpapp1 00021460 E99    FI04   f2                1
# 6 2021-01-18     4 dombsdpapp1 00021459 E99    FI04   f3                2
# 7 2021-01-18     4 dombsdpapp1 00021460 E99    FI04   f2                1
# 8 2021-01-18     5 dombsdpapp1 00021459 E99    FI04   f3                9
# 9 2021-01-18     5 dombsdpapp1 00021460 E99    FI04   f2                6
#10 2021-01-18     6 dombsdpapp1 00021459 E99    FI04   f3               12
# … with 110 more rows

filter(n() >= 5 && n_distinct(transaction) > 1) 仅保留行数 >= 5 且 transaction 的不同取值多于 1 个的组。

如果我理解正确的话,这样应该可行:

mydatax %>%
  ###########
  # First task
  ###########
  # Add n1: number of rows in each seller/product/detail/status/channel
  # combination
  add_count(seller, product, detail, status, channel, name = "n1") %>%
  # filter() KEEPS the rows that match, so keep combinations with at least
  # 5 rows (this is what removes the ones with fewer than 5)
  filter(n1 >= 5) %>%
  ###########
  # Second task
  ###########
  # Add n2: number of distinct transaction values inside each combination
  group_by(seller, product, detail, status, channel) %>%
  mutate(n2 = n_distinct(transaction)) %>%
  ungroup() %>%
  # Keep combinations with more than one distinct transaction value, i.e.
  # drop the ones whose rows all carry the same transaction
  filter(n2 > 1)
# The helper columns n1 and n2 stay in the output; append
# `%>% select(-n1, -n2)` to the pipeline to remove them.

使用plyr::count

# List the seller/product/detail/status/channel combinations that have at
# least 5 rows. plyr::count() returns one row per combination with its row
# count in `freq`, so this yields the surviving combinations, not the rows of
# mydatax. The task removes combinations with FEWER than 5 rows, so exactly 5
# must be kept (>=, not >). Relies on dplyr being attached for %>%, filter()
# and select().
plyr::count(
  mydatax[, c("seller", "product", "detail", "status", "channel")]
) %>%
  filter(freq >= 5) %>%
  select(-freq)

       seller  product  detail status channel
1 dombsdpapp1 00021459     E99   FI04      f3
2 dombsdpapp1 00021459 success   OK00      f3
3 dombsdpapp1 00021460     E99   FI04      f2
4 dombsdpapp1 00021460 success   OK00      f2
5 dombsdpapp2 00021459     E99   FI04      f3
6 dombsdpapp2 00021459 success   OK00      f3
7 dombsdpapp2 00021460     E99   FI04      f2
8 dombsdpapp2 00021460 success   OK00      f2
9 dombsdpapp4    21459 success   OK00      f2