使用 R 包 dplyr 将中位数添加到所有数据组合中

Adding median into all combinations of data using R package dplyr

这是我的数据

## Data
datex <- c(rep("2021-01-18", 61), rep("2021-01-19", 125))
hourx <- c(0,1,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,16,10,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,11,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,15,15,15,15,16,16,16,16)
seller <- c("dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2",
"dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2","dombsdpapp1","dombsdpapp1","dombsdpapp2","dombsdpapp2")
# NOTE(review): this assignment looks truncated. As written it binds the
# function c() itself to `product`, which data.frame() below cannot turn into
# a column. The printed results further down show product values such as
# "00021460" and "00021459"; restore the full character vector here.
product <- c
detail <- c("E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","notEnoughBalance","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","E99","notEnoughBalance","E99","success","success","success","E99","success","success","E99","success","E99","success","E99","E99","success","E99","E99","success","E99","success","E99","success","E99","success","E99","success","success","E99","E99","E99","success","success","E99","success","E99","success","E99","success","success","E99","E99","E99","success","E99","success","success","E99","E99","success","E99","success","E99","success","success","E99","E99","success","success","E99","E99","success","E99","success","success","E99","success","E99","success","E99","E99","success","success","E99","E99","success","E99","success","success","E99","E99","E99","success","success","notEnoughBalance","E99","success","success","E99","success","E99","success","notEnoughBalance","E99","success","E99","E99","success","E99","success","success","E99","success","E99","E99","success","E99","success","success","E99","success","success","E99","E99","success","notEnoughBalance","E99","E99","success","E99","success","success","E99","E99","success","success","E99")
status <- c("FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","OK01","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","FI04","OK01","FI04","OK00","OK00","OK00","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","FI04","OK00","FI04","FI04","OK00","FI04","OK00","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","FI04","OK00","FI04","FI04","OK00","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","FI04","OK00","OK00","OK01","FI04","OK00","OK00","FI04","OK00","FI04","OK00","OK01","FI04","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","FI04","FI04","OK00","FI04","OK00","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK01","FI04","FI04","OK00","FI04","OK00","OK00","FI04","FI04","OK00","OK00","FI04")
channel <- c("f2","f2","f2","f3","f2","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f2","f3","f2","f3","f3","f2","f3","f2","f2","f2","f3","f3","f2","f3","f3","f2","f2","f3","f3","f2","f3","f2","f3","f2","f3","f2","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f3","f2","f3","f2","f3","f3","f2","f3","f3","f3","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f2","f2","f3","f3","f2","f3","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f3","f2","f2","f2","f2","f3","f3","f2","f3","f3","f2","f2","f3","f3","f3","f2","f2","f3","f2","f3","f2","f3","f3","f3","f2","f2","f2","f2","f3","f3","f3","f3","f2","f2","f3","f3","f2","f2","f3","f3","f2","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f2","f3","f2","f3","f2","f3","f3","f2","f2","f3","f3","f2","f2","f2","f3","f2","f3","f3","f3","f2","f2","f3","f3","f2","f2","f2","f3","f3","f2","f3","f3","f3","f2","f2","f2","f2","f2","f2","f2","f2","f2","f2")
transaction <- c(1,6,2,5,1,2,1,9,6,12,5,25,14,6,22,9,10,14,15,12,22,12,12,14,9,11,3,3,4,0,1,4,3,1,2,3,3,5,7,5,5,6,9,16,8,13,10,20,15,18,10,19,15,5,13,12,10,12,26,14,0,4,0,0,0,2,0,0,2,0,4,0,6,8,0,2,3,0,2,0,1,0,1,0,2,0,0,2,1,1,0,0,3,0,1,0,3,0,0,6,5,2,0,8,0,0,12,11,0,2,0,11,0,0,14,21,0,0,13,7,0,17,0,0,18,0,7,0,4,4,0,0,7,12,0,13,0,0,13,6,9,0,0,0,16,0,0,16,0,14,0,0,9,0,11,8,0,8,0,0,8,0,10,5,0,15,0,0,3,0,0,8,8,0,0,6,5,0,8,0,0,5,1,0,0,3)
# Assemble the vectors defined above into one data frame, one row per
# observation (date, hour, seller, product, detail, status, channel, count).
mydata <- data.frame(datex, hourx, seller, product, detail, status, channel, transaction)

我的任务是将中位数添加到组合中。 这就是我的意思。这是一个示例:

# Keep only the rows of one seller/product/detail/status/channel combination.
# with() resolves the bare names against mydata's columns.
combination1 <- mydata[
  with(
    mydata,
    seller == "dombsdpapp1" &
      product == "00021460" &
      detail == "E99" &
      status == "FI04" &
      channel == "f2"
  ),
]

combination1
# datex hourx      seller  product detail status channel transaction
# 1   2021-01-18     0 dombsdpapp1 00021460    E99   FI04      f2           1
# 2   2021-01-18     1 dombsdpapp1 00021460    E99   FI04      f2           6
# 3   2021-01-18     2 dombsdpapp1 00021460    E99   FI04      f2           2
# 5   2021-01-18     3 dombsdpapp1 00021460    E99   FI04      f2           1
# 7   2021-01-18     4 dombsdpapp1 00021460    E99   FI04      f2           1
# 9   2021-01-18     5 dombsdpapp1 00021460    E99   FI04      f2           6
# 11  2021-01-18     6 dombsdpapp1 00021460    E99   FI04      f2           5
# 13  2021-01-18     7 dombsdpapp1 00021460    E99   FI04      f2          14
# 14  2021-01-18     8 dombsdpapp1 00021460    E99   FI04      f2           6
# 16  2021-01-18     9 dombsdpapp1 00021460    E99   FI04      f2           9
# 18  2021-01-18    10 dombsdpapp1 00021460    E99   FI04      f2          14
# 20  2021-01-18    11 dombsdpapp1 00021460    E99   FI04      f2          12
# 22  2021-01-18    12 dombsdpapp1 00021460    E99   FI04      f2          12
# 25  2021-01-18    13 dombsdpapp1 00021460    E99   FI04      f2           9
# 27  2021-01-18    14 dombsdpapp1 00021460    E99   FI04      f2           3
# 28  2021-01-18    15 dombsdpapp1 00021460    E99   FI04      f2           3
# 29  2021-01-18    16 dombsdpapp1 00021460    E99   FI04      f2           4
# 71  2021-01-19     1 dombsdpapp1 00021460    E99   FI04      f2           4
# 77  2021-01-19     2 dombsdpapp1 00021460    E99   FI04      f2           3
# 85  2021-01-19     3 dombsdpapp1 00021460    E99   FI04      f2           2
# 90  2021-01-19     4 dombsdpapp1 00021460    E99   FI04      f2           1
# 97  2021-01-19     5 dombsdpapp1 00021460    E99   FI04      f2           3
# 104 2021-01-19     6 dombsdpapp1 00021460    E99   FI04      f2           8
# 112 2021-01-19     7 dombsdpapp1 00021460    E99   FI04      f2          11
# 120 2021-01-19     8 dombsdpapp1 00021460    E99   FI04      f2           7
# 130 2021-01-19     9 dombsdpapp1 00021460    E99   FI04      f2           4
# 139 2021-01-19    10 dombsdpapp1 00021460    E99   FI04      f2          13
# 145 2021-01-19    11 dombsdpapp1 00021460    E99   FI04      f2          16
# 156 2021-01-19    12 dombsdpapp1 00021460    E99   FI04      f2           8
# 164 2021-01-19    13 dombsdpapp1 00021460    E99   FI04      f2           5
# 173 2021-01-19    14 dombsdpapp1 00021460    E99   FI04      f2           8
# 179 2021-01-19    15 dombsdpapp1 00021460    E99   FI04      f2           8
# 183 2021-01-19    16 dombsdpapp1 00021460    E99   FI04      f2           1

从结果可以看出,在 datex 为 "2021-01-18" 和 "2021-01-19" 的数据中,都缺少 hourx 为 17 到 23 的记录,所以我们需要用 combination1 的中位数来补充 hourx 17-23 的行。我是这样手动做的

## Add Median into combination1
# Filler values for the hours 17-23 that combination1 never reports: one
# vector per column, seven entries each (one per missing hour).
add_datex1 <- rep("2021-01-18", 7)
add_datex2 <- rep("2021-01-19", 7)
add_hourx <- 17:23
add_seller <- rep("dombsdpapp1", 7)
add_product <- rep("00021460", 7)
add_detail <- rep("E99", 7)
add_status <- rep("FI04", 7)
add_channel <- rep("f2", 7)
# The fill value is the median over every observed row of combination1.
add_transaction <- rep(median(combination1$transaction), 7)

# Filler rows for each date, relabelled with mydata's column names so that
# rbind() can stack them with the observed rows.
add_18 <- setNames(
  data.frame(
    add_datex1, add_hourx, add_seller, add_product,
    add_detail, add_status, add_channel, add_transaction
  ),
  colnames(mydata)
)
add_19 <- setNames(
  data.frame(
    add_datex2, add_hourx, add_seller, add_product,
    add_detail, add_status, add_channel, add_transaction
  ),
  colnames(mydata)
)

# The first 17 rows of combination1 are the 2021-01-18 observations; the
# fillers for each date are placed directly after that date's observed rows.
new_combination1 <- rbind(
  combination1[seq_len(17), ],
  add_18,
  combination1[-seq_len(17), ],
  add_19
)
rownames(new_combination1) <- seq_len(47)
new_combination1
#        datex hourx      seller  product detail status channel transaction
#1  2021-01-18     0 dombsdpapp1 00021460    E99   FI04      f2           1
#2  2021-01-18     1 dombsdpapp1 00021460    E99   FI04      f2           6
#3  2021-01-18     2 dombsdpapp1 00021460    E99   FI04      f2           2
#4  2021-01-18     3 dombsdpapp1 00021460    E99   FI04      f2           1
#5  2021-01-18     4 dombsdpapp1 00021460    E99   FI04      f2           1
#6  2021-01-18     5 dombsdpapp1 00021460    E99   FI04      f2           6
#7  2021-01-18     6 dombsdpapp1 00021460    E99   FI04      f2           5
#8  2021-01-18     7 dombsdpapp1 00021460    E99   FI04      f2          14
#9  2021-01-18     8 dombsdpapp1 00021460    E99   FI04      f2           6
#10 2021-01-18     9 dombsdpapp1 00021460    E99   FI04      f2           9
#11 2021-01-18    10 dombsdpapp1 00021460    E99   FI04      f2          14
#12 2021-01-18    11 dombsdpapp1 00021460    E99   FI04      f2          12
#13 2021-01-18    12 dombsdpapp1 00021460    E99   FI04      f2          12
#14 2021-01-18    13 dombsdpapp1 00021460    E99   FI04      f2           9
#15 2021-01-18    14 dombsdpapp1 00021460    E99   FI04      f2           3
#16 2021-01-18    15 dombsdpapp1 00021460    E99   FI04      f2           3
#17 2021-01-18    16 dombsdpapp1 00021460    E99   FI04      f2           4
#18 2021-01-18    17 dombsdpapp1 00021460    E99   FI04      f2           6
#19 2021-01-18    18 dombsdpapp1 00021460    E99   FI04      f2           6
#20 2021-01-18    19 dombsdpapp1 00021460    E99   FI04      f2           6
#21 2021-01-18    20 dombsdpapp1 00021460    E99   FI04      f2           6
#22 2021-01-18    21 dombsdpapp1 00021460    E99   FI04      f2           6
#23 2021-01-18    22 dombsdpapp1 00021460    E99   FI04      f2           6
#24 2021-01-18    23 dombsdpapp1 00021460    E99   FI04      f2           6
#25 2021-01-19     1 dombsdpapp1 00021460    E99   FI04      f2           4
#26 2021-01-19     2 dombsdpapp1 00021460    E99   FI04      f2           3
#27 2021-01-19     3 dombsdpapp1 00021460    E99   FI04      f2           2
#28 2021-01-19     4 dombsdpapp1 00021460    E99   FI04      f2           1
#29 2021-01-19     5 dombsdpapp1 00021460    E99   FI04      f2           3
#30 2021-01-19     6 dombsdpapp1 00021460    E99   FI04      f2           8
#31 2021-01-19     7 dombsdpapp1 00021460    E99   FI04      f2          11
#32 2021-01-19     8 dombsdpapp1 00021460    E99   FI04      f2           7
#33 2021-01-19     9 dombsdpapp1 00021460    E99   FI04      f2           4
#34 2021-01-19    10 dombsdpapp1 00021460    E99   FI04      f2          13
#35 2021-01-19    11 dombsdpapp1 00021460    E99   FI04      f2          16
#36 2021-01-19    12 dombsdpapp1 00021460    E99   FI04      f2           8
#37 2021-01-19    13 dombsdpapp1 00021460    E99   FI04      f2           5
#38 2021-01-19    14 dombsdpapp1 00021460    E99   FI04      f2           8
#39 2021-01-19    15 dombsdpapp1 00021460    E99   FI04      f2           8
#40 2021-01-19    16 dombsdpapp1 00021460    E99   FI04      f2           1
#41 2021-01-19    17 dombsdpapp1 00021460    E99   FI04      f2           6
#42 2021-01-19    18 dombsdpapp1 00021460    E99   FI04      f2           6
#43 2021-01-19    19 dombsdpapp1 00021460    E99   FI04      f2           6
#44 2021-01-19    20 dombsdpapp1 00021460    E99   FI04      f2           6
#45 2021-01-19    21 dombsdpapp1 00021460    E99   FI04      f2           6
#46 2021-01-19    22 dombsdpapp1 00021460    E99   FI04      f2           6
#47 2021-01-19    23 dombsdpapp1 00021460    E99   FI04      f2           6

如何使用 dplyr 为所有组合自动将中位数添加到缺失的“hourx”中? 非常感谢。

这是一种使用 dplyr 确定每个组的中位数的方法,然后使用 lubridate 将日期和小时合并到一个字段中,然后使用 padr 填充所有缺失的小时行,最后使用 join & coalesce 添加那些缺失行的中位数。

library(padr); library(dplyr); library(lubridate)
# Columns that identify one combination; shared by the padding step and the
# join so both always use the same keys.
combo_keys <- c("seller", "product", "detail", "status", "channel")

# Median transaction per combination, taken over every observed date and hour.
medians <- mydata %>%
  group_by(seller, product, detail, status, channel) %>%
  summarise(median = median(transaction), .groups = "drop")

mydata %>%
  # Merge date and hour into one timestamp so padr can detect hourly gaps.
  mutate(timestamp = lubridate::ymd_h(paste(datex, hourx))) %>%
  # NOTE(review): pad() appears to span only each group's first-to-last
  # timestamp by default, so hours after a group's last observation are not
  # inserted unless start_val / end_val are supplied -- confirm in padr docs.
  padr::pad(by = "timestamp", group = combo_keys) %>%
  # Inserted rows carry NA in every non-key column; rebuild the date and hour
  # from the timestamp so the padded rows are fully labelled.
  mutate(
    datex = format(timestamp, "%Y-%m-%d"),
    hourx = as.numeric(lubridate::hour(timestamp))
  ) %>%
  # Explicit keys: no "Joining, by = ..." message and no accidental extra key
  # if another shared column is added later.
  left_join(medians, by = combo_keys) %>%
  # Keep observed counts; fall back to the combination median on padded rows.
  mutate(filled_transaction = coalesce(transaction, median))

创建一个函数,用来补全 hourx(0 到 23)并用中位数填充缺失的 transaction:

#' Complete the hours of one group and fill the gaps with the group median
#'
#' Builds the full 0-23 hour grid, left-joins `df` onto it and replaces every
#' missing `transaction` with the median of the observed ones. Functions are
#' namespace-qualified so the helper works even when dplyr and tidyr are
#' installed but not attached.
#'
#' @param df A data frame with at least the columns `hourx` and `transaction`.
#' @return A data frame with `hourx` as first column followed by the other
#'   columns of `df`. Every hour 0-23 is present (repeated hours in `df` stay
#'   repeated; rows of `df` with an hour outside 0-23 are dropped by the
#'   join). Columns other than `transaction` stay `NA` on added rows.
fillHour <- function(df) {
  all_hours <- data.frame(hourx = 0:23)
  completed <- dplyr::left_join(all_hours, df, by = "hourx")

  # Median of the observed values only; the rows added by the join are NA.
  fill_value <- median(completed[["transaction"]], na.rm = TRUE)

  tidyr::replace_na(completed, list(transaction = fill_value))
}

对数据框按组进行 nest,然后对嵌套的 data 列应用 fillHour:

# nest()/unnest() come from tidyr and map() from purrr; attach them here so
# this snippet does not depend on packages loaded further down the file.
library(dplyr)
library(tidyr)
library(purrr)

mydata %>%
  group_by(datex, seller, product, detail, status, channel) %>%
  # One row per date/combination, with that group's rows in list-column `data`.
  nest() %>%
  # Complete hours 0-23 and fill missing transactions within each group.
  mutate(data = map(data, fillHour)) %>%
  unnest(cols = "data") %>%
  ungroup()

您需要的是 NA 插补(imputation)和 complete。建议采用以下策略-

  • group_by 一次对所有需要的分组变量进行处理(因为在进行 NA 插补时无论如何你都需要这些变量)
  • 使用 tidyr::complete 完成每个组的 0:23 小时序列。
  • 最后使用 coalesce 进行 NA 插补

library(tidyverse)

# Complete hours 0-23 within every date/combination group, then replace the
# transactions that are missing with that group's median of observed values.
# The result stays grouped (see the "# Groups:" header in the printed output);
# add ungroup() if later steps should not inherit the grouping.
mydata %>%
  group_by(datex, seller, product, detail, status, channel) %>%
  complete(hourx = 0:23) %>%
  mutate(
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    transaction = coalesce(transaction, median(transaction, na.rm = TRUE))
  )
# A tibble: 408 x 8
# Groups:   datex, seller, product, detail, status, channel [17]
   datex      seller      product  detail status channel hourx transaction
   <chr>      <chr>       <chr>    <chr>  <chr>  <chr>   <dbl>       <dbl>
 1 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          0          12
 2 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          1          12
 3 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          2          12
 4 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          3           5
 5 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          4           2
 6 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          5           9
 7 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          6          12
 8 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          7          25
 9 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          8          22
10 2021-01-18 dombsdpapp1 00021459 E99    FI04   f3          9          10
# ... with 398 more rows

由 reprex 包 (v2.0.0) 于 2021-05-18 创建