使用 R 中另一个数据帧的日期时间过滤和汇总来自一个数据帧的数据

Filtering and summarising data from one dataframe using datetimes from another in R

我正在尝试使用一个数据框中的日期时间戳来过滤和汇总另一个数据框中的数据。这也是通过分组变量(一个 UserId)来完成的。以下是我正在处理的数据的一些虚构示例:-

df1

df1<-structure(list(UserId = c("6i9Gla", "6i9Gla", "6i9Gla", "6i9Gla", 
"6i9Gla", "6i9Gla", "59hGIY", "59hGIY", "LzDaPX", "LzDaPX", "LzDaPX", 
"LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", 
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", 
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", 
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o3wUUC", "o3wUUC", "o3wUUC", 
"gXbJAq", "gXbJAq"), Duration = c(632L, 167L, 868L, 27L, 309L, 
671L, 7L, 8L, 7L, 19L, 81L, 600L, 391L, 615L, 332L, 197L, 168L, 
27L, 836L, 257L, 24L, 555L, 99L, 286L, 387L, 11L, 79L, 181L, 
293L, 126L, 6L, 10L, 1247L, 259L, 11L, 547L, 28L, 19L, 17L, 7L, 
10L), Genre = c("Sport", "Sport", "Sport", "Sport", "Sport", 
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport", 
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport", 
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport", 
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport", 
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport", 
"Sport"), DateTime_Start = structure(c(1614292441.754, 1614291282.352, 
1614291509.308, 1614288742.042, 1614294373.856, 1614293122.735, 
1614294911.325, 1614289403.922, 1614289358.205, 1614290574.724, 
1614293909.406, 1614295977.859, 1614294049.531, 1614294711.345, 
1614295613.728, 1614294488.27, 1614295400.927, 1614293277.01, 
1614290364.352, 1614293571.4, 1614293873.58, 1614292529.847, 
1614291353.127, 1614296784.14, 1614295424.46, 1614294012.164, 
1614293909.164, 1614292191.902, 1614291777.142, 1614295930.443, 
1614292521.197, 1614291208.605, 1614294092.039, 1614293283.587, 
1614294040.841, 1614296214.851, 1614292701.846, 1614296929.017, 
1614294151.79, 1614292835.834, 1614288948.473), class = c("POSIXct", 
"POSIXt"), tzone = "UTC")), row.names = c(NA, -41L), class = "data.frame")


head(df1)
  UserId Duration Genre      DateTime_Start
1 6i9Gla      632 Sport 2021-02-25 22:34:01
2 6i9Gla      167 Sport 2021-02-25 22:14:42
3 6i9Gla      868 Sport 2021-02-25 22:18:29
4 6i9Gla       27 Sport 2021-02-25 21:32:22
5 6i9Gla      309 Sport 2021-02-25 23:06:13
6 6i9Gla      671 Sport 2021-02-25 22:45:22


df2

df2<-structure(list(UserId = c("6i9Gla", "59hGIY", "LzDaPX", "o0fsPt", 
"o3wUUC", "gXbJAq"), OrigTime = structure(c(1614288742.042, 1614289403.922, 
1614289358.205, 1614290364.352, 1614292701.846, 1614288948.473
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), LastTime = structure(c(1614291509.308, 
1614289403.922, 1614290574.724, 1614293909.164, 1614294151.79, 
1614288948.473), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    events_recount = c(3L, 1L, 2L, 11L, 2L, 1L)), row.names = c(NA, 
-6L), class = "data.frame")


head(df2)

  UserId            OrigTime            LastTime events_recount
1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29              3
2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23              1
3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54              2
4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29             11
5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31              2
6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48              1

本质上,我试图按 UserId 进行分组,然后过滤并汇总 df1 中发生在 df2 中 OrigTimeLastTime 之间的行。以下是我想在过滤后的行之间总结的内容:-

在此之后,我希望将过滤和摘要的输出合并回 df2。

关于如何做到这一点的任何指示?非常感谢!

此代码可能有效。

change_ftn <- function(x){
  count <- 0
  eval <- 1
  if (length(x) == 1){
    count <- 0
  }
  else {
    for (i in 1:(length(x)-1)){
      if(eval == 1){
        if(!(x[i] == x[i+1]) ){
          count <- count + 1
          eval <- 0
        }
      } else {
        eval <- 1
      }
    }
  }
  count
}

df1 %>%
  left_join(df2, "UserId") %>%
  filter(DateTime_Start > OrigTime, DateTime_Start < LastTime) %>%
  group_by(UserId) %>%
  summarise(UserId = unique(UserId),
            OrigTime = unique(OrigTime),
            LastTime = unique(LastTime),
            events_recount = unique(events_recount),
            sum = sum(Duration),
            mean = mean(Duration),
            max = which.max(table(Genre)) %>% names,
            change = change_ftn(Genre)
         )

  UserId OrigTime            LastTime            events_recount   sum  mean max   change
  <chr>  <dttm>              <dttm>                       <int> <int> <dbl> <chr>  <dbl>
1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29              3   167  167  Sport      0
2 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29             11  1684  187. Sport      0
library(data.table)
# Convert df1 and df2 to data.table format
#  keep rownames in column 'rn' and set keys 
setDT(df1, keep.rownames = "rn1")
setDT(df2, keep.rownames = "rn2")
# create an end-dummyvariable in df1
df1[, temp_time := DateTime_Start]
# set keys
setkey(df1, UserId, DateTime_Start, temp_time)
setkey(df2, UserId, OrigTime, LastTime)
# perform overlap join
answer <- foverlaps(df2, df1)
# now you can summarise based on this data.table

这是我们如何做的分步指南:

library(tidyverse)

# Question 1 and 2
sum_mean_Duration <- df1 %>% 
  left_join(df2, by="UserId") %>% 
  group_by(UserId) %>% 
  mutate(condition = ifelse(between(DateTime_Start, OrigTime, LastTime), 1, 0)) %>% 
  filter(condition == 1) %>% 
  summarise(Total_duration = sum(Duration), Mean_duration = mean(Duration)) 


# Question 3 (I added some mock data "Thriller")
Max_watched <- df1 %>% 
  left_join(df2, by="UserId") %>% 
  group_by(UserId) %>%
  count(Genre) %>% 
  filter(n == max(n))

# Question 4 (I added some mock data "Thriller")
Change_Genre <- df1 %>% 
  left_join(df2, by="UserId") %>% 
  group_by(UserId) %>% 
  summarise(Change_Genre = n_distinct(Genre)) 

# the single dataframes
sum_mean_Duration
Max_watched
Change_Genre

# Bring them all together with df2
list(df2, sum_mean_Duration, Max_watched, Change_Genre) %>% 
  reduce(left_join, by = "UserId")

输出:

  UserId            OrigTime            LastTime events_recount Total_duration Mean_duration Genre  n Change_Genre
1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29              3           1062      354.0000 Sport  4            2
2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23              1              8        8.0000 Sport  2            1
3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54              2             26       13.0000 Sport  9            2
4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29             11           2599      236.2727 Sport 17            2
5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31              2             45       22.5000 Sport  3            1
6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48              1             10       10.0000 Sport  2            1