使用 R 中另一个数据帧的日期时间过滤和汇总来自一个数据帧的数据
Filtering and summarising data from one dataframe using datetimes from another in R
我正在尝试使用一个数据框中的日期时间戳来过滤和汇总另一个数据框中的数据。这也是通过分组变量(一个 UserId)来完成的。以下是我正在处理的数据的一些虚构示例:-
df1
# Example event data (dput output): a plain data.frame with 41 rows.
# Columns:
#   UserId         - character user identifier (several rows per user)
#   Duration       - integer duration of the event (units not stated here)
#   Genre          - character genre; every row in this sample is "Sport"
#   DateTime_Start - POSIXct start time of the event, time zone UTC
df1<-structure(list(UserId = c("6i9Gla", "6i9Gla", "6i9Gla", "6i9Gla",
"6i9Gla", "6i9Gla", "59hGIY", "59hGIY", "LzDaPX", "LzDaPX", "LzDaPX",
"LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o3wUUC", "o3wUUC", "o3wUUC",
"gXbJAq", "gXbJAq"), Duration = c(632L, 167L, 868L, 27L, 309L,
671L, 7L, 8L, 7L, 19L, 81L, 600L, 391L, 615L, 332L, 197L, 168L,
27L, 836L, 257L, 24L, 555L, 99L, 286L, 387L, 11L, 79L, 181L,
293L, 126L, 6L, 10L, 1247L, 259L, 11L, 547L, 28L, 19L, 17L, 7L,
10L), Genre = c("Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport"), DateTime_Start = structure(c(1614292441.754, 1614291282.352,
1614291509.308, 1614288742.042, 1614294373.856, 1614293122.735,
1614294911.325, 1614289403.922, 1614289358.205, 1614290574.724,
1614293909.406, 1614295977.859, 1614294049.531, 1614294711.345,
1614295613.728, 1614294488.27, 1614295400.927, 1614293277.01,
1614290364.352, 1614293571.4, 1614293873.58, 1614292529.847,
1614291353.127, 1614296784.14, 1614295424.46, 1614294012.164,
1614293909.164, 1614292191.902, 1614291777.142, 1614295930.443,
1614292521.197, 1614291208.605, 1614294092.039, 1614293283.587,
1614294040.841, 1614296214.851, 1614292701.846, 1614296929.017,
1614294151.79, 1614292835.834, 1614288948.473), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -41L), class = "data.frame")
head(df1)
UserId Duration Genre DateTime_Start
1 6i9Gla 632 Sport 2021-02-25 22:34:01
2 6i9Gla 167 Sport 2021-02-25 22:14:42
3 6i9Gla 868 Sport 2021-02-25 22:18:29
4 6i9Gla 27 Sport 2021-02-25 21:32:22
5 6i9Gla 309 Sport 2021-02-25 23:06:13
6 6i9Gla 671 Sport 2021-02-25 22:45:22
df2
# Example per-user time windows (dput output): a plain data.frame with 6 rows,
# one row per UserId.
# Columns:
#   UserId         - character user identifier
#   OrigTime       - POSIXct start of the user's window, time zone UTC
#   LastTime       - POSIXct end of the user's window, time zone UTC
#   events_recount - integer count stored alongside each window
df2<-structure(list(UserId = c("6i9Gla", "59hGIY", "LzDaPX", "o0fsPt",
"o3wUUC", "gXbJAq"), OrigTime = structure(c(1614288742.042, 1614289403.922,
1614289358.205, 1614290364.352, 1614292701.846, 1614288948.473
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), LastTime = structure(c(1614291509.308,
1614289403.922, 1614290574.724, 1614293909.164, 1614294151.79,
1614288948.473), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
events_recount = c(3L, 1L, 2L, 11L, 2L, 1L)), row.names = c(NA,
-6L), class = "data.frame")
head(df2)
UserId OrigTime LastTime events_recount
1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29 3
2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23 1
3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54 2
4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29 11
5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31 2
6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48 1
本质上,我试图按 UserId 进行分组,然后过滤并汇总 df1 中发生在 df2 的 OrigTime 和 LastTime 之间的行。以下是我想对过滤后的行进行汇总的内容:
- 总持续时间(对 df1 中的 Duration 列求和)
- 平均持续时间(df1 中 Duration 列的平均值)
- df1 中观看次数最多的 Genre
- Genre 改变了多少次(我知道在这个示例数据中,它将是零,但我需要将解决方案应用于更大、更多样化的真实数据集)
在此之后,我希望将过滤和摘要的输出合并回 df2。
关于如何做到这一点的任何指示?非常感谢!
此代码可能有效。
#' Count how many times consecutive values in a vector differ.
#'
#' @param x An atomic vector, e.g. the Genre values of one user. The caller
#'   is responsible for putting the values in the order that matters
#'   (typically chronological) before calling.
#' @return A single number: how many positions i have x[i] != x[i + 1].
#'   Returns 0 for vectors of length 0 or 1. If x contains NA, the result
#'   is NA.
change_ftn <- function(x) {
  n <- length(x)
  # Fewer than two values means there is no adjacent pair to compare.
  if (n < 2L) {
    return(0)
  }
  # Compare every element with its successor in one vectorised step, so that
  # back-to-back changes (e.g. A, B, A) are each counted.
  as.numeric(sum(x[-1L] != x[-n]))
}
library(dplyr)

# Per-user summary of the df1 events that fall inside the window
# [OrigTime, LastTime] that df2 holds for the same user.
user_summary <- df1 %>%
  inner_join(df2, by = "UserId") %>%
  # The bounds are inclusive: in the sample data OrigTime and LastTime are
  # themselves event timestamps, so strict comparisons would drop the first
  # and last event of every window (and every single-event user entirely).
  filter(DateTime_Start >= OrigTime, DateTime_Start <= LastTime) %>%
  # change_ftn() compares neighbouring values, so the events of each user
  # have to be in chronological order before it is applied.
  arrange(UserId, DateTime_Start) %>%
  group_by(UserId) %>%
  summarise(
    sum = sum(Duration),
    mean = mean(Duration),
    # name of the most frequent Genre (first one wins on ties)
    max = names(which.max(table(Genre))),
    change = change_ftn(Genre)
  )

# Merge the summary back onto df2; a user with no event inside the window
# keeps its df2 row and gets NA in the summary columns.
df2 %>%
  left_join(user_summary, by = "UserId")
#>   UserId            OrigTime            LastTime events_recount  sum     mean   max change
#> 1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29              3 1062 354.0000 Sport      0
#> 2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23              1    8   8.0000 Sport      0
#> 3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54              2   26  13.0000 Sport      0
#> 4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29             11 2599 236.2727 Sport      0
#> 5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31              2   45  22.5000 Sport      0
#> 6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48              1   10  10.0000 Sport      0
library(data.table)

# Turn both data frames into data.tables by reference. The former row names
# are kept in the columns 'rn1' (df1) and 'rn2' (df2).
setDT(df1, keep.rownames = "rn1")
setDT(df2, keep.rownames = "rn2")

# foverlaps() needs a start and an end column on both sides, so give every
# df1 event a zero-length interval by copying its start time.
df1[, `:=`(temp_time = DateTime_Start)]

# Key both tables: UserId first, then the interval start and end columns.
setkeyv(df1, c("UserId", "DateTime_Start", "temp_time"))
setkeyv(df2, c("UserId", "OrigTime", "LastTime"))

# Overlap join on the keys of both tables: pairs each df2 window with the
# df1 events of the same user whose time point lies inside that window.
answer <- foverlaps(df2, df1, by.x = key(df2), by.y = key(df1))
# 'answer' can now be summarised per UserId.
这是我们如何做的分步指南:
library(tidyverse)

# Events of df1 that lie inside the inclusive window [OrigTime, LastTime]
# of the same user in df2, in chronological order per user. All four
# questions are answered from this filtered table.
in_window <- df1 %>%
  inner_join(df2, by = "UserId") %>%
  filter(DateTime_Start >= OrigTime, DateTime_Start <= LastTime) %>%
  arrange(UserId, DateTime_Start)

# Question 1 and 2: total and mean Duration per user
sum_mean_Duration <- in_window %>%
  group_by(UserId) %>%
  summarise(Total_duration = sum(Duration), Mean_duration = mean(Duration))

# Question 3: most watched Genre per user (n = number of events of that Genre).
# with_ties = FALSE keeps exactly one row per user, so the join below cannot
# duplicate df2 rows when two genres are equally frequent.
Max_watched <- in_window %>%
  count(UserId, Genre) %>%
  group_by(UserId) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup()

# Question 4: number of times Genre differs from the previous event of the
# same user. The first event has no predecessor (lag() gives NA), hence
# na.rm = TRUE.
Change_Genre <- in_window %>%
  group_by(UserId) %>%
  summarise(Change_Genre = sum(Genre != dplyr::lag(Genre), na.rm = TRUE))

# the single dataframes
sum_mean_Duration
Max_watched
Change_Genre

# Bring them all together with df2
list(df2, sum_mean_Duration, Max_watched, Change_Genre) %>%
  reduce(left_join, by = "UserId")
# Output:
#>   UserId            OrigTime            LastTime events_recount Total_duration Mean_duration Genre  n Change_Genre
#> 1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29              3           1062      354.0000 Sport  3            0
#> 2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23              1              8        8.0000 Sport  1            0
#> 3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54              2             26       13.0000 Sport  2            0
#> 4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29             11           2599      236.2727 Sport 11            0
#> 5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31              2             45       22.5000 Sport  2            0
#> 6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48              1             10       10.0000 Sport  1            0
我正在尝试使用一个数据框中的日期时间戳来过滤和汇总另一个数据框中的数据。这也是通过分组变量(一个 UserId)来完成的。以下是我正在处理的数据的一些虚构示例:-
df1
# Example event data (dput output): a plain data.frame with 41 rows.
# Columns:
#   UserId         - character user identifier (several rows per user)
#   Duration       - integer duration of the event (units not stated here)
#   Genre          - character genre; every row in this sample is "Sport"
#   DateTime_Start - POSIXct start time of the event, time zone UTC
df1<-structure(list(UserId = c("6i9Gla", "6i9Gla", "6i9Gla", "6i9Gla",
"6i9Gla", "6i9Gla", "59hGIY", "59hGIY", "LzDaPX", "LzDaPX", "LzDaPX",
"LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX", "LzDaPX",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt",
"o0fsPt", "o0fsPt", "o0fsPt", "o0fsPt", "o3wUUC", "o3wUUC", "o3wUUC",
"gXbJAq", "gXbJAq"), Duration = c(632L, 167L, 868L, 27L, 309L,
671L, 7L, 8L, 7L, 19L, 81L, 600L, 391L, 615L, 332L, 197L, 168L,
27L, 836L, 257L, 24L, 555L, 99L, 286L, 387L, 11L, 79L, 181L,
293L, 126L, 6L, 10L, 1247L, 259L, 11L, 547L, 28L, 19L, 17L, 7L,
10L), Genre = c("Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport"), DateTime_Start = structure(c(1614292441.754, 1614291282.352,
1614291509.308, 1614288742.042, 1614294373.856, 1614293122.735,
1614294911.325, 1614289403.922, 1614289358.205, 1614290574.724,
1614293909.406, 1614295977.859, 1614294049.531, 1614294711.345,
1614295613.728, 1614294488.27, 1614295400.927, 1614293277.01,
1614290364.352, 1614293571.4, 1614293873.58, 1614292529.847,
1614291353.127, 1614296784.14, 1614295424.46, 1614294012.164,
1614293909.164, 1614292191.902, 1614291777.142, 1614295930.443,
1614292521.197, 1614291208.605, 1614294092.039, 1614293283.587,
1614294040.841, 1614296214.851, 1614292701.846, 1614296929.017,
1614294151.79, 1614292835.834, 1614288948.473), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -41L), class = "data.frame")
head(df1)
UserId Duration Genre DateTime_Start
1 6i9Gla 632 Sport 2021-02-25 22:34:01
2 6i9Gla 167 Sport 2021-02-25 22:14:42
3 6i9Gla 868 Sport 2021-02-25 22:18:29
4 6i9Gla 27 Sport 2021-02-25 21:32:22
5 6i9Gla 309 Sport 2021-02-25 23:06:13
6 6i9Gla 671 Sport 2021-02-25 22:45:22
df2
# Example per-user time windows (dput output): a plain data.frame with 6 rows,
# one row per UserId.
# Columns:
#   UserId         - character user identifier
#   OrigTime       - POSIXct start of the user's window, time zone UTC
#   LastTime       - POSIXct end of the user's window, time zone UTC
#   events_recount - integer count stored alongside each window
df2<-structure(list(UserId = c("6i9Gla", "59hGIY", "LzDaPX", "o0fsPt",
"o3wUUC", "gXbJAq"), OrigTime = structure(c(1614288742.042, 1614289403.922,
1614289358.205, 1614290364.352, 1614292701.846, 1614288948.473
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), LastTime = structure(c(1614291509.308,
1614289403.922, 1614290574.724, 1614293909.164, 1614294151.79,
1614288948.473), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
events_recount = c(3L, 1L, 2L, 11L, 2L, 1L)), row.names = c(NA,
-6L), class = "data.frame")
head(df2)
UserId OrigTime LastTime events_recount
1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29 3
2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23 1
3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54 2
4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29 11
5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31 2
6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48 1
本质上,我试图按 UserId 进行分组,然后过滤并汇总 df1 中发生在 df2 的 OrigTime 和 LastTime 之间的行。以下是我想对过滤后的行进行汇总的内容:
- 总持续时间(对 df1 中的 Duration 列求和)
- 平均持续时间(df1 中 Duration 列的平均值)
- df1 中观看次数最多的 Genre
- Genre 改变了多少次(我知道在这个示例数据中,它将是零,但我需要将解决方案应用于更大、更多样化的真实数据集)
在此之后,我希望将过滤和摘要的输出合并回 df2。
关于如何做到这一点的任何指示?非常感谢!
此代码可能有效。
#' Count how many times consecutive values in a vector differ.
#'
#' @param x An atomic vector, e.g. the Genre values of one user. The caller
#'   is responsible for putting the values in the order that matters
#'   (typically chronological) before calling.
#' @return A single number: how many positions i have x[i] != x[i + 1].
#'   Returns 0 for vectors of length 0 or 1. If x contains NA, the result
#'   is NA.
change_ftn <- function(x) {
  n <- length(x)
  # Fewer than two values means there is no adjacent pair to compare.
  if (n < 2L) {
    return(0)
  }
  # Compare every element with its successor in one vectorised step, so that
  # back-to-back changes (e.g. A, B, A) are each counted.
  as.numeric(sum(x[-1L] != x[-n]))
}
library(dplyr)

# Per-user summary of the df1 events that fall inside the window
# [OrigTime, LastTime] that df2 holds for the same user.
user_summary <- df1 %>%
  inner_join(df2, by = "UserId") %>%
  # The bounds are inclusive: in the sample data OrigTime and LastTime are
  # themselves event timestamps, so strict comparisons would drop the first
  # and last event of every window (and every single-event user entirely).
  filter(DateTime_Start >= OrigTime, DateTime_Start <= LastTime) %>%
  # change_ftn() compares neighbouring values, so the events of each user
  # have to be in chronological order before it is applied.
  arrange(UserId, DateTime_Start) %>%
  group_by(UserId) %>%
  summarise(
    sum = sum(Duration),
    mean = mean(Duration),
    # name of the most frequent Genre (first one wins on ties)
    max = names(which.max(table(Genre))),
    change = change_ftn(Genre)
  )

# Merge the summary back onto df2; a user with no event inside the window
# keeps its df2 row and gets NA in the summary columns.
df2 %>%
  left_join(user_summary, by = "UserId")
#>   UserId            OrigTime            LastTime events_recount  sum     mean   max change
#> 1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29              3 1062 354.0000 Sport      0
#> 2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23              1    8   8.0000 Sport      0
#> 3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54              2   26  13.0000 Sport      0
#> 4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29             11 2599 236.2727 Sport      0
#> 5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31              2   45  22.5000 Sport      0
#> 6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48              1   10  10.0000 Sport      0
library(data.table)

# Turn both data frames into data.tables by reference. The former row names
# are kept in the columns 'rn1' (df1) and 'rn2' (df2).
setDT(df1, keep.rownames = "rn1")
setDT(df2, keep.rownames = "rn2")

# foverlaps() needs a start and an end column on both sides, so give every
# df1 event a zero-length interval by copying its start time.
df1[, `:=`(temp_time = DateTime_Start)]

# Key both tables: UserId first, then the interval start and end columns.
setkeyv(df1, c("UserId", "DateTime_Start", "temp_time"))
setkeyv(df2, c("UserId", "OrigTime", "LastTime"))

# Overlap join on the keys of both tables: pairs each df2 window with the
# df1 events of the same user whose time point lies inside that window.
answer <- foverlaps(df2, df1, by.x = key(df2), by.y = key(df1))
# 'answer' can now be summarised per UserId.
这是我们如何做的分步指南:
library(tidyverse)

# Events of df1 that lie inside the inclusive window [OrigTime, LastTime]
# of the same user in df2, in chronological order per user. All four
# questions are answered from this filtered table.
in_window <- df1 %>%
  inner_join(df2, by = "UserId") %>%
  filter(DateTime_Start >= OrigTime, DateTime_Start <= LastTime) %>%
  arrange(UserId, DateTime_Start)

# Question 1 and 2: total and mean Duration per user
sum_mean_Duration <- in_window %>%
  group_by(UserId) %>%
  summarise(Total_duration = sum(Duration), Mean_duration = mean(Duration))

# Question 3: most watched Genre per user (n = number of events of that Genre).
# with_ties = FALSE keeps exactly one row per user, so the join below cannot
# duplicate df2 rows when two genres are equally frequent.
Max_watched <- in_window %>%
  count(UserId, Genre) %>%
  group_by(UserId) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup()

# Question 4: number of times Genre differs from the previous event of the
# same user. The first event has no predecessor (lag() gives NA), hence
# na.rm = TRUE.
Change_Genre <- in_window %>%
  group_by(UserId) %>%
  summarise(Change_Genre = sum(Genre != dplyr::lag(Genre), na.rm = TRUE))

# the single dataframes
sum_mean_Duration
Max_watched
Change_Genre

# Bring them all together with df2
list(df2, sum_mean_Duration, Max_watched, Change_Genre) %>%
  reduce(left_join, by = "UserId")
# Output:
#>   UserId            OrigTime            LastTime events_recount Total_duration Mean_duration Genre  n Change_Genre
#> 1 6i9Gla 2021-02-25 21:32:22 2021-02-25 22:18:29              3           1062      354.0000 Sport  3            0
#> 2 59hGIY 2021-02-25 21:43:23 2021-02-25 21:43:23              1              8        8.0000 Sport  1            0
#> 3 LzDaPX 2021-02-25 21:42:38 2021-02-25 22:02:54              2             26       13.0000 Sport  2            0
#> 4 o0fsPt 2021-02-25 21:59:24 2021-02-25 22:58:29             11           2599      236.2727 Sport 11            0
#> 5 o3wUUC 2021-02-25 22:38:21 2021-02-25 23:02:31              2             45       22.5000 Sport  2            0
#> 6 gXbJAq 2021-02-25 21:35:48 2021-02-25 21:35:48              1             10       10.0000 Sport  1            0