R 在包含多个连接(join)和过滤(filter)的 dplyr summarize 操作上性能不佳
R underperforming on dplyr summarize with multiple joins and filters
我有以下包含 3 个数据帧的示例数据集:
# Sample session table: two sessions that share one anon_id.
# Timestamps are stored as character strings, not as POSIXct values.
base_pop_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 2),
    session_number = c(1, 2),
    entrance_date = c(
      "2021-06-28 11:43:21.633 Z",
      "2021-06-29 01:10:08.109 Z"
    ),
    single_article_session = c(0, 0)
  ),
  # Column names come from the named list above; compact automatic row names.
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Sample ad-view table: nine rows for one anon_id, each with a timestamp
# (character string) and an ad_call_count.
ad_views_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 9),
    ad_view_date = c(
      "2021-06-28 11:43:22.654 Z", "2021-06-28 11:44:15.360 Z",
      "2021-06-28 11:44:32.538 Z", "2021-06-28 12:07:19.557 Z",
      "2021-06-28 12:07:20.146 Z", "2021-06-29 01:10:08.706 Z",
      "2021-06-29 01:10:17.127 Z", "2021-06-29 01:40:30.726 Z",
      "2021-06-29 01:40:30.914 Z"
    ),
    ad_call_count = c(3, 1, 1, 1, 3, 3, 1, 1, 3)
  ),
  # Column names come from the named list above; compact automatic row names.
  row.names = c(NA, -9L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Sample scroll table: 26 rows for one anon_id, each with a timestamp
# (character string) and a scroll_depth value.
scroll_depth_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 26),
    scroll_date = c(
      "2021-06-28 11:43:38.263 Z", "2021-06-28 11:43:41.593 Z",
      "2021-06-28 11:43:48.882 Z", "2021-06-28 11:43:49.339 Z",
      "2021-06-28 11:43:52.270 Z", "2021-06-28 11:43:57.995 Z",
      "2021-06-28 11:44:15.324 Z", "2021-06-28 11:44:16.955 Z",
      "2021-06-28 11:44:30.284 Z", "2021-06-28 11:44:44.197 Z",
      "2021-06-28 12:07:19.564 Z", "2021-06-28 12:07:19.581 Z",
      "2021-06-28 12:07:19.593 Z", "2021-06-28 12:07:19.600 Z",
      "2021-06-28 12:07:19.617 Z", "2021-06-28 12:07:19.639 Z",
      "2021-06-28 12:07:19.648 Z", "2021-06-28 12:07:19.664 Z",
      "2021-06-29 01:10:13.401 Z", "2021-06-29 01:10:25.065 Z",
      "2021-06-29 01:11:02.595 Z", "2021-06-29 01:11:45.444 Z",
      "2021-06-29 01:40:30.741 Z", "2021-06-29 01:40:30.747 Z",
      "2021-06-29 01:40:30.903 Z", "2021-06-29 01:40:30.909 Z"
    ),
    scroll_depth = c(
      10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
      10, 20, 30, 40, 50, 60, 70, 80,
      10, 20, 30, 40,
      10, 20, 30, 40
    )
  ),
  # Column names come from the named list above; compact automatic row names.
  row.names = c(NA, -26L),
  class = c("tbl_df", "tbl", "data.frame")
)
我想连接(join)这三个数据框,最终得到 anon_id、entrance_date、session_number、ad_views 和 scroll_depth 这几列:
- ad_views 是数据框 ad_views_ex 中所有 ad_call_count 的总和,条件是 ad_view_date 不早于 base_pop_ex 中的 entrance_date,并且两个日期之间相差不超过 60 分钟。
- scroll_depth 的连接逻辑与上一个指标相同,但这里计算的是每组事件的最大值。
下面的代码可以完成这项工作:
library(dplyr) # left_join(), filter(), group_by() and summarize() live here
library(tidyr)
library(lubridate)

# Total ad calls per session: sum ad_call_count over the ad views that happen
# at or after the session's entrance_date and at most 60 minutes later.
#
# The join key is anon_id only, so every session of a user is paired with
# every ad view of that user before the time-window filter runs; the
# intermediate table can therefore be much larger than either input.
combined_ex <- base_pop_ex %>%
  left_join(ad_views_ex, by = "anon_id") %>%
  # filter() drops rows whose condition evaluates to NA, so sessions without
  # any matching ad view do not appear in the result.
  filter(
    entrance_date <= ad_view_date &
      difftime(ad_view_date, entrance_date, units = "mins") <= 60
  ) %>%
  group_by(anon_id, entrance_date, session_number) %>%
  summarize(
    ad_views = sum(ad_call_count, na.rm = TRUE),
    # Return an ungrouped table instead of one still grouped by the leading keys.
    .groups = "drop"
  )
# Maximum scroll depth per session: take the largest scroll_depth among the
# scroll events that happen at or after the session's entrance_date and at
# most 60 minutes later.
#
# The join key is anon_id only, so every session row is paired with every
# scroll event of that user before the time-window filter runs.
combined_ex2 <- combined_ex %>%
  left_join(scroll_depth_ex, by = "anon_id") %>%
  # filter() drops rows whose condition evaluates to NA, so sessions without
  # any matching scroll event do not appear in the result.
  filter(
    entrance_date <= scroll_date &
      difftime(scroll_date, entrance_date, units = "mins") <= 60
  ) %>%
  # ad_views is part of the grouping key only so that it is carried through
  # summarize() into the output.
  group_by(anon_id, entrance_date, session_number, ad_views) %>%
  summarize(
    scroll_depth = max(scroll_depth, na.rm = TRUE),
    # Return an ungrouped table instead of one still grouped by the leading keys.
    .groups = "drop"
  )
最终结果 combined_ex2 如下:
| anon_id | entrance_date | session_number | ad_views | scroll_depth |
|------------------------------------|----------------------------|----------------|----------|--------------|
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-28 11:43:21.633 Z | 1 | 162 | 100 |
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-29 01:10:08.109 Z | 2 | 64 | 40 |
但是,当我把这段代码应用到真实数据上时,RStudio 需要大约 1 分钟来创建第一个合并数据框,8 分钟来创建第二个合并数据框。我的真实数据中 base_pop_ex 有 50 万行,ad_views_ex 有 140 万行,scroll_depth_ex 有 370 万行,我认为这个数据量并不算大。
- 谁能告诉我为什么我的代码在我的数据上表现不佳?
- 此外,有没有一种方法可以完成相同的工作而不必将联接、分组和汇总拆分为两个步骤?
我认为问题出在 filter() 这一步。可以尝试把这两个条件拆成两次 filter() 调用,并把 difftime() 的条件放在第二步。
例如:
...
# Step 1: keep only rows where the ad view is not earlier than the entrance.
filter(entrance_date <= ad_view_date) %>%
# Step 2: the difftime() condition is now evaluated only on the rows that
# passed the filter above.
filter(difftime(ad_view_date, entrance_date, units = "mins") <= 60) %>%
...
我有以下包含 3 个数据帧的示例数据集:
# Sample session table: two sessions that share one anon_id.
# Timestamps are stored as character strings, not as POSIXct values.
base_pop_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 2),
    session_number = c(1, 2),
    entrance_date = c(
      "2021-06-28 11:43:21.633 Z",
      "2021-06-29 01:10:08.109 Z"
    ),
    single_article_session = c(0, 0)
  ),
  # Column names come from the named list above; compact automatic row names.
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Sample ad-view table: nine rows for one anon_id, each with a timestamp
# (character string) and an ad_call_count.
ad_views_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 9),
    ad_view_date = c(
      "2021-06-28 11:43:22.654 Z", "2021-06-28 11:44:15.360 Z",
      "2021-06-28 11:44:32.538 Z", "2021-06-28 12:07:19.557 Z",
      "2021-06-28 12:07:20.146 Z", "2021-06-29 01:10:08.706 Z",
      "2021-06-29 01:10:17.127 Z", "2021-06-29 01:40:30.726 Z",
      "2021-06-29 01:40:30.914 Z"
    ),
    ad_call_count = c(3, 1, 1, 1, 3, 3, 1, 1, 3)
  ),
  # Column names come from the named list above; compact automatic row names.
  row.names = c(NA, -9L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Sample scroll table: 26 rows for one anon_id, each with a timestamp
# (character string) and a scroll_depth value.
scroll_depth_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 26),
    scroll_date = c(
      "2021-06-28 11:43:38.263 Z", "2021-06-28 11:43:41.593 Z",
      "2021-06-28 11:43:48.882 Z", "2021-06-28 11:43:49.339 Z",
      "2021-06-28 11:43:52.270 Z", "2021-06-28 11:43:57.995 Z",
      "2021-06-28 11:44:15.324 Z", "2021-06-28 11:44:16.955 Z",
      "2021-06-28 11:44:30.284 Z", "2021-06-28 11:44:44.197 Z",
      "2021-06-28 12:07:19.564 Z", "2021-06-28 12:07:19.581 Z",
      "2021-06-28 12:07:19.593 Z", "2021-06-28 12:07:19.600 Z",
      "2021-06-28 12:07:19.617 Z", "2021-06-28 12:07:19.639 Z",
      "2021-06-28 12:07:19.648 Z", "2021-06-28 12:07:19.664 Z",
      "2021-06-29 01:10:13.401 Z", "2021-06-29 01:10:25.065 Z",
      "2021-06-29 01:11:02.595 Z", "2021-06-29 01:11:45.444 Z",
      "2021-06-29 01:40:30.741 Z", "2021-06-29 01:40:30.747 Z",
      "2021-06-29 01:40:30.903 Z", "2021-06-29 01:40:30.909 Z"
    ),
    scroll_depth = c(
      10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
      10, 20, 30, 40, 50, 60, 70, 80,
      10, 20, 30, 40,
      10, 20, 30, 40
    )
  ),
  # Column names come from the named list above; compact automatic row names.
  row.names = c(NA, -26L),
  class = c("tbl_df", "tbl", "data.frame")
)
我想连接(join)这三个数据框,最终得到 anon_id、entrance_date、session_number、ad_views 和 scroll_depth 这几列:
- ad_views 是数据框 ad_views_ex 中所有 ad_call_count 的总和,条件是 ad_view_date 不早于 base_pop_ex 中的 entrance_date,并且两个日期之间相差不超过 60 分钟。
- scroll_depth 的连接逻辑与上一个指标相同,但这里计算的是每组事件的最大值。
下面的代码可以完成这项工作:
library(dplyr) # left_join(), filter(), group_by() and summarize() live here
library(tidyr)
library(lubridate)

# Total ad calls per session: sum ad_call_count over the ad views that happen
# at or after the session's entrance_date and at most 60 minutes later.
#
# The join key is anon_id only, so every session of a user is paired with
# every ad view of that user before the time-window filter runs; the
# intermediate table can therefore be much larger than either input.
combined_ex <- base_pop_ex %>%
  left_join(ad_views_ex, by = "anon_id") %>%
  # filter() drops rows whose condition evaluates to NA, so sessions without
  # any matching ad view do not appear in the result.
  filter(
    entrance_date <= ad_view_date &
      difftime(ad_view_date, entrance_date, units = "mins") <= 60
  ) %>%
  group_by(anon_id, entrance_date, session_number) %>%
  summarize(
    ad_views = sum(ad_call_count, na.rm = TRUE),
    # Return an ungrouped table instead of one still grouped by the leading keys.
    .groups = "drop"
  )
# Maximum scroll depth per session: take the largest scroll_depth among the
# scroll events that happen at or after the session's entrance_date and at
# most 60 minutes later.
#
# The join key is anon_id only, so every session row is paired with every
# scroll event of that user before the time-window filter runs.
combined_ex2 <- combined_ex %>%
  left_join(scroll_depth_ex, by = "anon_id") %>%
  # filter() drops rows whose condition evaluates to NA, so sessions without
  # any matching scroll event do not appear in the result.
  filter(
    entrance_date <= scroll_date &
      difftime(scroll_date, entrance_date, units = "mins") <= 60
  ) %>%
  # ad_views is part of the grouping key only so that it is carried through
  # summarize() into the output.
  group_by(anon_id, entrance_date, session_number, ad_views) %>%
  summarize(
    scroll_depth = max(scroll_depth, na.rm = TRUE),
    # Return an ungrouped table instead of one still grouped by the leading keys.
    .groups = "drop"
  )
最终结果 combined_ex2 如下:
| anon_id | entrance_date | session_number | ad_views | scroll_depth |
|------------------------------------|----------------------------|----------------|----------|--------------|
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-28 11:43:21.633 Z | 1 | 162 | 100 |
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-29 01:10:08.109 Z | 2 | 64 | 40 |
但是,当我把这段代码应用到真实数据上时,RStudio 需要大约 1 分钟来创建第一个合并数据框,8 分钟来创建第二个合并数据框。我的真实数据中 base_pop_ex 有 50 万行,ad_views_ex 有 140 万行,scroll_depth_ex 有 370 万行,我认为这个数据量并不算大。
- 谁能告诉我为什么我的代码在我的数据上表现不佳?
- 此外,有没有一种方法可以完成相同的工作而不必将联接、分组和汇总拆分为两个步骤?
我认为问题出在 filter() 这一步。可以尝试把这两个条件拆成两次 filter() 调用,并把 difftime() 的条件放在第二步。
例如:
...
# Step 1: keep only rows where the ad view is not earlier than the entrance.
filter(entrance_date <= ad_view_date) %>%
# Step 2: the difftime() condition is now evaluated only on the rows that
# passed the filter above.
filter(difftime(ad_view_date, entrance_date, units = "mins") <= 60) %>%
...