R 在包含多个 join 和 filter 的 dplyr summarize 上性能不佳

R underperforming on dplyr summarize with multiple joins and filters

我有以下包含 3 个数据帧的示例数据集:

# Example session table: one row per session for a single anonymous visitor.
# Timestamps are stored as character strings.
base_pop_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 2L),
    session_number = c(1, 2),
    entrance_date = c(
      "2021-06-28 11:43:21.633 Z",
      "2021-06-29 01:10:08.109 Z"
    ),
    single_article_session = c(0, 0)
  ),
  # c(NA, -n) is the compact encoding of automatic row names 1..n.
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Example ad-view events for the same visitor: one row per event with its
# timestamp (character) and the number of ad calls recorded for that event.
ad_views_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 9L),
    ad_view_date = c(
      "2021-06-28 11:43:22.654 Z", "2021-06-28 11:44:15.360 Z",
      "2021-06-28 11:44:32.538 Z", "2021-06-28 12:07:19.557 Z",
      "2021-06-28 12:07:20.146 Z", "2021-06-29 01:10:08.706 Z",
      "2021-06-29 01:10:17.127 Z", "2021-06-29 01:40:30.726 Z",
      "2021-06-29 01:40:30.914 Z"
    ),
    ad_call_count = c(3, 1, 1, 1, 3, 3, 1, 1, 3)
  ),
  # c(NA, -n) is the compact encoding of automatic row names 1..n.
  row.names = c(NA, -9L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Example scroll events for the same visitor: one row per event with its
# timestamp (character) and the scroll depth value recorded for that event.
scroll_depth_ex <- structure(
  list(
    anon_id = rep("0003ff12-03b1-42b9-86cf-4b7c05e3e3a7", 26L),
    scroll_date = c(
      "2021-06-28 11:43:38.263 Z", "2021-06-28 11:43:41.593 Z",
      "2021-06-28 11:43:48.882 Z", "2021-06-28 11:43:49.339 Z",
      "2021-06-28 11:43:52.270 Z", "2021-06-28 11:43:57.995 Z",
      "2021-06-28 11:44:15.324 Z", "2021-06-28 11:44:16.955 Z",
      "2021-06-28 11:44:30.284 Z", "2021-06-28 11:44:44.197 Z",
      "2021-06-28 12:07:19.564 Z", "2021-06-28 12:07:19.581 Z",
      "2021-06-28 12:07:19.593 Z", "2021-06-28 12:07:19.600 Z",
      "2021-06-28 12:07:19.617 Z", "2021-06-28 12:07:19.639 Z",
      "2021-06-28 12:07:19.648 Z", "2021-06-28 12:07:19.664 Z",
      "2021-06-29 01:10:13.401 Z", "2021-06-29 01:10:25.065 Z",
      "2021-06-29 01:11:02.595 Z", "2021-06-29 01:11:45.444 Z",
      "2021-06-29 01:40:30.741 Z", "2021-06-29 01:40:30.747 Z",
      "2021-06-29 01:40:30.903 Z", "2021-06-29 01:40:30.909 Z"
    ),
    # Four consecutive ladders in steps of 10: 10..100, 10..80, 10..40, 10..40.
    scroll_depth = c(
      seq(10, 100, by = 10),
      seq(10, 80, by = 10),
      seq(10, 40, by = 10),
      seq(10, 40, by = 10)
    )
  ),
  # c(NA, -n) is the compact encoding of automatic row names 1..n.
  row.names = c(NA, -26L),
  class = c("tbl_df", "tbl", "data.frame")
)

我想把这三个数据框连接(join)起来,最终得到以下几列:anon_id、entrance_date、session_number、ad_views、scroll_depth:

下面的代码完成了它的工作:

library(dplyr) # provides left_join(), filter(), group_by(), summarize()
library(tidyr)
library(lubridate)

# Step 1: total ad calls per session.
# Each session row is paired with every ad view of the same visitor; only
# views that happen at or after the session entrance and no more than
# 60 minutes later are kept, then ad_call_count is summed per session.
combined_ex <- base_pop_ex %>%
  left_join(ad_views_ex, by = "anon_id") %>%
  filter(
    entrance_date <= ad_view_date &
      difftime(ad_view_date, entrance_date, units = "mins") <= 60
  ) %>%
  group_by(anon_id, entrance_date, session_number) %>%
  summarize(
    ad_views = sum(ad_call_count, na.rm = TRUE),
    # Drop the grouping. By default summarize() only peels off the last
    # grouping variable, so the result would stay grouped by anon_id and
    # entrance_date and the filter() in step 2 would be evaluated once per
    # group instead of once over the whole joined table.
    .groups = "drop"
  )

# Step 2: deepest scroll per session.
# Same pairing and 60-minute window as step 1, applied to the scroll events;
# ad_views is carried through as a grouping key so it survives the summary.
combined_ex2 <- combined_ex %>%
  left_join(scroll_depth_ex, by = "anon_id") %>%
  filter(
    entrance_date <= scroll_date &
      difftime(scroll_date, entrance_date, units = "mins") <= 60
  ) %>%
  group_by(anon_id, entrance_date, session_number, ad_views) %>%
  summarize(
    scroll_depth = max(scroll_depth, na.rm = TRUE),
    # Return a plain (ungrouped) tibble as the final result.
    .groups = "drop"
  )

最终结果 combined_ex2 如下:

|               anon_id              |       entrance_date        | session_number | ad_views | scroll_depth |
|------------------------------------|----------------------------|----------------|----------|--------------|
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-28 11:43:21.633 Z  |       1        |   162    |      100     |      
|0003ff12-03b1-42b9-86cf-4b7c05e3e3a7| 2021-06-29 01:10:08.109 Z  |       2        |    64    |      40      |

但是,当我把它应用到真实数据上时,RStudio 需要大约 1 分钟来创建第一个组合数据框,8 分钟来创建第二个组合数据框。我的真实数据中,base_pop_ex 有 50 万行,ad_views_ex 有 140 万行,scroll_depth_ex 有 370 万行,我认为这个数据量并不算大。

我认为瓶颈在 filter() 这一步。可以尝试把这两个条件拆成两次 filter() 调用,并把 difftime() 的条件放在第二个。

例如:

  ...
  filter(entrance_date <= ad_view_date) %>%
  filter(difftime(ad_view_date, entrance_date, units = "mins") <= 60) %>%
  ...