交易日前6个月的总金额

Sum amount last 6 month prior to the date of transaction

这是我的交易数据。它显示了从 from 列中的帐户到 to 列中的帐户进行的交易以及日期和金额信息

data 

id          from    to          date        amount  
<int>       <fctr>  <fctr>      <date>      <dbl>
19521       6644    6934        2005-01-01  700.0
19524       6753    8456        2005-01-01  600.0
19523       9242    9333        2005-01-01  1000.0
…           …       …           …           …
1056317     7819    7454        2010-12-31  60.2
1056318     6164    7497        2010-12-31  107.5
1056319     7533    7492        2010-12-31  164.1

我想计算 from 列中的账户在特定交易发生日期之前的最后 6 个月内收到了多少交易金额,并希望将此信息保存为新列。

以下代码非常适合在小型数据集中完成此任务,例如,有 1000 行:

library(dplyr)
library(purrr)
data %>% 
  mutate(total_trx_amount_received_in_last_sixmonth= map2_dbl(from, date, 
~sum(amount[to == .x & between(date, .y - 180, .y)])))

但是,由于我的数据超过 100 万行,此代码将需要几个多小时才能完成。 我在互联网上搜索了是否可以加快此代码的 运行 时间。我在 SO 上尝试了 如何使 purrr map 函数 运行 更快。因此,我尝试了以下代码,而不是 dplyrmutate 我使用 data.table 来更快地加速代码:

library(future)
library(data.table)
library(furrr)
data[, total_trx_amount_received_in_last_sixmonth:= furrr::future_pmap_dbl(list(from, date), 
~mean(amount[to == .x & between(date, .y-180, .y)])) ]

但是,速度一点也没有提升。

有没有关于如何使代码 运行 更快的建议?

dput()输出数据:

structure(list(id = c(18529L, 13742L, 9913L, 956L, 2557L, 1602L, 
18669L, 35900L, 48667L, 51341L, 53713L, 60126L, 60545L, 65113L, 
66783L, 83324L, 87614L, 88898L, 89874L, 94765L, 100277L, 101587L, 
103444L, 108414L, 113319L, 121516L, 126607L, 130170L, 131771L, 
135002L, 149431L, 157403L, 157645L, 158831L, 162597L, 162680L, 
163901L, 165044L, 167082L, 168562L, 168940L, 172578L, 173031L, 
173267L, 177507L, 179167L, 182612L, 183499L, 188171L, 189625L, 
193940L, 198764L, 199342L, 200134L, 203328L, 203763L, 204733L, 
205651L, 209672L, 210242L, 210979L, 214532L, 214741L, 215738L, 
216709L, 220828L, 222140L, 222905L, 226133L, 226527L, 227160L, 
228193L, 231782L, 232454L, 233774L, 237836L, 237837L, 238860L, 
240223L, 245032L, 246673L, 247561L, 251611L, 251696L, 252663L, 
254410L, 255126L, 255230L, 258484L, 258485L, 259309L, 259910L, 
260542L, 262091L, 264462L, 264887L, 264888L, 266125L, 268574L, 
272959L), from = c("5370", "5370", "5370", "8605", "5370", "6390", 
"5370", "5370", "8934", "5370", "5635", "6046", "5680", "8026", 
"9037", "5370", "7816", "8046", "5492", "8756", "5370", "9254", 
"5370", "5370", "7078", "6615", "5370", "9817", "8228", "8822", 
"5735", "7058", "5370", "8667", "9315", "6053", "7990", "8247", 
"8165", "5656", "9261", "5929", "8251", "5370", "6725", "5370", 
"6004", "7022", "7442", "5370", "8679", "6491", "7078", "5370", 
"5370", "5370", "5658", "5370", "9296", "8386", "5370", "5370", 
"5370", "9535", "5370", "7541", "5370", "9621", "5370", "7158", 
"8240", "5370", "5370", "8025", "5370", "5370", "5370", "6989", 
"5370", "7059", "5370", "5370", "5370", "9121", "5608", "5370", 
"5370", "7551", "5370", "5370", "5370", "5370", "9163", "9362", 
"6072", "5370", "5370", "5370", "5370", "5370"), to = c("9356", 
"5605", "8567", "5370", "5636", "5370", "8933", "8483", "5370", 
"7626", "5370", "5370", "5370", "5370", "5370", "9676", "5370", 
"5370", "5370", "5370", "9105", "5370", "9772", "6979", "5370", 
"5370", "7564", "5370", "5370", "5370", "5370", "5370", "8744", 
"5370", "5370", "5370", "5370", "5370", "5370", "5370", "5370", 
"5370", "5370", "7318", "5370", "8433", "5370", "5370", "5370", 
"7122", "5370", "5370", "5370", "8566", "6728", "9689", "5370", 
"8342", "5370", "5370", "5614", "5596", "5953", "5370", "7336", 
"5370", "7247", "5370", "7291", "5370", "5370", "6282", "7236", 
"5370", "8866", "8613", "9247", "5370", "6767", "5370", "9273", 
"7320", "9533", "5370", "5370", "8930", "9343", "5370", "9499", 
"7693", "7830", "5392", "5370", "5370", "5370", "7497", "8516", 
"9023", "7310", "8939"), date = structure(c(12934, 13000, 13038, 
13061, 13099, 13113, 13117, 13179, 13238, 13249, 13268, 13296, 
13299, 13309, 13314, 13391, 13400, 13404, 13409, 13428, 13452, 
13452, 13460, 13482, 13493, 13518, 13526, 13537, 13542, 13544, 
13596, 13616, 13617, 13626, 13633, 13633, 13639, 13642, 13646, 
13656, 13660, 13664, 13667, 13669, 13677, 13686, 13694, 13694, 
13707, 13716, 13725, 13738, 13739, 13746, 13756, 13756, 13756, 
13761, 13769, 13770, 13776, 13786, 13786, 13786, 13791, 13799, 
13806, 13813, 13817, 13817, 13817, 13822, 13829, 13830, 13836, 
13847, 13847, 13847, 13852, 13860, 13866, 13871, 13878, 13878, 
13878, 13882, 13883, 13883, 13887, 13887, 13888, 13889, 13890, 
13891, 13895, 13896, 13896, 13899, 13905, 13909), class = "Date"), 
    amount = c(24.4, 7618, 21971, 5245, 2921, 8000, 169.2, 71.5, 
    14.6, 4214, 14.6, 13920, 14.6, 24640, 1600, 261.1, 16400, 
    3500, 2700, 19882, 182, 14.6, 16927, 25653, 3059, 2880, 9658, 
    4500, 12480, 14.6, 1000, 3679, 34430, 12600, 14.6, 19.2, 
    4900, 826, 3679, 2100, 38000, 79, 11400, 21495, 3679, 200, 
    14.6, 100.6, 3679, 5300, 108.9, 3679, 2696, 7500, 171.6, 
    14.6, 99.2, 2452, 3679, 3218, 700, 69.7, 14.6, 91.5, 2452, 
    3679, 2900, 17572, 14.6, 14.6, 90.5, 2452, 49752, 3679, 1900, 
    14.6, 870, 85.2, 2452, 3679, 1600, 540, 14.6, 14.6, 79, 210, 
    2452, 28400, 720, 180, 420, 44289, 489, 3679, 840, 2900, 
    150, 870, 420, 14.6)), row.names = c(NA, -100L), class = "data.frame")

这是一个使用 data.table 的选项:

library(data.table)
setDT(df)
setkey(df, to, date)

# Unique combination of from and date
af <- df[, unique(.SD), .SDcols = c("from", "date")]

# For each combination check sum of incoming in the last 6 months
for (i in 1:nrow(af)) {
  set(
    af, i = i, j = "am6m", 
    value = df[(date) %between% (af$date[[i]] - c(180, 0)) & to == af$from[[i]], sum(amount)]
  )
}
# Join the results into the main data.frame
df[, am6m := af[.SD, on = .(from, date), am6m]]



> tail(df)
#        id from   to       date  amount    am6m
# 1:  18529 5370 9356 2005-05-31    24.4     0.0
# 2: 258484 5370 9499 2008-01-09   720.0 74543.5
# 3: 251611 5370 9533 2007-12-31    14.6 46143.5
# 4:  83324 5370 9676 2006-08-31   261.1 40203.8
# 5: 203763 5370 9689 2007-08-31    14.6 92353.1
# 6: 103444 5370 9772 2006-11-08 16927.0 82671.2

这是一个使用 window 函数的选项。

但是,它们需要完整的日常数据才能工作,因此所需的内存量可能很大(您必须每天为每个来源创建一行)。

另请注意,此方法仅适用于大型数据集或直接在数据库上执行计算。需要大量set-up时间才能将原始数据变成没有间隙的形式。而且最后join数据也需要时间

不过,无论数据大小,滑动功能在速度上都是相对一致的。与子集相反,子集随着子集数据大小的增加而增加。

library(tidyverse)
library(tsibble)

# Calculate the 6 month window
six_mo_rollup <- data %>% 
  ## NOTE: You have to deal with duplicates somehow...either remove
  ## false duplicates or make them not duplicates...
  # We can get a unique from/date combo by summing since we need
  # to sum anyway.
  group_by(from,date) %>%
  summarise(amount = sum(amount),
            .groups = "keep") %>%
  ungroup() %>%
  # Now that each from/date is unique
  # convert data to a tsibble object
  as_tsibble(key = c(from),index = date) %>%
  # window functions can't have any missing time periods...so fill gaps
  # window functions grab 180 rows...not 180 days from the date
  group_by_key() %>%
  fill_gaps(.full = TRUE) %>%
  ungroup() %>%
  # arrange data from lowest to highest so slide can work right.
  arrange(date) %>%
  group_by(from) %>%
  mutate(
    six_mo_sum = slide_dbl(
      amount,
      sum,
      na.rm = TRUE, 
      .size = 180, 
      .align = "right"
    )
  ) %>%
  ungroup() %>%
  # any row without amount was created by fill_gaps in the example
  # so we can drop those rows to save space
  filter(!is.na(amount))

six_mo_rollup %>% filter(from == "5370")
# # A tsibble: 41 x 4 [1D]
# # Key:       from [1]
# from  date        amount six_mo_sum
#  <chr>  <date>      <dbl>      <dbl>
# 1 5370  2005-05-31    24.4        NA 
# 2 5370  2005-08-05  7618          NA 
# 3 5370  2005-09-12 21971          NA 
# 4 5370  2005-11-12  2921          NA 
# 5 5370  2005-11-30   169.      32679.
# 6 5370  2006-01-31    71.5     32751.
# 7 5370  2006-04-11  4214        7376.
# 8 5370  2006-08-31   261.       4475.
# 9 5370  2006-10-31   182         443.
# 10 5370  2006-11-08 16927       17370.
# # ... with 31 more rows

# Join the windowed data to the original dataset
data <- data %>%
  left_join(
    six_mo_rollup %>% select(from,date,six_mo_sum),
    by = c("from","date")
  )

更新:

在评论中,很明显您想对每个 for 的 to 值求和。原来我不明白。代码的更新是将所有汇总更改为 to 而不是 for.

此外,您需要没有 6 个月完整数据的值。所以你添加 .partial = TRUE.

# Calculate the 6 month window
six_mo_rollup <- data %>% 
  ## NOTE: You have to deal with duplicates somehow...either remove
  ## false duplicates or make them not duplicates...
  # We can get a unique from/date combo by summing since we need
  # to sum anyway.
  group_by(to,date) %>%
  summarise(amount = sum(amount),
            .groups = "keep") %>%
  ungroup() %>%
  # Now that each from/date is unique
  # convert data to a tsibble object
  as_tsibble(key = c(to),index = date) %>%
  # window functions can't have any missing time periods...so fill gaps
  # window functions grab 180 rows...not 180 days from the date
  group_by_key() %>%
  fill_gaps(.full = TRUE) %>%
  ungroup() %>%
  # arrange data from lowest to highest so slide can work right.
  arrange(date) %>%
  group_by(to) %>%
  mutate(
    six_mo_sum = slide_dbl(
      amount,
      sum,
      na.rm = TRUE, 
      .size = 180, 
      .align = "right",
      .partial = TRUE
    )
  ) %>%
  ungroup() %>%
  # any row without amount was created by fill_gaps in the example
  # so we can drop those rows to save space
  filter(!is.na(amount))

six_mo_rollup %>% filter(to == "5370")
# # A tsibble: 50 x 4 [1D]
# # Key:       to [1]
# to    date        amount six_mo_sum
# <chr> <date>       <dbl>      <dbl>
# 1 5370  2005-10-05  5245        5245 
# 2 5370  2005-11-26  8000       13245 
# 3 5370  2006-03-31    14.6     13260.
# 4 5370  2006-04-30    14.6      8029.
# 5 5370  2006-05-28 13920       13949.
# 6 5370  2006-05-31    14.6     13964.
# 7 5370  2006-06-10 24640       38604.
# 8 5370  2006-06-15  1600       40204.
# 9 5370  2006-09-09 16400       56604.
# 10 5370  2006-09-13  3500       60104.
# # ... with 40 more rows

# Join the windowed data to the original dataset
data <- data %>%
  left_join(
    six_mo_rollup %>% select(to,date,six_mo_sum),
    by = c("from" = "to","date" = "date")
  )

1m 的记录数据集足够小,不需要并行化。有很多方法可以做到这一点,“看起来”是正确的,但实际上并非如此……小心!

首先,您可能想知道为什么您原来的方法很慢? R 是一种解释型数组语言。要以可接受的性能做任何事情,您必须将向量传递给已经用低级语言预编译的快速函数。如果你在数据集上“逐元素映射”一个函数,你将失去向量化的大部分优势——purrr::mapbase::lapply 等从根本上讲都具有与预分配的 for 循环相当的性能,即。不是很好。您正在进行 100 万次以上的单独函数调用(每条记录一次)。对此的并行化只能通过减去一些开销来提高性能。

为您解答疑问:

  • 是否有每个帐户每天只能进行一次交易的限制,或者在任何一天都可以进行多次交易?我假设是的,每天可以进行多次交易。
  • “在进行特定交易之前的最后 6 个月内,列中的账户收到了多少交易金额”- 我假设这意味着“忽略与获得交易的交易相同日期进行的交易”字段附加到”,因为无法确定这些交易的执行时间

我的做法:先按账户和天求和,然后按天计算滚动求和,然后加入到随后的一天。

install.packages("RcppRoll") # for roll_sum()
install.packages(tidyr)      # for complete()

library(dplyr)

start_date <- as.Date("2018-01-01")
end_date <- as.Date("2020-01-01")
window_size <- 180L

# your example dataset is way too small to assess performance.
# Here is a 100k record dataset.

big_data <- tibble(
  from = as.factor(sapply(1:1000L, function(x) sample(1:100L,100, replace = F))),
  to = as.factor(sapply(1:1000L, function(x) sample(1:100L,100, replace = F))),
  amount = sample(1:10000, 100000, replace = TRUE),
  date = sample(seq.Date(from = start_date, to = end_date, by = "days"), 100000, replace = TRUE)
) %>%
  arrange(date) %>%
  mutate(id = row_number()) %>% 
  ungroup()

# calculate daily sum of values from PRECEDING day for join
daily_summary <- big_data %>%
  group_by(to, date) %>%
  summarize(daily_sum = sum(amount, na.rm = TRUE)) %>%
  ungroup() %>%
  # backfill empty records for data going back 6 months from start
  # this is needed because roll_sum() has no partial mode implemented.
  # and populate missing account - date combinations
  complete(date = seq.Date(from = start_date - window_size, to = end_date, by = "days"), to, fill = list(daily_sum = 0)) %>%
  group_by(to) %>%
  arrange(date) %>%
  mutate(
    total_trx_amount_received_in_last_sixmonth = RcppRoll::roll_sum(daily_sum, align = "right", n = window_size, fill = NA),
    date = date + 1
  ) %>%
  filter(date >= start_date) %>%
  select(date = date, from = to, total_trx_amount_received_in_last_sixmonth)

results <- left_join(big_data, daily_summary, by = c("from", "date"))

现在,性能如何?比你报告的要好得多,至少对我来说是这样。对于 100k 记录数据集(100 个帐户,2 年的信息),我在笔记本电脑上花费了 0.6 秒。对于 1m 的记录数据集(1000 个帐户,2 年的信息),我使用 microbenchmark 得到了 7-8 秒。可能不是最有效的方法,但考虑到我没有进行优化并且没有使用 data.table 这通常是 R 中高性能 2d 操作的关键

,所以完全可以接受

使用dplyr分组仍然意味着我们对每个帐户调用一次快速预编译函数RcppRoll::roll_sum(),从性能角度来看这并不理想,但至少我们只执行一个函数每个帐户调用而不是每个单独记录调用一个函数。您可能还想查看 RollingWindow 包中实现的单次滚动 window 函数,因为它们可能更快。

这只是 non-equi 加入 data.table。您可以创建 date - 180 的变量并限制当前日期与该变量之间的连接。这应该相当快

library(data.table)
setDT(dt)[, date_minus_180 := date - 180]
dt[, amnt_6_m := .SD[dt, sum(amount, na.rm = TRUE), 
     on = .(to = from, date <= date, date >= date_minus_180), by = .EACHI]$V1]
head(dt, 10)
#        id from   to       date  amount date_minus_180 amnt_6_m
#  1: 18529 5370 9356 2005-05-31    24.4     2004-12-02      0.0
#  2: 13742 5370 5605 2005-08-05  7618.0     2005-02-06      0.0
#  3:  9913 5370 8567 2005-09-12 21971.0     2005-03-16      0.0
#  4:   956 8605 5370 2005-10-05  5245.0     2005-04-08      0.0
#  5:  2557 5370 5636 2005-11-12  2921.0     2005-05-16   5245.0
#  6:  1602 6390 5370 2005-11-26  8000.0     2005-05-30      0.0
#  7: 18669 5370 8933 2005-11-30   169.2     2005-06-03  13245.0
#  8: 35900 5370 8483 2006-01-31    71.5     2005-08-04  13245.0
#  9: 48667 8934 5370 2006-03-31    14.6     2005-10-02      0.0
# 10: 51341 5370 7626 2006-04-11  4214.0     2005-10-13   8014.6