在使用可变数据量计算滚动均值时矢量化循环

Vectorize loops when calculating rolling means with variable amounts of data

我有一个从部署的传感器中获取的日常水化学值的数据框。我正在尝试计算每日最大值的 7 天滚动平均值。这个是现场环境数据,数据可能有点乱。

以下是计算平均值和分配质量级别的规则:

我的代码使用循环遍历每个结果,创建一个包含 7 天 window 的新数据框,然后计算移动平均值。请参阅下面的最小示例。

请注意,此示例中缺少 11 日、16 日、17 日和 18 日:

  daily_data <- tibble::tribble(
    ~Monitoring.Location.ID,        ~date, ~dyMax, ~dyMin, ~dyDQL,
    "River 1", as.Date("2018-07-01"), 24.219, 22.537,    "A",
    "River 1", as.Date("2018-07-02"), 24.557, 20.388,    "A",
    "River 1", as.Date("2018-07-03"), 24.847, 20.126,    "A",
    "River 1", as.Date("2018-07-04"), 25.283, 20.674,    "A",
    "River 1", as.Date("2018-07-05"), 25.501, 20.865,    "A",
    "River 1", as.Date("2018-07-06"),  25.04, 21.008,    "A",
    "River 1", as.Date("2018-07-07"), 24.847, 20.674,    "A",
    "River 1", as.Date("2018-07-08"), 23.424, 20.793,    "B",
    "River 1", as.Date("2018-07-09"), 22.657, 18.866,    "E",
    "River 1", as.Date("2018-07-10"), 22.298,   18.2,    "A",
    "River 1", as.Date("2018-07-12"),  22.92, 19.008,    "A",
    "River 1", as.Date("2018-07-13"), 23.978, 19.532,    "A",
    "River 1", as.Date("2018-07-14"), 24.508, 19.936,    "A",
    "River 1", as.Date("2018-07-15"), 25.137, 20.627,    "A",
    "River 1", as.Date("2018-07-19"), 24.919, 20.674,    "A"
  )
  
for (l in seq_len(nrow(daily_data))){
  
  
  station_7day <- filter(daily_data,
                         dplyr::between(date, daily_data[[l,'date']] - lubridate::days(6), daily_data[l,'date']))
  
  
  daily_data[l,"ma.max7"] <- dplyr::case_when(nrow(subset(station_7day, dyDQL %in% c("A")))== 7 & l >=7  ~ mean(station_7day$dyMax),
                                              nrow(subset(station_7day, dyDQL %in% c("A", 'B'))) >= 6 & l >=7~ mean(station_7day$dyMax),
                                              max(station_7day$dyDQL == 'E') & nrow(subset(station_7day, dyDQL %in% c("A", "B"))) >= 6  & l >=7 ~ mean(station_7day$dyMax[station_7day$dyDQL %in% c("A", "B")]),
                                              nrow(subset(station_7day, dyDQL %in% c("A", "B", "E"))) >= 6 & l >=7~  mean(station_7day$dyMax),
                                              TRUE ~ NA_real_)
  
  daily_data[l, "ma.max7_DQL"] <-  dplyr::case_when(nrow(subset(station_7day, dyDQL %in% c("A")))== 7 & l >=7  ~ "A",
                                                    nrow(subset(station_7day, dyDQL %in% c("A", 'B'))) >= 6 & l >=7~ "B",
                                                    max(station_7day$dyDQL == 'E') & nrow(subset(station_7day, dyDQL %in% c("A", "B"))) >= 6  & l >=7 ~ "B",
                                                    nrow(subset(station_7day, dyDQL %in% c("A", "B", "E"))) >= 6 & l >=7~ "E",
                                                    TRUE ~ NA_character_)
  
  
}

预期结果是:

tibble::tribble(
  ~Monitoring.Location.ID,        ~date, ~dyMax, ~dyMin, ~dyDQL,         ~ma.max7, ~ma.max7_DQL,
                "River 1", as.Date("2018-07-01"), 24.219, 22.537,    "A",               NA,           NA,
                "River 1", as.Date("2018-07-02"), 24.557, 20.388,    "A",               NA,           NA,
                "River 1", as.Date("2018-07-03"), 24.847, 20.126,    "A",               NA,           NA,
                "River 1", as.Date("2018-07-04"), 25.283, 20.674,    "A",               NA,           NA,
                "River 1", as.Date("2018-07-05"), 25.501, 20.865,    "A",               NA,           NA,
                "River 1", as.Date("2018-07-06"),  25.04, 21.008,    "A",               NA,           NA,
                "River 1", as.Date("2018-07-07"), 24.847, 20.674,    "A", 24.8991428571429,          "A",
                "River 1", as.Date("2018-07-08"), 23.424, 20.793,    "B", 24.7855714285714,          "B",
                "River 1", as.Date("2018-07-09"), 22.657, 18.866,    "E", 24.5141428571429,          "B",
                "River 1", as.Date("2018-07-10"), 22.298,   18.2,    "A",            24.15,          "B",
                "River 1", as.Date("2018-07-12"),  22.92, 19.008,    "A",           23.531,          "E",
                "River 1", as.Date("2018-07-13"), 23.978, 19.532,    "A",           23.354,          "E",
                "River 1", as.Date("2018-07-14"), 24.508, 19.936,    "A",          23.2975,          "E",
                "River 1", as.Date("2018-07-15"), 25.137, 20.627,    "A",           23.583,          "E",
                "River 1", as.Date("2018-07-19"), 24.919, 20.674,    "A",               NA,           NA
  )

该代码工作正常,但在计算多个位置具有多个不同水质参数的多年水平数据的值时速度非常慢。

由于可以从 6 天的数据中计算出 7 天的值,我认为我无法使用 zoo 包中的任何滚动功能。我不认为我可以使用 roll 包中的 roll_mean 函数,因为当有 6 天的“A”或“B”数据时丢弃价值 1 天的“E”数据的可变性质。

有没有办法对其进行矢量化,以避免遍历每一行数据?

这是一种矢量化方法,使用 slider:slide_index 计算高质量和备份质量值,然后结合以获得最佳可用值:

library(tidyverse); library(slider)

以下函数按位置分组,计算每周平均值(包括从日期 6 到日期(包括日期)的所有内容)和包含的观察值数量,然后过滤为只有 6 个以上的观察值。

get_weekly_by_loc <- function(df) {
  df %>%
    group_by(Monitoring.Location.ID) %>%
    mutate(mean = slide_index_dbl(dyMax, date, mean, .complete = TRUE, .before = lubridate::days(6)),
           count = slide_index_dbl(dyMax, date, ~sum(!is.na(.)), .before = lubridate::days(6))) %>%
    ungroup() %>%
    filter(count >= 6)
}

然后我们可以 运行 这个函数只对 A/B 数据和整体:

daily_data_high_quality <- daily_data %>%
  filter(dyDQL %in% c("A", "B")) %>%
  get_weekly_by_loc() %>%
  select(Monitoring.Location.ID, date, high_qual_mean = mean)
  
daily_data_backup <- daily_data %>%
  get_weekly_by_loc() %>%
  select(Monitoring.Location.ID, date, backup_mean = mean)

然后加入那些并使用高质量的(如果可用):

daily_data %>%
  left_join(daily_data_high_quality) %>%
  left_join(daily_data_backup) %>%
  mutate(max7_DQL = coalesce(high_qual_mean, backup_mean)) %>%
  mutate(moar_digits = format(max7_DQL, nsmall = 6))

结果

# A tibble: 15 x 9
   Monitoring.Location.ID date       dyMax dyMin dyDQL high_qual_mean backup_mean max7_DQL moar_digits
   <chr>                  <date>     <dbl> <dbl> <chr>          <dbl>       <dbl>    <dbl> <chr>      
 1 River 1                2018-07-01  24.2  22.5 A               NA          NA       NA   "       NA"
 2 River 1                2018-07-02  24.6  20.4 A               NA          NA       NA   "       NA"
 3 River 1                2018-07-03  24.8  20.1 A               NA          NA       NA   "       NA"
 4 River 1                2018-07-04  25.3  20.7 A               NA          NA       NA   "       NA"
 5 River 1                2018-07-05  25.5  20.9 A               NA          NA       NA   "       NA"
 6 River 1                2018-07-06  25.0  21.0 A               NA          NA       NA   "       NA"
 7 River 1                2018-07-07  24.8  20.7 A               24.9        24.9     24.9 "24.899143"
 8 River 1                2018-07-08  23.4  20.8 B               24.8        24.8     24.8 "24.785571"
 9 River 1                2018-07-09  22.7  18.9 E               NA          24.5     24.5 "24.514143"
10 River 1                2018-07-10  22.3  18.2 A               24.4        24.2     24.4 "24.398833"
11 River 1                2018-07-12  22.9  19.0 A               NA          23.5     23.5 "23.531000"
12 River 1                2018-07-13  24.0  19.5 A               NA          23.4     23.4 "23.354000"
13 River 1                2018-07-14  24.5  19.9 A               NA          23.3     23.3 "23.297500"
14 River 1                2018-07-15  25.1  20.6 A               NA          23.6     23.6 "23.583000"
15 River 1                2018-07-19  24.9  20.7 A               NA          NA       NA   "       NA"

我使用了 tidyverserunner 并且在单个管道语法中这样做了。语法解释-

  • 我首先使用 runner.
  • 将 7 天(根据提供的逻辑)DQL 和 MAX 值收集到一个列表中
  • 在此之前,我已将 DQL 转换为有序因子变量,将在最后一个语法中使用。
  • 其次,我使用purrr::map根据给定条件修改每个列表,
    • 不少于6个才算
    • 如果7个值中恰好有一个E,则不必计算
  • 最后我使用 unnest_wider
  • 取消了列表的嵌套
library(runner)
daily_data %>% mutate(dyDQL = factor(dyDQL, levels = c("A", "B", "E"), ordered = T),
                      d = runner(x = data.frame(a = dyMax, b= dyDQL),
                                   k = "7 days",
                                   lag = 0,
                                   idx = date,
                                   f = function(x) list(x))) %>%
  mutate(d = map(d, ~ .x %>% group_by(b) %>%
                     mutate(c = n()) %>%
                     ungroup() %>%
                     filter(!n() < 6) %>%
                     filter(!(b == 'E' & c == 1 & n() == 7)) %>%
                     summarise(ma.max7 = ifelse(n() == 0, NA, mean(a)), ma.max7.DQL = max(b))
                   )
         ) %>%
  unnest_wider(d)

# A tibble: 15 x 7
   Monitoring.Location.ID date       dyMax dyMin dyDQL ma.max7 ma.max7.DQL
   <chr>                  <date>     <dbl> <dbl> <ord>   <dbl> <ord>      
 1 River 1                2018-07-01  24.2  22.5 A        NA   NA         
 2 River 1                2018-07-02  24.6  20.4 A        NA   NA         
 3 River 1                2018-07-03  24.8  20.1 A        NA   NA         
 4 River 1                2018-07-04  25.3  20.7 A        NA   NA         
 5 River 1                2018-07-05  25.5  20.9 A        NA   NA         
 6 River 1                2018-07-06  25.0  21.0 A        24.9 A          
 7 River 1                2018-07-07  24.8  20.7 A        24.9 A          
 8 River 1                2018-07-08  23.4  20.8 B        24.8 B          
 9 River 1                2018-07-09  22.7  18.9 E        24.8 B          
10 River 1                2018-07-10  22.3  18.2 A        24.4 B          
11 River 1                2018-07-12  22.9  19.0 A        23.5 E          
12 River 1                2018-07-13  24.0  19.5 A        23.4 E          
13 River 1                2018-07-14  24.5  19.9 A        23.3 E          
14 River 1                2018-07-15  25.1  20.6 A        23.6 E          
15 River 1                2018-07-19  24.9  20.7 A        NA   NA