将 auto.arima(forecast 包)迭代应用于 R 中的多个时间序列

Iteratively applying auto.arima (forecast package) to multiple time series in R

我有以下时间序列:

# Sample data: ten consecutive daily observations for two series, A and B.
# Dates are stored as "YYYY-MM-DD" strings.
ts <- data.frame(
  Date = format(seq(as.Date("2017-01-01"), by = "day", length.out = 10)),
  A = c(15, 37, 29, 18, 12, 8, 2, 24, 42, 10),
  B = c(16, 22, 5, 6, 22, 12, 13, 7, 20, 36)
)

ts

      Date    A  B
1  2017-01-01 15 16
2  2017-01-02 37 22
3  2017-01-03 29  5
4  2017-01-04 18  6
5  2017-01-05 12 22
6  2017-01-06  8 12
7  2017-01-07  2 13
8  2017-01-08 24  7
9  2017-01-09 42 20
10 2017-01-10 10 36

我想在时间序列 A 和 B 上迭代应用 forecast 包中的 auto.arima 函数。

我需要一种函数式方法的帮助,该方法首先创建具有以下设置的预测函数(此函数将遍历多个序列):

1. splits data into train:test in 80:20 ratio
2. Trains auto.arima model on the train set
3. Model evaluation using the test set (rmse metric)
4. optional----> cross-validation with 1 time step
5. generates forecast (horizon=2) with the error metric as below:

  series    Date         rmse    pt_forecast_1 pt_forecast_2
1    A   2017-01-11      0.21       12            13
2    B   2017-01-12      0.11       36            34

这里需要帮助。谢谢

我写了 data_gen_func() 来做你需要的。我希望它有所帮助。输出几乎与您需要的相同。您需要安装 forecast 和 CombMSC 包。如果您没有安装,下面的代码将完成这项工作。

我还向您展示了如何使用,并描述了您需要传递给 data_gen_func() 的参数。

# Install any package that is missing. forecast and CombMSC are attached
# below; tidyr and dplyr are only called through `::` by data_gen_func(), so
# they just need to be installed.
for (pkg in c("forecast", "CombMSC", "tidyr", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Attach explicitly: `require()` merely returns FALSE when a package is
# missing, so after a fresh install the package would stay unattached.
library(forecast)
library(CombMSC)


#' Fit auto.arima to every series of a multiple time series and forecast
#'
#' For each column of `dta`, the last `test_size` observations are held out,
#' an `auto.arima` model is fitted to the remaining observations and scored
#' on the hold-out with the accuracy measure named by `error`. A second model
#' is then fitted to the full column and used to produce `h` point forecasts.
#'
#' @param dta A multiple time series (class "mts").
#' @param h Final forecast horizon (number of periods to forecast).
#' @param test_size How many observations, taken from the end of each series,
#'   to use for testing. Must be at least 1 and smaller than `nrow(dta)`.
#' @param start_fc_date Starting date of the forecast (class "Date"). It is
#'   only used to label the forecast columns.
#' @param ts_frequency A character string, containing one of "day", "week",
#'   "month", "quarter" or "year".
#'   This can optionally be preceded by a (positive or negative)
#'   integer and a space, or followed by "s". Passed as `by` to `seq()`.
#' @param error Measure of error.
#'   It can be one of the following: ME, RMSE, MAE, MPE, MAPE, MASE, ACF1,
#'   Theil's U. A measure that cannot be computed for the hold-out (for
#'   example when it is too short) is reported as `NA`.
#' @return A tibble with one row per series: a `series` column, one column
#'   per forecast date, and a column named after `error` holding the hold-out
#'   accuracy.
data_gen_func <- function(dta, h, test_size, start_fc_date, ts_frequency,
                          error = "RMSE") {

  if (!inherits(start_fc_date, "Date")) {
    stop(" 'start_fc_date' must have class of 'Date'")
  }

  if (!inherits(dta, "mts")) {
    stop("dta must be an mts")
  }

  # Validate up front so a typo fails with a clear message instead of a
  # "subscript out of bounds" error after the models have been fitted.
  valid_errors <- c("ME", "RMSE", "MAE", "MPE", "MAPE", "MASE", "ACF1",
                    "Theil's U")
  if (!is.character(error) || length(error) != 1L ||
      !error %in% valid_errors) {
    stop("'error' must be one of: ", paste(valid_errors, collapse = ", "))
  }

  if (!is.numeric(h) || length(h) != 1L || is.na(h) || h < 1) {
    stop("'h' must be a single number >= 1")
  }

  n_obs <- nrow(dta)
  if (!is.numeric(test_size) || length(test_size) != 1L ||
      is.na(test_size) || test_size < 1 || test_size >= n_obs) {
    stop("'test_size' must be at least 1 and smaller than nrow(dta)")
  }

  nts <- ncol(dta)
  series_names <- colnames(dta)

  # One column per series: h forecast rows, and a single accuracy row.
  fc <- data.frame(matrix(nrow = h, ncol = nts))
  acc <- data.frame(matrix(nrow = 1, ncol = nts))

  train_length <- n_obs - test_size

  for (i in seq_len(nts)) {
    d_list <- CombMSC::splitTrainTest(dta[, i], train_length)

    # Keep the whole forecast object (not just $mean): accuracy() needs the
    # training data stored in it to compute MASE.
    holdout_fc <- forecast::forecast(forecast::auto.arima(d_list$train),
                                     h = test_size)
    scores <- forecast::accuracy(holdout_fc, d_list$test)

    acc[, i] <- if (error %in% colnames(scores)) {
      scores["Test set", error]
    } else {
      NA_real_
    }

    # The final forecast uses a model refitted on the complete series.
    full_fc <- forecast::forecast(forecast::auto.arima(dta[, i]), h = h)
    fc[, i] <- as.numeric(full_fc$mean)
  }

  colnames(acc) <- series_names
  colnames(fc) <- series_names

  acc <- tidyr::pivot_longer(acc, everything(), names_to = "series",
                             values_to = error)

  fc$date <- seq(from = start_fc_date, length.out = h, by = ts_frequency)

  # Long form first, then spread the dates into columns: one row per series.
  fc_long <- tidyr::pivot_longer(fc, -date, names_to = "series",
                                 values_to = "fc")
  fc_wide <- tidyr::pivot_wider(fc_long, names_from = "date",
                                values_from = "fc")

  dplyr::left_join(fc_wide, acc, by = "series")
}


# usage -------------------
library(forecast)
library(CombMSC)

# Simulate one AR(1) series (coefficient 0.5) with 50 observations; the
# `mean` argument is handed to arima.sim() unchanged.
sim_ar1 <- function(mu) {
  arima.sim(list(order = c(1, 0, 0), ar = .5), n = 50, mean = mu)
}

# Five simulated monthly series starting in January 2010.
my_data <- ts(
  data.frame(
    AA = sim_ar1(12),
    AB = sim_ar1(12),
    AC = sim_ar1(11),
    BA = sim_ar1(10),
    BB = sim_ar1(14)
  ),
  start = c(2010, 1),
  frequency = 12
)

# Last observed period; the forecast below starts in the following month.
end(my_data)


out1 <- data_gen_func(
  dta = my_data,
  h = 2,
  test_size = 1,
  start_fc_date = as.Date("2014-03-01"),
  ts_frequency = "month",
  error = "MAPE"
)
out1

5 个时间序列的输出如下所示

    # A tibble: 5 x 4
  series `2014-03-01` `2014-04-01`  MAPE
  <chr>         <dbl>        <dbl> <dbl>
1 AA             23.6         23.4  3.38
2 AB             24.2         24.4  1.18
3 AC             21.1         21.3  4.31
4 BA             19.9         20.1  1.47
5 BB             27.3         27.7  3.54

如果您设置 error = "RMSE",结果将如下所示:

# A tibble: 5 x 4
  series `2014-03-01` `2014-04-01`  RMSE
  <chr>         <dbl>        <dbl> <dbl>
1 AA             24.0         24.0 1.05 
2 AB             23.2         23.3 0.160
3 AC             22.2         22.2 0.851
4 BA             19.4         19.7 1.59 
5 BB             27.5         27.9 1.04 

使用您的示例数据:它很短,因此您会收到一些警告

# The question's sample data: ten daily observations for series A and B.
my_ts <- data.frame(
  Date = c("2017-01-01", "2017-01-02", "2017-01-03", "2017-01-04",
           "2017-01-05", "2017-01-06", "2017-01-07", "2017-01-08",
           "2017-01-09", "2017-01-10"),
  A = c(15, 37, 29, 18, 12, 8, 2, 24, 42, 10),
  B = c(16, 22, 5, 6, 22, 12, 13, 7, 20, 36)
)

# The first forecast belongs to the day AFTER the last observation
# (2017-01-10), so the forecast columns are labelled 2017-01-11 and
# 2017-01-12. Starting at the last observed date would mislabel them.
first_fc_date <- as.Date(my_ts$Date[nrow(my_ts)]) + 1

# Drop the Date column and convert the remaining columns to an mts.
my_ts <- stats::ts(my_ts[, -1], start = c(2017, 1), frequency = 7)


out2 <- data_gen_func(dta = my_ts, h = 2, test_size = 2,
                      start_fc_date = first_fc_date,
                      ts_frequency = "day", error = "MAPE")

out2

输出:

# A tibble: 2 x 4
  series `2017-01-10` `2017-01-11`  MAPE
  <chr>         <dbl>        <dbl> <dbl>
1 A              19.7         19.7  69.0
2 B              15.9         15.9  49.9

如果您对输出不满意,也可以转换数据。

# Reshape the wide result into one row per series and forecast date.
tidyr::pivot_longer(
  data = out2,
  cols = -c(series, MAPE),
  names_to = "date",
  values_to = "point_fc"
)

透视(pivot_longer)后的输出

# A tibble: 4 x 4
  series  MAPE date       point_fc
  <chr>  <dbl> <chr>         <dbl>
1 A       69.0 2017-01-10     19.7
2 A       69.0 2017-01-11     19.7
3 B       49.9 2017-01-10     15.9
4 B       49.9 2017-01-11     15.9