auto arima 训练和测试,按 r 中的 id 分组
autoarima train and test, grouped by id in r
我正在尝试使用 auto.arima 预测时间序列。我需要把数据拆分为训练集和测试集,以便查看模型指标。我的日期范围在 2016 年 12 月到 2020 年 1 月之间。我需要把截至 2018 年 12 月的数据作为训练集,此后的数据作为测试集。
除此之外,我还需要每个 ID 的指标 RMSE 和 MSE。
这是我的数据示例:
# Example data in long format: one row per ID and month, with 38 monthly
# observations (December 2016 to January 2020) for each of the IDs 1-4.
# Date is stored as a "dd-mm-yyyy" string and Value as an integer.
x<- tibble::tribble(
~ID, ~Date, ~Value,
1L, "01-12-2016", 48L,
1L, "01-01-2017", 10055L,
1L, "01-02-2017", 650L,
1L, "01-03-2017", 8255L,
1L, "01-04-2017", 3680L,
1L, "01-05-2017", 2180L,
1L, "01-06-2017", 2790L,
1L, "01-07-2017", 3805L,
1L, "01-08-2017", 2811L,
1L, "01-09-2017", -225L,
1L, "01-10-2017", -232L,
1L, "01-11-2017", -243L,
1L, "01-12-2017", -217L,
1L, "01-01-2018", -256L,
1L, "01-02-2018", -277L,
1L, "01-03-2018", -3L,
1L, "01-04-2018", -247L,
1L, "01-05-2018", 88L,
1L, "01-06-2018", -260L,
1L, "01-07-2018", -228L,
1L, "01-08-2018", -285L,
1L, "01-09-2018", -321L,
1L, "01-10-2018", -265L,
1L, "01-11-2018", -302L,
1L, "01-12-2018", -11968L,
1L, "01-01-2019", 5435L,
1L, "01-02-2019", 6694L,
1L, "01-03-2019", 4750L,
1L, "01-04-2019", 3747L,
1L, "01-05-2019", 3727L,
1L, "01-06-2019", 3252L,
1L, "01-07-2019", 1691L,
1L, "01-08-2019", 2489L,
1L, "01-09-2019", -182L,
1L, "01-10-2019", 3926L,
1L, "01-11-2019", 326L,
1L, "01-12-2019", -1047L,
1L, "01-01-2020", 14L,
2L, "01-12-2016", -241L,
2L, "01-01-2017", -262L,
2L, "01-02-2017", -231L,
2L, "01-03-2017", -203L,
2L, "01-04-2017", -226L,
2L, "01-05-2017", -223L,
2L, "01-06-2017", -300L,
2L, "01-07-2017", -259L,
2L, "01-08-2017", -241L,
2L, "01-09-2017", -225L,
2L, "01-10-2017", -227L,
2L, "01-11-2017", -243L,
2L, "01-12-2017", -217L,
2L, "01-01-2018", -256L,
2L, "01-02-2018", -277L,
2L, "01-03-2018", 0L,
2L, "01-04-2018", -247L,
2L, "01-05-2018", -274L,
2L, "01-06-2018", -264L,
2L, "01-07-2018", -227L,
2L, "01-08-2018", -275L,
2L, "01-09-2018", -325L,
2L, "01-10-2018", -269L,
2L, "01-11-2018", -306L,
2L, "01-12-2018", -264L,
2L, "01-01-2019", -308L,
2L, "01-02-2019", -332L,
2L, "01-03-2019", -260L,
2L, "01-04-2019", -300L,
2L, "01-05-2019", -302L,
2L, "01-06-2019", -291L,
2L, "01-07-2019", -284L,
2L, "01-08-2019", -288L,
2L, "01-09-2019", -272L,
2L, "01-10-2019", 0L,
2L, "01-11-2019", 0L,
2L, "01-12-2019", -17107L,
2L, "01-01-2020", 3500L,
3L, "01-12-2016", 1940L,
3L, "01-01-2017", 1753L,
3L, "01-02-2017", 2758L,
3L, "01-03-2017", 2539L,
3L, "01-04-2017", -9078L,
3L, "01-05-2017", 5215L,
3L, "01-06-2017", 1796L,
3L, "01-07-2017", -8424L,
3L, "01-08-2017", 19868L,
3L, "01-09-2017", 10707L,
3L, "01-10-2017", 8985L,
3L, "01-11-2017", 3058L,
3L, "01-12-2017", 2469L,
3L, "01-01-2018", 21L,
3L, "01-02-2018", 1039L,
3L, "01-03-2018", 2875L,
3L, "01-04-2018", -2678L,
3L, "01-05-2018", 1515L,
3L, "01-06-2018", 2651L,
3L, "01-07-2018", -5014L,
3L, "01-08-2018", 299L,
3L, "01-09-2018", 1755L,
3L, "01-10-2018", 5009L,
3L, "01-11-2018", 2857L,
3L, "01-12-2018", 2909L,
3L, "01-01-2019", 1353L,
3L, "01-02-2019", 2337L,
3L, "01-03-2019", 3019L,
3L, "01-04-2019", -531L,
3L, "01-05-2019", -1055L,
3L, "01-06-2019", 1706L,
3L, "01-07-2019", -507L,
3L, "01-08-2019", 2234L,
3L, "01-09-2019", 890L,
3L, "01-10-2019", 94L,
3L, "01-11-2019", -1781L,
3L, "01-12-2019", 102590L,
3L, "01-01-2020", 471L,
4L, "01-12-2016", 2658L,
4L, "01-01-2017", 2344L,
4L, "01-02-2017", 2728L,
4L, "01-03-2017", -58L,
4L, "01-04-2017", -226L,
4L, "01-05-2017", -5L,
4L, "01-06-2017", -300L,
4L, "01-07-2017", -259L,
4L, "01-08-2017", -241L,
4L, "01-09-2017", -225L,
4L, "01-10-2017", -229L,
4L, "01-11-2017", -243L,
4L, "01-12-2017", -217L,
4L, "01-01-2018", -245L,
4L, "01-02-2018", -277L,
4L, "01-03-2018", -155L,
4L, "01-04-2018", 5437L,
4L, "01-05-2018", 2866L,
4L, "01-06-2018", 3091L,
4L, "01-07-2018", 3669L,
4L, "01-08-2018", 311L,
4L, "01-09-2018", 4120L,
4L, "01-10-2018", 2357L,
4L, "01-11-2018", -4759L,
4L, "01-12-2018", 4220L,
4L, "01-01-2019", 2730L,
4L, "01-02-2019", 2515L,
4L, "01-03-2019", 2560L,
4L, "01-04-2019", 2864L,
4L, "01-05-2019", 1935L,
4L, "01-06-2019", 938L,
4L, "01-07-2019", 3268L,
4L, "01-08-2019", 3232L,
4L, "01-09-2019", 3347L,
4L, "01-10-2019", 4241L,
4L, "01-11-2019", -247L,
4L, "01-12-2019", 179L,
4L, "01-01-2020", 2542L
)
# Convert the tibble into a plain data.frame.
x<-as.data.frame(x)
我尝试使用 dplyr:
# Fit one non-seasonal ARIMA model per ID using the exhaustive search
# (stepwise = FALSE, approximation = FALSE) and forecast 12 steps ahead.
# The model is fitted once per ID and reused for the forecast, instead of
# repeating the expensive search inside forecast().
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
# Note: this still fits on the full series; no train/test split happens here.
x %>%
  group_by(ID) %>%
  do(fit = auto.arima(.$Value, seasonal = FALSE, stepwise = FALSE,
                      approximation = FALSE)) %>%
  # do() with a named argument returns a row-wise data frame, so `fit` below
  # is the single model belonging to the current ID.
  mutate(fit_forecast = list(forecast(fit, h = 12)))
但我不知道如何添加训练和测试步骤以及指标。有谁知道如何解决它?谢谢 !
您只需将数据框子集化为训练和测试。通常 80/20 拆分或 90/10 拆分。但在你的情况下,因为你似乎想要特定的日期,你甚至可以硬编码行数。
library("lubridate")
# Parse the "dd-mm-yyyy" strings into Date objects so that they sort and
# compare chronologically.
x$Date <- dmy(x$Date)
# A date that failed to parse would become NA and corrupt the split below.
stopifnot(!anyNA(x$Date))
# Order chronologically and reset the row names to 1..n.
x <- x[order(x$Date), ]
rownames(x) <- NULL
# Split on the calendar cut-off rather than on a hard-coded row count.
# With 4 IDs and 25 months up to December 2018 the training set has 100 rows;
# the previous `1:139` did not match that cut-off and split one month
# across IDs.
cutoff <- as.Date("2018-12-01")
is_train <- x$Date <= cutoff
x.train <- x[is_train, ]
x.test <- x[!is_train, ]
# The hold-out set was originally (mis)named `x.text`; keep that name as an
# alias so code written against it still works.
x.text <- x.test
通常还会对数据进行采样以获得 80/20 或 90/10。但是在时间序列中,您想简单地拆分而不是采样。
我已经在 https://community.rstudio.com/t/autoarima-train-and-test-grouped-by-id-in-r/66400 上回答过同样的问题,但为了留存记录,在这里再回答一次。
使用 tsibble 和 fable 包更容易做到这一点,就像这样。
library(dplyr)
library(tsibble)
library(lubridate)
library(fable)
# Convert the data frame into a tsibble: Date becomes a year-month index
# and ID is the key that separates the individual series.
x <- as_tsibble(
  mutate(x, Date = yearmonth(dmy(Date))),
  index = Date,
  key = ID
)

# Training set: every observation up to and including December 2018.
# PDQ(0,0,0) switches off the seasonal part, so one non-seasonal ARIMA model
# is fitted per ID.
fit <- model(
  filter(x, Date <= yearmonth("2018 Dec")),
  ARIMA(Value ~ PDQ(0,0,0), stepwise=FALSE, approximation=FALSE)
)

# Forecast the 13 months after the training window (January 2019 to
# January 2020), score the forecasts against the full data set, and report
# RMSE together with MSE (= RMSE squared) for each ID.
fc <- forecast(fit, h = 13)
accuracy(fc, x) %>%
  mutate(MSE = RMSE^2) %>%
  select(ID, RMSE, MSE)
#> # A tibble: 4 x 3
#> ID RMSE MSE
#> <int> <dbl> <dbl>
#> 1 1 9696. 94011195.
#> 2 2 4792. 22964718.
#> 3 3 27899. 778326636.
#> 4 4 1776. 3153398.
由 reprex package (v0.3.0)
于 2020-05-18 创建
我正在尝试使用 auto.arima 预测时间序列。我需要把数据拆分为训练集和测试集,以便查看模型指标。我的日期范围在 2016 年 12 月到 2020 年 1 月之间。我需要把截至 2018 年 12 月的数据作为训练集,此后的数据作为测试集。
除此之外,我还需要每个 ID 的指标 RMSE 和 MSE。 这是我的数据示例:
# Example data in long format: one row per ID and month, with 38 monthly
# observations (December 2016 to January 2020) for each of the IDs 1-4.
# Date is stored as a "dd-mm-yyyy" string and Value as an integer.
x<- tibble::tribble(
~ID, ~Date, ~Value,
1L, "01-12-2016", 48L,
1L, "01-01-2017", 10055L,
1L, "01-02-2017", 650L,
1L, "01-03-2017", 8255L,
1L, "01-04-2017", 3680L,
1L, "01-05-2017", 2180L,
1L, "01-06-2017", 2790L,
1L, "01-07-2017", 3805L,
1L, "01-08-2017", 2811L,
1L, "01-09-2017", -225L,
1L, "01-10-2017", -232L,
1L, "01-11-2017", -243L,
1L, "01-12-2017", -217L,
1L, "01-01-2018", -256L,
1L, "01-02-2018", -277L,
1L, "01-03-2018", -3L,
1L, "01-04-2018", -247L,
1L, "01-05-2018", 88L,
1L, "01-06-2018", -260L,
1L, "01-07-2018", -228L,
1L, "01-08-2018", -285L,
1L, "01-09-2018", -321L,
1L, "01-10-2018", -265L,
1L, "01-11-2018", -302L,
1L, "01-12-2018", -11968L,
1L, "01-01-2019", 5435L,
1L, "01-02-2019", 6694L,
1L, "01-03-2019", 4750L,
1L, "01-04-2019", 3747L,
1L, "01-05-2019", 3727L,
1L, "01-06-2019", 3252L,
1L, "01-07-2019", 1691L,
1L, "01-08-2019", 2489L,
1L, "01-09-2019", -182L,
1L, "01-10-2019", 3926L,
1L, "01-11-2019", 326L,
1L, "01-12-2019", -1047L,
1L, "01-01-2020", 14L,
2L, "01-12-2016", -241L,
2L, "01-01-2017", -262L,
2L, "01-02-2017", -231L,
2L, "01-03-2017", -203L,
2L, "01-04-2017", -226L,
2L, "01-05-2017", -223L,
2L, "01-06-2017", -300L,
2L, "01-07-2017", -259L,
2L, "01-08-2017", -241L,
2L, "01-09-2017", -225L,
2L, "01-10-2017", -227L,
2L, "01-11-2017", -243L,
2L, "01-12-2017", -217L,
2L, "01-01-2018", -256L,
2L, "01-02-2018", -277L,
2L, "01-03-2018", 0L,
2L, "01-04-2018", -247L,
2L, "01-05-2018", -274L,
2L, "01-06-2018", -264L,
2L, "01-07-2018", -227L,
2L, "01-08-2018", -275L,
2L, "01-09-2018", -325L,
2L, "01-10-2018", -269L,
2L, "01-11-2018", -306L,
2L, "01-12-2018", -264L,
2L, "01-01-2019", -308L,
2L, "01-02-2019", -332L,
2L, "01-03-2019", -260L,
2L, "01-04-2019", -300L,
2L, "01-05-2019", -302L,
2L, "01-06-2019", -291L,
2L, "01-07-2019", -284L,
2L, "01-08-2019", -288L,
2L, "01-09-2019", -272L,
2L, "01-10-2019", 0L,
2L, "01-11-2019", 0L,
2L, "01-12-2019", -17107L,
2L, "01-01-2020", 3500L,
3L, "01-12-2016", 1940L,
3L, "01-01-2017", 1753L,
3L, "01-02-2017", 2758L,
3L, "01-03-2017", 2539L,
3L, "01-04-2017", -9078L,
3L, "01-05-2017", 5215L,
3L, "01-06-2017", 1796L,
3L, "01-07-2017", -8424L,
3L, "01-08-2017", 19868L,
3L, "01-09-2017", 10707L,
3L, "01-10-2017", 8985L,
3L, "01-11-2017", 3058L,
3L, "01-12-2017", 2469L,
3L, "01-01-2018", 21L,
3L, "01-02-2018", 1039L,
3L, "01-03-2018", 2875L,
3L, "01-04-2018", -2678L,
3L, "01-05-2018", 1515L,
3L, "01-06-2018", 2651L,
3L, "01-07-2018", -5014L,
3L, "01-08-2018", 299L,
3L, "01-09-2018", 1755L,
3L, "01-10-2018", 5009L,
3L, "01-11-2018", 2857L,
3L, "01-12-2018", 2909L,
3L, "01-01-2019", 1353L,
3L, "01-02-2019", 2337L,
3L, "01-03-2019", 3019L,
3L, "01-04-2019", -531L,
3L, "01-05-2019", -1055L,
3L, "01-06-2019", 1706L,
3L, "01-07-2019", -507L,
3L, "01-08-2019", 2234L,
3L, "01-09-2019", 890L,
3L, "01-10-2019", 94L,
3L, "01-11-2019", -1781L,
3L, "01-12-2019", 102590L,
3L, "01-01-2020", 471L,
4L, "01-12-2016", 2658L,
4L, "01-01-2017", 2344L,
4L, "01-02-2017", 2728L,
4L, "01-03-2017", -58L,
4L, "01-04-2017", -226L,
4L, "01-05-2017", -5L,
4L, "01-06-2017", -300L,
4L, "01-07-2017", -259L,
4L, "01-08-2017", -241L,
4L, "01-09-2017", -225L,
4L, "01-10-2017", -229L,
4L, "01-11-2017", -243L,
4L, "01-12-2017", -217L,
4L, "01-01-2018", -245L,
4L, "01-02-2018", -277L,
4L, "01-03-2018", -155L,
4L, "01-04-2018", 5437L,
4L, "01-05-2018", 2866L,
4L, "01-06-2018", 3091L,
4L, "01-07-2018", 3669L,
4L, "01-08-2018", 311L,
4L, "01-09-2018", 4120L,
4L, "01-10-2018", 2357L,
4L, "01-11-2018", -4759L,
4L, "01-12-2018", 4220L,
4L, "01-01-2019", 2730L,
4L, "01-02-2019", 2515L,
4L, "01-03-2019", 2560L,
4L, "01-04-2019", 2864L,
4L, "01-05-2019", 1935L,
4L, "01-06-2019", 938L,
4L, "01-07-2019", 3268L,
4L, "01-08-2019", 3232L,
4L, "01-09-2019", 3347L,
4L, "01-10-2019", 4241L,
4L, "01-11-2019", -247L,
4L, "01-12-2019", 179L,
4L, "01-01-2020", 2542L
)
# Convert the tibble into a plain data.frame.
x<-as.data.frame(x)
我尝试使用 dplyr:
# Fit one non-seasonal ARIMA model per ID using the exhaustive search
# (stepwise = FALSE, approximation = FALSE) and forecast 12 steps ahead.
# The model is fitted once per ID and reused for the forecast, instead of
# repeating the expensive search inside forecast().
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
# Note: this still fits on the full series; no train/test split happens here.
x %>%
  group_by(ID) %>%
  do(fit = auto.arima(.$Value, seasonal = FALSE, stepwise = FALSE,
                      approximation = FALSE)) %>%
  # do() with a named argument returns a row-wise data frame, so `fit` below
  # is the single model belonging to the current ID.
  mutate(fit_forecast = list(forecast(fit, h = 12)))
但我不知道如何添加训练和测试步骤以及指标。有谁知道如何解决它?谢谢 !
您只需将数据框子集化为训练和测试。通常 80/20 拆分或 90/10 拆分。但在你的情况下,因为你似乎想要特定的日期,你甚至可以硬编码行数。
library("lubridate")
# Parse the "dd-mm-yyyy" strings into Date objects so that they sort and
# compare chronologically.
x$Date <- dmy(x$Date)
# A date that failed to parse would become NA and corrupt the split below.
stopifnot(!anyNA(x$Date))
# Order chronologically and reset the row names to 1..n.
x <- x[order(x$Date), ]
rownames(x) <- NULL
# Split on the calendar cut-off rather than on a hard-coded row count.
# With 4 IDs and 25 months up to December 2018 the training set has 100 rows;
# the previous `1:139` did not match that cut-off and split one month
# across IDs.
cutoff <- as.Date("2018-12-01")
is_train <- x$Date <= cutoff
x.train <- x[is_train, ]
x.test <- x[!is_train, ]
# The hold-out set was originally (mis)named `x.text`; keep that name as an
# alias so code written against it still works.
x.text <- x.test
通常还会对数据进行采样以获得 80/20 或 90/10。但是在时间序列中,您想简单地拆分而不是采样。
我已经在 https://community.rstudio.com/t/autoarima-train-and-test-grouped-by-id-in-r/66400 上回答过同样的问题,但为了留存记录,在这里再回答一次。
使用 tsibble 和 fable 包更容易做到这一点,就像这样。
library(dplyr)
library(tsibble)
library(lubridate)
library(fable)
# Convert the data frame into a tsibble: Date becomes a year-month index
# and ID is the key that separates the individual series.
x <- as_tsibble(
  mutate(x, Date = yearmonth(dmy(Date))),
  index = Date,
  key = ID
)

# Training set: every observation up to and including December 2018.
# PDQ(0,0,0) switches off the seasonal part, so one non-seasonal ARIMA model
# is fitted per ID.
fit <- model(
  filter(x, Date <= yearmonth("2018 Dec")),
  ARIMA(Value ~ PDQ(0,0,0), stepwise=FALSE, approximation=FALSE)
)

# Forecast the 13 months after the training window (January 2019 to
# January 2020), score the forecasts against the full data set, and report
# RMSE together with MSE (= RMSE squared) for each ID.
fc <- forecast(fit, h = 13)
accuracy(fc, x) %>%
  mutate(MSE = RMSE^2) %>%
  select(ID, RMSE, MSE)
#> # A tibble: 4 x 3
#> ID RMSE MSE
#> <int> <dbl> <dbl>
#> 1 1 9696. 94011195.
#> 2 2 4792. 22964718.
#> 3 3 27899. 778326636.
#> 4 4 1776. 3153398.
由 reprex package (v0.3.0)
于 2020-05-18 创建