R Prophet add_regressor 给出奇怪的结果

R Prophet add_regressor giving strange results

我正在(第一次)尝试使用 add_regressor 函数向 prophet 添加一个外部变量,但得到的结果看起来很奇怪。我使用的数据集是众所周知的洗发水销售数据集,可在 Kaggle 上免费获得。作为外部变量,我通过 R 的 quantmod 包获取可免费下载的 SPY 股票指数数据。

以下是我开始编写代码的方式:

library(prophet)
library(quantmod)
library(dplyr)


# Load the shampoo sales data; the renames further down show that it has the
# columns Date and Value.
df <- read.csv("~/shampoo.csv")


#now get the min and max dates in the column
# NOTE(review): Date is never converted with as.Date() in this script, so
# min()/max() operate on whatever type read.csv() produced - confirm.
min_date <- min(df$Date, na.rm = TRUE)
max_date <- max(df$Date, na.rm = TRUE)


#download the SPY stock data
# The downloaded series is used below under the name SPY.
getSymbols("SPY", from  = min_date, to = max_date)

#SPY closes stored into a df and massage a bit
# The dates are taken from the row names, so the ds column holds text rather
# than Date values. Two copies are kept: Close (price column renamed to y) is
# the training data for the SPY model, and Close_no_rename (price column
# renamed to SPY_CLOSE) is merged onto the shampoo data as the regressor.
Close <- data.frame(Cl(SPY))
Close <- cbind(ds = rownames(Close), Close)
rownames(Close) <- NULL
Close_no_rename <- Close
colnames(Close)[colnames(Close) == 'SPY.Close'] <- 'y'
colnames(Close_no_rename)[colnames(Close_no_rename) == 'SPY.Close'] <- 'SPY_CLOSE'


#now put this into prophet and make a forecast for the forecast_period for SPY
stock_model <- prophet(Close)


#make a forecast dataframe
# 30 monthly periods, future dates only (include_history = FALSE).
future_stocks <- make_future_dataframe(stock_model, periods = 30, freq = "month", include_history = FALSE)

#the below df will have predicted stock prices of the SPY. want to extract the future y values as point forecast along with dates
# yhat is renamed to SPY_CLOSE so this frame can supply the regressor values
# when predicting with the final model.
forecast <- predict(stock_model, future_stocks) %>% select(ds, yhat)
colnames(forecast)[colnames(forecast) == 'yhat'] <- 'SPY_CLOSE'


#rename the columns of the actual df
colnames(df)[colnames(df) == 'Date'] <- 'ds'
colnames(df)[colnames(df) == 'Value'] <- 'y'

#now want to merge the Close df y historic values onto the training df, merge by date ds column
# merge() with its default arguments keeps only the ds values found in both
# data frames.
# NOTE(review): the shampoo dates are the first of each month and may not all
# appear among the SPY dates; check how many rows survive the merge.
df_historic_with_SPY_close <- merge(df, Close_no_rename, by = "ds")


#now actually forecast using prophet
# An unfitted model is created first so the regressor can be added before
# fitting.
model <- prophet()

#add the SPY regressor
model <- add_regressor(model, 'SPY_CLOSE', prior.scale = 0.0000001, standardize = FALSE)
model <- fit.prophet(model, df_historic_with_SPY_close)


# Predict on the future dates, using the forecast SPY_CLOSE values as the
# regressor input.
forecast_final <- predict(model, forecast)

plot(model, forecast_final)

这不会抛出任何错误,但预测图看起来……不对,似乎是尺度(scale)出了问题之类的。我尝试调整过 prior.scale(先验)和 standardize(标准化)这两个设置,但没有成功。谢谢你的帮助!

这里是用作主要变量的洗发水数据集:

Date    Value
2017-01-01  266
2017-02-01  145.9
2017-03-01  183.1
2017-04-01  119.3
2017-05-01  180.3
2017-06-01  168.5
2017-07-01  231.8
2017-08-01  224.5
2017-09-01  192.8
2017-10-01  122.9
2017-11-01  336.5
2017-12-01  185.9
2018-01-01  194.3
2018-02-01  149.5
2018-03-01  210.1
2018-04-01  273.3
2018-05-01  191.4
2018-06-01  287
2018-07-01  226
2018-08-01  303.6
2018-09-01  289.9
2018-10-01  421.6
2018-11-01  264.5
2018-12-01  342
2019-01-01  339.7
2019-02-01  440.4
2019-03-01  315.9
2019-04-01  439.3
2019-05-01  401.3
2019-06-01  437.4
2019-07-01  575.5
2019-08-01  407.6
2019-09-01  682
2019-10-01  475.3
2019-11-01  581.3
2019-12-01  646.9

我想我已经把问题修好了,不过我所做的改动只有两处:把日期从因子(factor)格式转换为日期(Date)格式,并显式指定使用 dplyr 包中的 select 函数。另外,我是在没有加载任何其他包的情况下运行 R 的。所以,为什么这样就能正常工作,仍然是个谜。

我遇到这个问题,是因为下面这一行没有正常工作:

df_historic_with_SPY_close <- merge(df, Close_no_rename, by = "ds")

我发现原因是:我已把 df 中的日期转换为日期(Date)格式,以便与 getSymbols 一起使用,但这样它们就与 Close_no_rename 中的日期格式不同了。

首先,我使用的数据:

# Shampoo sales data as produced by dput(df), assigned directly so that this
# snippet rebuilds `df` on its own. ds holds Date values stored as day counts
# since 1970-01-01; y holds the sales figures.
df <- structure(list(ds = structure(c(17167, 17198, 17226, 17257, 17287,
17318, 17348, 17379, 17410, 17440, 17471, 17501, 17532, 17563,
17591, 17622, 17652, 17683, 17713, 17744, 17775, 17805, 17836,
17866, 17897, 17928, 17956, 17987, 18017, 18048, 18078, 18109,
18140, 18170, 18201, 18231), class = "Date"), y = c(266, 145.9,
183.1, 119.3, 180.3, 168.5, 231.8, 224.5, 192.8, 122.9, 336.5,
185.9, 194.3, 149.5, 210.1, 273.3, 191.4, 287, 226, 303.6, 289.9,
421.6, 264.5, 342.3, 339.7, 440.4, 315.9, 439.3, 401.3, 437.4,
575.5, 407.6, 682, 475.3, 581.3, 646.9)), row.names = c(NA, -36L
), class = "data.frame")


    library(prophet)
    library(quantmod)
    library(dplyr)
    
# can use your df, rather than above
    # Load the shampoo sales data. If `df` was already built another way
    # (columns ds and y), skip the read.csv() call; every later step still
    # applies.
    df <- read.csv("~/shampoo.csv")

    # Use the column names prophet expects. These two lines do nothing when
    # the columns are already called ds and y.
    colnames(df)[colnames(df) == "Date"] <- "ds"
    colnames(df)[colnames(df) == "Value"] <- "y"

    # Convert the dates to Date class. as.character() makes this work whether
    # the column is a factor, character or already Date. Day-month-year is
    # tried first (the format used originally), then ISO year-month-day.
    df$ds <- as.Date(
      as.character(df$ds),
      tryFormats = c("%d-%m-%y", "%Y-%m-%d")
    )
    str(df) # check

    # Date range covered by the shampoo data, used as the SPY download window.
    min_date <- min(df$ds, na.rm = TRUE)
    max_date <- max(df$ds, na.rm = TRUE)

    # Download the SPY stock data; it is used below under the name SPY.
    getSymbols("SPY", from = min_date, to = max_date)

    # Closing prices as a data frame, with the dates (taken from the row
    # names) in a ds column. Close feeds the SPY model; Close_no_rename is
    # the regressor table merged onto the shampoo data.
    Close <- data.frame(Cl(SPY))
    Close <- cbind(ds = rownames(Close), Close)
    rownames(Close) <- NULL
    Close_no_rename <- Close
    colnames(Close)[colnames(Close) == "SPY.Close"] <- "y"
    colnames(Close_no_rename)[
      colnames(Close_no_rename) == "SPY.Close"
    ] <- "SPY_CLOSE"

    # Make these dates Date class too, so both sides of the merge below have
    # the same type.
    Close_no_rename$ds <- as.Date(Close_no_rename$ds)
    str(Close_no_rename)

    # Fit a prophet model to SPY itself and forecast 30 monthly periods
    # (future dates only).
    stock_model <- prophet(Close)
    future_stocks <- make_future_dataframe(
      stock_model,
      periods = 30,
      freq = "month",
      include_history = FALSE
    )

    # Keep the dates and point forecasts, renamed to SPY_CLOSE so they can
    # serve as the future regressor values. dplyr::select is named explicitly
    # in case another attached package masks select().
    forecast <- predict(stock_model, future_stocks) %>%
      dplyr::select(ds, yhat)
    colnames(forecast)[colnames(forecast) == "yhat"] <- "SPY_CLOSE"

    # Attach the historic SPY closes to the training data by date. Only dates
    # present in both data frames are kept, so print the result and check it.
    df_historic_with_SPY_close <- merge(df, Close_no_rename, by = "ds")
    df_historic_with_SPY_close

    # Create an unfitted model, add the SPY regressor, then fit.
    model <- prophet()
    model <- add_regressor(
      model,
      "SPY_CLOSE",
      prior.scale = 0.0000001,
      standardize = FALSE
    )
    model <- fit.prophet(model, df_historic_with_SPY_close)

    # Predict on the future dates using the forecast SPY_CLOSE values.
    forecast_final <- predict(model, forecast)

    plot(model, forecast_final)

结果:

编辑

使用以下数据集(其中的日期已改为 SPY 数据集中存在的相近日期):

# Shampoo sales data as produced by dput(df), assigned directly so that this
# snippet rebuilds `df` on its own. ds holds Date values stored as day counts
# since 1970-01-01; y holds the sales figures.
df <- structure(list(ds = structure(c(17169, 17198, 17226, 17259, 17287,
17318, 17350, 17379, 17410, 17442, 17471, 17501, 17534, 17563,
17591, 17624, 17652, 17683, 17715, 17744, 17778, 17805, 17836,
17868, 17898, 17928, 17956, 17987, 18017, 18050, 18078, 18109,
18142, 18170, 18201, 18232), class = "Date"), y = c(266, 145.9,
183.1, 119.3, 180.3, 168.5, 231.8, 224.5, 192.8, 122.9, 336.5,
185.9, 194.3, 149.5, 210.1, 273.3, 191.4, 287, 226, 303.6, 289.9,
421.6, 264.5, 342.3, 339.7, 440.4, 315.9, 439.3, 401.3, 437.4,
575.5, 407.6, 682, 475.3, 581.3, 646.9)), class = "data.frame", row.names = c(NA,
-36L))

我们得到这个,看起来好多了:

编辑 2

问题与缺失数据有关:洗发水数据集中的某些日期不在 SPY 数据集中。下面的代码会选取 SPY 数据集中日期最接近的数据,以解决数据缺失的问题。但是,它生成的图表看起来仍然很奇怪,而日期的细微改动似乎正是问题的原因。

替换行:

df_historic_with_SPY_close <- merge(df, Close_no_rename, by = "ds")

替换为以下代码(此处致谢对象的名字在原文中缺失):

library(data.table)

# Convert both tables to data.table by reference so a rolling join can be used.
setDT(Close_no_rename)
setDT(df)

# Key the SPY table by date and copy each trading date into dateMatch. After
# the join, ds holds the shampoo date, so dateMatch is the only record of
# which trading day was actually matched.
setkey(Close_no_rename, ds)
Close_no_rename[, dateMatch := ds]

# For every shampoo date, take the SPY row with the nearest date, so that
# shampoo dates missing from the SPY data are no longer dropped.
df_historic_with_SPY_close <- Close_no_rename[df, roll = "nearest"]

# Replace the shampoo date with the matched trading date. The column is
# dropped by name rather than by position.
df_historic_with_SPY_close[, ds := NULL]
setnames(df_historic_with_SPY_close, "dateMatch", "ds")
df_historic_with_SPY_close

# Make sure ds is of class Date. Base as.Date() is used because ymd() comes
# from lubridate, which this script never loads.
df_historic_with_SPY_close[, ds := as.Date(ds)]
str(df_historic_with_SPY_close)