在 R 中使用 dplyr 处理回归模型的数据

Manipulating data for Regression Model using dplyr in R

我有这样的数据。

library(lubridate)

## Build a reproducible hourly sales table covering 2021-01-01 00:00
## through 2021-09-30 23:00 (one row per hour).
set.seed(2021)
gen_date <- seq(
  from = ymd_h("2021-01-01-00"),
  to = ymd_h("2021-09-30-23"),
  by = "hours"
)

## Split every timestamp into its calendar date and its hour of day.
datex <- date(gen_date)
hourx <- hour(gen_date)

## Random sales: uniform draws on [10, 50], rounded to whole numbers and
## scaled to multiples of 100.
sales <- 100 * round(runif(length(gen_date), min = 10, max = 50), digits = 0)

mydata <- data.frame(datex = datex, hourx = hourx, sales = sales)

## Preview the first six rows; the commented lines show the printed output.
head(mydata)
#       datex hourx sales
#1 2021-01-01     0  2800
#2 2021-01-01     1  4100
#3 2021-01-01     2  3800
#4 2021-01-01     3  2500
#5 2021-01-01     4  3500
#6 2021-01-01     5  3800

## Preview the last six rows (2021-09-30, hours 18 to 23).
tail(mydata)
#          datex hourx sales
#6547 2021-09-30    18  3900
#6548 2021-09-30    19  3600
#6549 2021-09-30    20  3000
#6550 2021-09-30    21  4700
#6551 2021-09-30    22  4700
#6552 2021-09-30    23  3600

我的任务是用线性回归建模,但数据处理起来比较棘手。假设我们有 1 月到 3 月的数据,需要用这些数据来预测 4 月的数据。步骤如下:

## Training windows: 672 hourly rows (28 days x 24 hours) from each month.
## Rows 673-744 (January 29-31) are skipped so all three windows have the
## same length.
data_jan <- mydata[seq_len(672), ]
data_feb <- mydata[seq(745, 1416), ]
data_mar <- mydata[seq(1417, 2088), ]

## Regress March sales on the January and February sales found at the same
## row position within each window.
mydata_reg <- data.frame(
  x1 = data_jan[["sales"]],
  x2 = data_feb[["sales"]],
  y = data_mar[["sales"]]
)
model_reg <- lm(y ~ ., data = mydata_reg)

## Shift the predictors forward by one month (February and March) to
## predict the first 28 days of April.
mydata_reg_for <- data.frame(
  x1 = data_feb[["sales"]],
  x2 = data_mar[["sales"]]
)
pred_data_apr <- predict(model_reg, newdata = mydata_reg_for)

## April has 30 days, so 48 more hourly predictions are needed. The extra
## predictors are the 48 rows that follow each window: rows 1417-1464
## (March 1-2) for x1 and rows 2089-2136 (March 29-30) for x2.
data_feb_add <- mydata[seq(1417, 1464), ]
data_mar_add <- mydata[seq(2089, 2136), ]
mydata_reg_add <- data.frame(
  x1 = data_feb_add[["sales"]],
  x2 = data_mar_add[["sales"]]
)
pred_data_apr_add <- predict(model_reg, newdata = mydata_reg_add)

## Full April forecast as a plain (unnamed) numeric vector.
data_apr <- as.numeric(c(pred_data_apr, pred_data_apr_add))

我的问题是:如何使用 dplyr 包让这个过程每个月自动运行?难点在于每个月的天数不同。我以二月份的天数为准来截取数据,因为它的天数最少。这一做法同样适用于其他月份。非常感谢。

您可以直接用 group_split 按月拆分数据:
## dplyr supplies `%>%` and group_split(); without it this snippet fails
## with "could not find function".
library(dplyr)

## Split mydata into a list of data frames, one per calendar month of
## `datex`. month() comes from lubridate, which must already be attached.
mydata %>%
  group_split(month(datex))

此代码会把 mydata 按月拆分成一个列表,列表中的每个元素是对应月份的 data frame(示例数据只覆盖 1–9 月,因此共 9 个元素;完整一年的数据则为 12 个)。

如果您想控制每个月之后(或每个月中)的天数,您可以按日期而不是行号进行过滤。

这段代码肯定还可以整理得更简洁,但您只需要把 forcast_date <- as.Date("2021-04-01") 改成您想要预测的月份即可(注意代码中的变量名拼写为 forcast_date)。

## Set the forecast month as the first day of the month to predict. Only
## this value needs to change to forecast another month, so it is easy to
## automate with a list or an increment.
forcast_date <- as.Date("2021-04-01") # April

## Number of days in the forecast month. This is how many days of predictor
## data the forecast step needs (the data_feb_add / data_mar_add step).
forcast_month_length <- days_in_month(forcast_date) # 30 days

## First day of each of the three months preceding the forecast month.
month_1_date <- forcast_date %m-% months(3)
month_2_date <- forcast_date %m-% months(2)
month_3_date <- forcast_date %m-% months(1)

## Find the shortest month in that range. All three months must be
## checked; month_3_date was previously omitted because month_2_date was
## passed twice.
shortest_month <- min(c(days_in_month(month_1_date),
                        days_in_month(month_2_date),
                        days_in_month(month_3_date))) # 28 days

## Return the rows of `data` whose `datex` falls in the window of `n_days`
## consecutive days starting at `start_date` (inclusive).
##
## Dates are compared directly instead of building `start:(start + n - 1)`:
## `:` drops the Date class and yields bare day counts, so `%in%` would
## depend on implicit Date-to-number matching. which() drops rows whose
## `datex` is NA rather than returning all-NA rows.
select_days <- function(data, start_date, n_days) {
  # as.integer() strips the month name that days_in_month() attaches.
  end_date <- start_date + as.integer(n_days)
  data[which(data$datex >= start_date & data$datex < end_date), ]
}

## Select the first `shortest_month` days (28 here) of each month used for
## the model variables, so all three windows have the same length.
data_month_1 <- select_days(mydata, month_1_date, shortest_month)
data_month_2 <- select_days(mydata, month_2_date, shortest_month)
data_month_3 <- select_days(mydata, month_3_date, shortest_month)

## Select as many days as the forecast month has (30 for April) from the
## start of each predictor month. The window may run past the end of a
## shorter month into the following one.
month_2_forecast_length <- select_days(
  mydata, month_2_date, forcast_month_length
)
month_3_forecast_length <- select_days(
  mydata, month_3_date, forcast_month_length
)