For循环效率

For Loop Efficiency

下面的循环可以正常工作,也能得到我想要的结果,但由于我需要遍历大型数据集,我想找到一种提高其效率的方法。也许可以使用 purrr 函数?

library(tidyverse)
library(timetk)
#### CREATE DATA
# Simulate 36 monthly observations (starting 2016-01-01) for one product.
# Each numeric column is an independent runif(36, 5, 100) draw rounded to
# whole units, drawn in the order Inventory, Purchases, Sales,
# Ending_Inventory. No seed is set, so the values differ between runs.
simulate_product <- function(product_name) {
  n_months <- 36
  draw_units <- function() round(runif(n_months, 5, 100), 0)

  history <- data.frame(
    Date = seq.Date(as.Date("2016-01-01"), length.out = n_months, by = "month"),
    Inventory = draw_units(),
    Purchases = draw_units(),
    Sales = draw_units(),
    Ending_Inventory = draw_units()
  )

  # Starting inventory is the previous row's ending inventory, so the first
  # month has no starting value.
  mutate(
    history,
    Starting_Inventory = lag(Ending_Inventory, 1),
    product = product_name
  )
}

df_1 <- simulate_product("Product_1")
df_2 <- simulate_product("Product_2")

# Stack both products, group by product, and let timetk extend the Date
# column by "12 months" per group; .bind_data = TRUE asks for the extension
# to be returned together with the existing rows.
df <- timetk::future_frame(
  group_by(rbind(df_1, df_2), product),
  .date_var = Date,
  .length_out = "12 months",
  .bind_data = TRUE
)

这里我创建了一个日期序列,供 for 循环逐个遍历。

#### CREATE DATE SEQUENCE
# Monthly dates running from 36 to 48 months after the earliest Date in df,
# with both endpoints included.
Dates <- local({
  origin <- min(df$Date)
  seq(
    from = origin %m+% months(36),
    to = origin %m+% months(48),
    by = "month"
  )
})

循环会依次遍历上面序列中的每个日期来填充未来的数据;随后我进行连接(join)、重命名一些列,并删除所有名称中包含“.y”的列……感觉我执行了一些不必要的步骤。

# Roll the forecast forward one month at a time. For the month being filled
# (Dates[i]) the columns are set, in this order, to:
#   Purchases          = Purchases 12 rows earlier * 1.05
#   Starting_Inventory = previous row's Ending_Inventory
#   Sales              = Sales 12 rows earlier * 1.15
#   Ending_Inventory   = Starting_Inventory + Sales + Purchases
#   Inventory          = Ending_Inventory
# Rows dated before Dates[i] keep their values; rows dated after it are set
# to 0 until their own iteration overwrites them.
#
# mutate() evaluates its arguments in order, so Ending_Inventory sees the
# Starting_Inventory, Sales and Purchases computed just above it, while
# lag(Ending_Inventory, 1) still reads the values from the previous iteration.
# NOTE(review): lag() only stays within a product if df is still grouped by
# product at this point -- confirm that future_frame() keeps the grouping.
#
# The mutate() already writes the new values into df, so no join back onto
# df is needed. seq_along() keeps the loop from running when Dates is empty
# (1:length(Dates) would iterate over c(1, 0)).
for (i in seq_along(Dates)) {
  step_date <- Dates[i]

  df <- df %>%
    mutate(
      Purchases = case_when(
        Date < step_date ~ Purchases,
        Date == step_date ~ lag(Purchases, 12) * 1.05,
        TRUE ~ 0
      ),
      Starting_Inventory = case_when(
        Date < step_date ~ Starting_Inventory,
        Date == step_date ~ lag(Ending_Inventory, 1),
        TRUE ~ 0
      ),
      Sales = case_when(
        Date < step_date ~ Sales,
        Date == step_date ~ lag(Sales, 12) * 1.15,
        TRUE ~ 0
      ),
      Ending_Inventory = case_when(
        Date < step_date ~ Ending_Inventory,
        Date == step_date ~ Starting_Inventory + Sales + Purchases,
        TRUE ~ 0
      ),
      Inventory = case_when(
        Date < step_date ~ Inventory,
        Date == step_date ~ Ending_Inventory,
        TRUE ~ 0
      )
    )

  # Progress indicator.
  print(i)
}

里面有很多不必要的步骤。

  1. mutate() 一次可以接受多个表达式。
  2. case_when 是不必要的,因为在下一步中您只保留修改过的行。
  3. 然后,同理,连接和重命名的步骤比需要的多,您可以通过选择子集将旧行替换为新行。
# Simplified loop: on every pass, recompute the projected columns for the
# whole table, then copy only the rows dated Dates[i] back into df2.
# NOTE(review): df2 is not defined in this snippet; presumably it is a copy
# of df made before any forecasting -- confirm.
for (i in seq_along(Dates)) {
  projected <- mutate(
    df2,
    Purchases = lag(Purchases, 12) * 1.05,
    Starting_Inventory = lag(Ending_Inventory, 1),
    Sales = lag(Sales, 12) * 1.15,
    Ending_Inventory = Starting_Inventory + Sales + Purchases,
    Inventory = Ending_Inventory
  )

  # mutate() leaves Date and the row order untouched, so one mask selects
  # the same rows in both tables.
  at_step <- df2$Date == Dates[i]
  df2[at_step, ] <- projected[at_step, ]
}
  1. 但是您仍在每次循环中重新计算整个 data.frame。这也没有必要,因为 mutate() 会按书写顺序依次计算各个表达式,后面的表达式可以直接使用前面刚更新的列。您只需使用该函数即可完成所有操作。
  2. 此外,由于只需要 2 个条件,您可以将 case_when 替换为 ifelse,这样会更快。
# Vectorised replacement for the month-by-month loop: fills every forecast
# row (Date %in% Dates) in a single mutate() call.
#
# Why the cumulative sum: each forecast month's ending inventory depends on
# the previous forecast month's ending inventory. Every mutate() expression
# works on whole columns, so taking lag(Ending_Inventory, 1) before
# Ending_Inventory has been filled only yields a value for the first forecast
# month and leaves the later ones NA. Unrolling the recursion gives
#   Ending_Inventory[t] = last historical Ending_Inventory
#                         + running total of (Sales + Purchases) over the
#                           forecast rows up to and including t
# and Starting_Inventory is then simply the lag of the filled column.
#
# NOTE(review): assumptions to confirm for your data:
#   * df is grouped by product and sorted by Date within each group, with the
#     forecast rows after the historical rows;
#   * the forecast horizon is at most 12 rows, so lag(x, 12) on a forecast row
#     always lands on a historical row.
# Results agree with the loop up to floating-point rounding, because the
# additions are grouped differently.
df <- df %>%
  mutate(
    Purchases = ifelse(
      Date %in% Dates, lag(Purchases, 12) * 1.05, Purchases
    ),
    Sales = ifelse(
      Date %in% Dates, lag(Sales, 12) * 1.15, Sales
    ),
    Ending_Inventory = ifelse(
      Date %in% Dates,
      # Anchor on the last historical value, then accumulate the forecast
      # flows; historical rows contribute 0 to the running total.
      last(Ending_Inventory[!(Date %in% Dates)]) +
        cumsum(ifelse(Date %in% Dates, Sales + Purchases, 0)),
      Ending_Inventory
    ),
    # Computed after Ending_Inventory so the lag sees the filled values.
    Starting_Inventory = ifelse(
      Date %in% Dates, lag(Ending_Inventory, 1), Starting_Inventory
    ),
    Inventory = ifelse(
      Date %in% Dates, Ending_Inventory, Inventory
    )
  )

编辑:

我认为当您遇到这样的长循环时,分解您要执行的操作很重要。由于您正在尝试进行就地修改,即使是在 base R 中,您也可以使用这个简短的 for 循环来完成此操作:

# Base R alternative: fill the forecast rows in place, one row at a time,
# inside within() so the columns can be addressed as plain vectors.
# NOTE(review): df.o is not defined in this snippet; presumably it is a copy
# of df taken before any forecasting -- confirm.
# NOTE(review): i - 12 and i - 1 are plain row offsets into the whole table
# (no grouping is applied here), so this assumes rows are ordered by product
# and then Date, with at least 12 earlier rows of the same product before
# every forecast row -- confirm.
df3 <- df.o
df3 <- df3 |> within({
  for (i in which(Date %in% Dates)) {
    Purchases[i] <- Purchases[i - 12] * 1.05
    Sales[i] <- Sales[i - 12] * 1.15
    # Must be assigned before Ending_Inventory[i], which reads it.
    Starting_Inventory[i] <- Ending_Inventory[i - 1]
    Ending_Inventory[i] <- Starting_Inventory[i] + Sales[i] + Purchases[i]
    Inventory[i] <- Ending_Inventory[i]
  }
  # Drop the loop index so within() does not add it as a column.
  i <- NULL
})

比mutate慢一点,但逻辑是一样的