按一列分组并比较月度、季度混合数据的当前期间值与以前使用 R

Groupby one column and compare monthly, quarterly mixed data's current period values with previous using R

假设我有一个面板数据如下,它是从编辑的:

df <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("M01", 
"M02", "S01"), class = "factor"), date = structure(c(2L, 3L, 
4L, 5L, 6L, 7L, 8L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L), .Label = c("2020-12", "2021-01", "2021-02", 
"2021-03", "2021-04", "2021-05", "2021-06", "2021-07"), class = "factor"), 
    actual = c(3.4, 5.4, 7.4, 7.4, 7.5, 8, 8.9, 10.8, 10.1, 8.2, 
    10.1, 9.4, 10.1, 9.4, -0.3, NA, NA, 8.6, NA, NA, 8.3, NA), 
    pred = c(3.288889774, 5.819407687, 6.705608369, 6.054457292, 
    5.582409131, 7.01052472, 9.742902434, 10.98571396, 6.522003651, 
    9.688977242, 10.39801463, 9.398991615, 9.764616936, 9.855033457, 
    0.493311422, 8.403722942, 8.174854517, 8.573117852, 8.403065801, 
    8.684289455, 8.719079247, 8.259439468)), class = "data.frame", row.names = c(NA, 
-22L)) 

在groupby id之后,对于每个月的实际值和预测值,我需要检查当月的实际值和预测值与上个月的实际值相比是否具有相同的方向。新建列的逻辑是:如果两个月的差值为positivenegativezeros,则returnincreasedecreaseunchanged,如果其中一个或两个值都是NA,则return NA.

df %>% 
  # mutate(year = as.integer(year)) %>% 
  group_by(id) %>% 
  # arrange(date) %>% 
  mutate(act_direction = case_when(actual > lag(actual) ~ "increase",
                                   actual < lag(actual) ~ "decrease",
                                   actual == lag(actual) ~ "unchanged"),
         pred_direction = case_when(pred > lag(actual) ~ "increase",
                                    pred < lag(actual) ~ "decrease",
                                    pred == lag(actual) ~ "unchanged"))

如果所有 id 都是月度数据,上面的代码可以顺利运行。但是对于这个例子,我们会有例外:假设 id=='S01',它是季度数据而不是月度数据,所以我需要比较值,即 2021-032020-12 而不是 2021-02,其他月份的逻辑相同。

我该如何修改代码以适应这种情况?谢谢。

预期结果:

    id    date actual       pred act_direction pred_direction
1  M01 2021-01    3.4  3.2888898                             
2  M01 2021-02    5.4  5.8194077      increase       increase
3  M01 2021-03    7.4  6.7056084      increase       increase
4  M01 2021-04    7.4  6.0544573     unchanged       decrease
5  M01 2021-05    7.5  5.5824091      increase       decrease
6  M01 2021-06    8.0  7.0105247      increase       decrease
7  M01 2021-07    8.9  9.7429024      increase       increase
8  M02 2021-01   10.8 10.9857140                             
9  M02 2021-02   10.1  6.5220037      decrease       decrease
10 M02 2021-03    8.2  9.6889772      decrease       decrease
11 M02 2021-04   10.1 10.3980146      increase       increase
12 M02 2021-05    9.4  9.3989916      decrease       decrease
13 M02 2021-06   10.1  9.7646169      increase       increase
14 M02 2021-07    9.4  9.8550335      decrease       decrease
15 S01 2020-12   -0.3  0.4933114                             
16 S01 2021-01     NA  8.4037229                             
17 S01 2021-02     NA  8.1748545                             
18 S01 2021-03    8.6  8.5731179      increase       increase  # compare with S01's actual value in 2020-12
19 S01 2021-04     NA  8.4030658                             
20 S01 2021-05     NA  8.6842895                             
21 S01 2021-06    8.3  8.7190792      decrease       increase  # compare with S01's actual value in 2021-03
22 S01 2021-07     NA  8.2594395                             

data.table解决方案

library(data.table)
setDT(df)
v <- c('actual', 'pred')
df[!is.na(actual),
   (paste0(v,'_direction')) := lapply(.SD, function(x){
     return(case_when(
       x > lag(actual) ~ "increase",
       x < lag(actual) ~ "decrease",
       x == lag(actual) ~ "unchanged"
       )
       )
   }),
by = .(id),
.SDcols = v
]
print(df)

     id    date actual       pred actual_direction pred_direction
 1: M01 2021-01    3.4  3.2888898             <NA>           <NA>
 2: M01 2021-02    5.4  5.8194077         increase       increase
 3: M01 2021-03    7.4  6.7056084         increase       increase
 4: M01 2021-04    7.4  6.0544573        unchanged       decrease
 5: M01 2021-05    7.5  5.5824091         increase       decrease
 6: M01 2021-06    8.0  7.0105247         increase       decrease
 7: M01 2021-07    8.9  9.7429024         increase       increase
 8: M02 2021-01   10.8 10.9857140             <NA>           <NA>
 9: M02 2021-02   10.1  6.5220037         decrease       decrease
10: M02 2021-03    8.2  9.6889772         decrease       decrease
11: M02 2021-04   10.1 10.3980146         increase       increase
12: M02 2021-05    9.4  9.3989916         decrease       decrease
13: M02 2021-06   10.1  9.7646169         increase       increase
14: M02 2021-07    9.4  9.8550335         decrease       decrease
15: S01 2020-12   -0.3  0.4933114             <NA>           <NA>
16: S01 2021-01     NA  8.4037229             <NA>           <NA>
17: S01 2021-02     NA  8.1748545             <NA>           <NA>
18: S01 2021-03    8.6  8.5731179         increase       increase
19: S01 2021-04     NA  8.4030658             <NA>           <NA>
20: S01 2021-05     NA  8.6842895             <NA>           <NA>
21: S01 2021-06    8.3  8.7190792         decrease       increase
22: S01 2021-07     NA  8.2594395             <NA>           <NA>
     id    date actual       pred actual_direction pred_direction

您可以尝试使用临时 fill,然后通过引用原始数据删除不需要的值。

library(dplyr)
library(tidyr)

chg <- c("decrease", "unchanged", "increase")

df %>% 
  group_by(id) %>% 
  mutate(actual2=actual) %>% 
  fill(actual2) %>% 
  mutate(act_direction = case_when(
              actual2 > lag(actual2) ~ chg[3], 
              actual2 < lag(actual2) ~ chg[1], 
              actual2 == lag(actual2) ~ chg[2]), 
         pred_direction = case_when(
              pred > lag(actual2) ~ chg[3], 
              pred < lag(actual2) ~ chg[1], 
              pred == lag(actual2) ~ chg[2]), 
         act_direction=ifelse(is.na(actual),NA,act_direction),
         pred_direction=ifelse(is.na(actual),NA,pred_direction), actual2=NULL) %>%
  mutate(across(c(act_direction, pred_direction), replace_na, "")) %>%
  ungroup() %>%
  print(n=nrow(.))
A tibble: 22 × 6
 id    date    actual   pred act_direction pred_direction
 <fct> <fct>    <dbl>  <dbl> <chr>         <chr>         
 M01   2021-01    3.4  3.29  ""            ""            
 M01   2021-02    5.4  5.82  "increase"    "increase"    
 M01   2021-03    7.4  6.71  "increase"    "increase"    
 M01   2021-04    7.4  6.05  "unchanged"   "decrease"    
 M01   2021-05    7.5  5.58  "increase"    "decrease"    
 M01   2021-06    8    7.01  "increase"    "decrease"    
 M01   2021-07    8.9  9.74  "increase"    "increase"    
 M02   2021-01   10.8 11.0   ""            ""            
 M02   2021-02   10.1  6.52  "decrease"    "decrease"    
 M02   2021-03    8.2  9.69  "decrease"    "decrease"    
 M02   2021-04   10.1 10.4   "increase"    "increase"    
 M02   2021-05    9.4  9.40  "decrease"    "decrease"    
 M02   2021-06   10.1  9.76  "increase"    "increase"    
 M02   2021-07    9.4  9.86  "decrease"    "decrease"    
 S01   2020-12   -0.3  0.493 ""            ""            
 S01   2021-01   NA    8.40  ""            ""            
 S01   2021-02   NA    8.17  ""            ""            
 S01   2021-03    8.6  8.57  "increase"    "increase"    
 S01   2021-04   NA    8.40  ""            ""            
 S01   2021-05   NA    8.68  ""            ""            
 S01   2021-06    8.3  8.72  "decrease"    "increase"    
 S01   2021-07   NA    8.26  ""            ""