R 比较列表并识别个别更改

R Compare Lists and Identify Individual Changes

这是 R 中的一个冰淇淋店数据框,它显示了商店每个月的口味。

df <- data.frame(date = as.Date(c(rep("2022-01-01", 3), 
                                  rep("2022-02-01", 3),
                                  rep("2022-03-01", 4))),
                 flavor = c("Almond", "Apple", "Apricot", 
                            "Almond", "Maple", "Mint",
                            "Almond", "Maple", "Mint", "Pumpkin"))

#>         date  flavor
#> 1  2022-01-01  Almond
#> 2  2022-01-01   Apple
#> 3  2022-01-01 Apricot
#> 4  2022-02-01  Almond
#> 5  2022-02-01   Maple
#> 6  2022-02-01    Mint
#> 7  2022-03-01  Almond
#> 8  2022-03-01   Maple
#> 9  2022-03-01    Mint
#> 10 2022-03-01 Pumpkin

我创建了一个脚本来显示任何特定月份添加的冰淇淋口味。您还可以看到已删除的口味,作为列表。请注意,3 月份没有删除任何口味(2022-03-01flavors.removed 等于 <chr [0]>)。

library(dplyr)
library(tidyr)
df %>% 
  group_by(date) %>% 
  summarize(flavors = list(flavor)) %>% 
  mutate(flavors.added = mapply(setdiff, flavors, lag(flavors)),
         flavors.removed = mapply(setdiff, lag(flavors), flavors)) %>% 
  ungroup %>% 
  select(-flavors) %>% 
  unnest_longer(flavors.added)

#> # A tibble: 6 x 3
#>   date       flavors.added flavors.removed
#>   <date>     <chr>         <list>         
#> 1 2022-01-01 Almond        <NULL>         
#> 2 2022-01-01 Apple         <NULL>         
#> 3 2022-01-01 Apricot       <NULL>         
#> 4 2022-02-01 Maple         <chr [2]>      
#> 5 2022-02-01 Mint          <chr [2]>      
#> 6 2022-03-01 Pumpkin       <chr [0]>  

当我试图通过调用 unnest_longer(flavors.removed) 来捕获有关删除的口味的信息时,我最终无意中过滤掉了 2022-03-01 的所有信息,因为 flavors.removed 列表是空的(<chr [0]>) 对于这个 2022-03-01 时间段。

library(dplyr)
library(tidyr)
df %>% 
  group_by(date) %>% 
  summarize(flavors = list(flavor)) %>% 
  mutate(flavors.added = mapply(setdiff, flavors, lag(flavors)),
         flavors.removed = mapply(setdiff, lag(flavors), flavors)) %>% 
  ungroup %>% 
  select(-flavors) %>% 
  unnest_longer(flavors.added) %>% 
  unnest_longer(flavors.removed) %>% 
  pivot_longer(-date, names_to = "type", values_to = "flavor") %>% 
  arrange(date, type) %>% 
  unique()

#> # A tibble: 8 x 3
#>   date       type            flavor 
#>   <date>     <chr>           <chr>  
#> 1 2022-01-01 flavors.added   Almond 
#> 2 2022-01-01 flavors.added   Apple  
#> 3 2022-01-01 flavors.added   Apricot
#> 4 2022-01-01 flavors.removed NA     
#> 5 2022-02-01 flavors.added   Maple  
#> 6 2022-02-01 flavors.added   Mint   
#> 7 2022-02-01 flavors.removed Apple  
#> 8 2022-02-01 flavors.removed Apricot

是否有更好的方法来逐月单独识别添加和删除的口味?我需要重新获得第九行,如下所示,利用我有缺陷的方法将其过滤掉。

#> # A tibble: 9 x 3
#>   date       type            flavor 
#>   <date>     <chr>           <chr>  
#> 1 2022-01-01 flavors.added   Almond 
#> 2 2022-01-01 flavors.added   Apple  
#> 3 2022-01-01 flavors.added   Apricot
#> 4 2022-01-01 flavors.removed NA     
#> 5 2022-02-01 flavors.added   Maple  
#> 6 2022-02-01 flavors.added   Mint   
#> 7 2022-02-01 flavors.removed Apple  
#> 8 2022-02-01 flavors.removed Apricot
#> 9 2022-03-01 flavors.added   Pumpkin

如果您不需要第 4 行的 NA,可能的解决方案:

df %>% 
  group_by(date) %>% 
  summarize(flavors = list(flavor)) %>% 
  mutate(flavors.added = mapply(setdiff, flavors, lag(flavors)),
         flavors.removed = mapply(setdiff, lag(flavors), flavors)) %>% 
  ungroup %>% 
  select(-flavors) %>% 
  pivot_longer(-date, names_to = "type", values_to = "flavor") %>% 
  unnest(flavor)

# A tibble: 8 × 3
  date       type            flavor 
  <date>     <chr>           <chr>  
1 2022-01-01 flavors.added   Almond 
2 2022-01-01 flavors.added   Apple  
3 2022-01-01 flavors.added   Apricot
4 2022-02-01 flavors.added   Maple  
5 2022-02-01 flavors.added   Mint   
6 2022-02-01 flavors.removed Apple  
7 2022-02-01 flavors.removed Apricot
8 2022-03-01 flavors.added   Pumpkin

我发现单独找到添加和删除的风味更直接,然后在需要时将它们绑定在一起。

在这种情况下,您可以使用 tidyr::unchop(keep_empty = TRUE) 来避免删除空行。

library(tidyverse)

df <- tibble(
  date = as.Date(c(
    rep("2022-01-01", 3), 
    rep("2022-02-01", 3),
    rep("2022-03-01", 4)
  )),
  flavor = c(
    "Almond", "Apple", "Apricot", 
    "Almond", "Maple", "Mint",
    "Almond", "Maple", "Mint", "Pumpkin"
  )
)

df
#> # A tibble: 10 × 2
#>    date       flavor 
#>    <date>     <chr>  
#>  1 2022-01-01 Almond 
#>  2 2022-01-01 Apple  
#>  3 2022-01-01 Apricot
#>  4 2022-02-01 Almond 
#>  5 2022-02-01 Maple  
#>  6 2022-02-01 Mint   
#>  7 2022-03-01 Almond 
#>  8 2022-03-01 Maple  
#>  9 2022-03-01 Mint   
#> 10 2022-03-01 Pumpkin

flavors <- df %>% 
  group_by(date) %>% 
  summarize(flavors = list(flavor)) %>% 
  ungroup()

flavors
#> # A tibble: 3 × 2
#>   date       flavors  
#>   <date>     <list>   
#> 1 2022-01-01 <chr [3]>
#> 2 2022-02-01 <chr [3]>
#> 3 2022-03-01 <chr [4]>

# Find added flavors
added <- flavors %>%
  mutate(added = mapply(setdiff, flavors, lag(flavors)), .keep = "unused") %>% 
  unchop(added, keep_empty = TRUE) %>%
  pivot_longer(added, names_to = "type", values_to = "flavor")

# Find removed flavors
removed <- flavors %>%
  mutate(removed = mapply(setdiff, lag(flavors), flavors), .keep = "unused") %>% 
  unchop(removed, keep_empty = TRUE) %>%
  pivot_longer(removed, names_to = "type", values_to = "flavor")

added
#> # A tibble: 6 × 3
#>   date       type  flavor 
#>   <date>     <chr> <chr>  
#> 1 2022-01-01 added Almond 
#> 2 2022-01-01 added Apple  
#> 3 2022-01-01 added Apricot
#> 4 2022-02-01 added Maple  
#> 5 2022-02-01 added Mint   
#> 6 2022-03-01 added Pumpkin
removed
#> # A tibble: 4 × 3
#>   date       type    flavor 
#>   <date>     <chr>   <chr>  
#> 1 2022-01-01 removed <NA>   
#> 2 2022-02-01 removed Apple  
#> 3 2022-02-01 removed Apricot
#> 4 2022-03-01 removed <NA>

bind_rows(added, removed) %>%
  arrange(date, type)
#> # A tibble: 10 × 3
#>    date       type    flavor 
#>    <date>     <chr>   <chr>  
#>  1 2022-01-01 added   Almond 
#>  2 2022-01-01 added   Apple  
#>  3 2022-01-01 added   Apricot
#>  4 2022-01-01 removed <NA>   
#>  5 2022-02-01 added   Maple  
#>  6 2022-02-01 added   Mint   
#>  7 2022-02-01 removed Apple  
#>  8 2022-02-01 removed Apricot
#>  9 2022-03-01 added   Pumpkin
#> 10 2022-03-01 removed <NA>

reprex package (v2.0.1)

创建于 2022-06-02