根据另一列中某个字符串是否存在,按组创建新列

Create new column based on presence/absence of string in other column by group

我有一个关于船只位置的数据集,其中同一个 "id" 可能对应两个级别(levels):它既可能属于某个明确的类别(例如 "fishing"),也可能在其他行中显示为 "unspecified"。我想创建一个新列:当某个 "id" 既出现 "unspecified" 又(在其他行中)出现另一个类别时,把 "unspecified" 替换为该类别。

# Dataset example: vessel positions, one row per observation.
library(dplyr)

# Reported vessel category per observation, listed vessel by vessel.
levels <- c(
  # id 844 (12 rows)
  "passenger", "passenger", "unspecified", "passenger", "passenger",
  "passenger", "passenger", "passenger", "passenger", "passenger",
  "unspecified", "passenger",
  # id 845 (12 rows)
  "fishing", "unspecified", "fishing", "fishing", "fishing", "unspecified",
  "fishing", "fishing", "unspecified", "fishing", "fishing", "fishing",
  # id 825 (3 rows)
  "unspecified", "unspecified", "unspecified"
)

# Vessel identifiers, in the same row order as `levels`.
id <- rep(c("844", "845", "825"), times = c(12L, 12L, 3L))

# Latitude of each observation.
lat <- c(
  -30.6456, -29.5648, -27.6667, -31.5587, -30.6934, -29.3147, -23.0538,
  -26.5877, -26.6923, -23.40865, -23.1143, -23.28331,
  -31.6456, -24.5648, -27.6867, -31.4587, -30.6784, -28.3447, -23.0466,
  -27.5877, -26.8524, -23.8855, -24.1143, -23.5874,
  -23.5259, -22.8788, -22.1324
)

# Longitude of each observation.
long <- c(
  -50.4879, -49.8715, -51.8716, -50.4456, -50.9842, -51.9787, -41.2343,
  -40.2859, -40.19599, -41.64302, -41.58042, -41.55057,
  -50.4576, -48.8715, -51.4566, -51.4456, -50.4477, -50.9937, -41.4789,
  -41.3859, -40.2536, -41.6502, -40.5442, -41.4057,
  -40.4058, -42.4877, -41.4545
)

# `levels` and `id` are stored as factors; coordinates stay numeric.
df <- tibble(
  levels = factor(levels),
  id = factor(id),
  lat = lat,
  long = long
)

这是我的预期输出:

> output %>% print(n = 27)
# A tibble: 27 x 5
   levels      id      lat  long new_colum  
   <fct>       <fct> <dbl> <dbl> <fct>      
 1 passenger   844   -30.6 -50.5 passenger  
 2 passenger   844   -29.6 -49.9 passenger  
 3 unspecified 844   -27.7 -51.9 passenger  
 4 passenger   844   -31.6 -50.4 passenger  
 5 passenger   844   -30.7 -51.0 passenger  
 6 passenger   844   -29.3 -52.0 passenger  
 7 passenger   844   -23.1 -41.2 passenger  
 8 passenger   844   -26.6 -40.3 passenger  
 9 passenger   844   -26.7 -40.2 passenger  
10 passenger   844   -23.4 -41.6 passenger  
11 unspecified 844   -23.1 -41.6 passenger  
12 passenger   844   -23.3 -41.6 passenger  
13 fishing     845   -31.6 -50.5 fishing    
14 unspecified 845   -24.6 -48.9 fishing    
15 fishing     845   -27.7 -51.5 fishing    
16 fishing     845   -31.5 -51.4 fishing    
17 fishing     845   -30.7 -50.4 fishing    
18 unspecified 845   -28.3 -51.0 fishing    
19 fishing     845   -23.0 -41.5 fishing    
20 fishing     845   -27.6 -41.4 fishing    
21 unspecified 845   -26.9 -40.3 fishing    
22 fishing     845   -23.9 -41.7 fishing    
23 fishing     845   -24.1 -40.5 fishing    
24 fishing     845   -23.6 -41.4 fishing    
25 unspecified 825   -23.5 -40.4 unspecified
26 unspecified 825   -22.9 -42.5 unspecified
27 unspecified 825   -22.1 -41.5 unspecified

该规则针对同一个 id 既为 unspecified 又属于其他级别的情况:如果该 id 只有 unspecified,则保持不变;如果它同时还属于其他级别,则将 unspecified 替换为该级别。

我认为这是最简单、最直观的解决方案:

case_when 语句对每个组进行检查:如果该组所有 levels 都是 "unspecified",则 new_column 为 "unspecified";否则,如果其中包含 "fishing" 或 "passenger",则 new_column 取对应的那个类别。

library(dplyr)
df %>%
  group_by(id) %>%
  mutate(
    # Every condition is a single per-id value, so all rows of an id receive
    # the same label; the first branch whose condition holds is used.
    new_column = case_when(
      !any(levels != "unspecified") ~ "unspecified",
      any(levels == "fishing") ~ "fishing",
      any(levels == "passenger") ~ "passenger"
    )
  )
# A tibble: 27 × 5
# Groups:   id [3]
   levels      id      lat  long new_column 
   <fct>       <fct> <dbl> <dbl> <chr>      
 1 passenger   844   -30.6 -50.5 passenger  
 2 passenger   844   -29.6 -49.9 passenger  
 3 unspecified 844   -27.7 -51.9 passenger  
 4 passenger   844   -31.6 -50.4 passenger  
 5 passenger   844   -30.7 -51.0 passenger  
 6 passenger   844   -29.3 -52.0 passenger  
 7 passenger   844   -23.1 -41.2 passenger  
 8 passenger   844   -26.6 -40.3 passenger  
 9 passenger   844   -26.7 -40.2 passenger  
10 passenger   844   -23.4 -41.6 passenger  
11 unspecified 844   -23.1 -41.6 passenger  
12 passenger   844   -23.3 -41.6 passenger  
13 fishing     845   -31.6 -50.5 fishing    
14 unspecified 845   -24.6 -48.9 fishing    
15 fishing     845   -27.7 -51.5 fishing    
16 fishing     845   -31.5 -51.4 fishing    
17 fishing     845   -30.7 -50.4 fishing    
18 unspecified 845   -28.3 -51.0 fishing    
19 fishing     845   -23.0 -41.5 fishing    
20 fishing     845   -27.6 -41.4 fishing    
21 unspecified 845   -26.9 -40.3 fishing    
22 fishing     845   -23.9 -41.7 fishing    
23 fishing     845   -24.1 -40.5 fishing    
24 fishing     845   -23.6 -41.4 fishing    
25 unspecified 825   -23.5 -40.4 unspecified
26 unspecified 825   -22.9 -42.5 unspecified
27 unspecified 825   -22.1 -41.5 unspecified

您可以使用 ifelse 语句:当组内只有一种 levels 并且该 levels 等于 "unspecified" 时,保持这些记录不变;否则,将 "unspecified" 替换为组内另一个不是 "unspecified" 的 levels。

library(dplyr)

df %>%
  group_by(id) %>%
  mutate(
    # Only rows that are "unspecified" AND belong to an id with at least one
    # specified category are rewritten; they take the first specified
    # category of that id. All other rows keep their own value.
    # `yes` is a single value and `no` has one entry per row, so nothing
    # depends on ifelse() recycling a shorter vector by position.
    new_column = ifelse(
      levels == "unspecified" & any(levels != "unspecified"),
      as.character(levels)[levels != "unspecified"][1],
      as.character(levels)
    )
  )

# A tibble: 27 × 5
# Groups:   id [3]
   levels      id      lat  long new_column 
   <fct>       <fct> <dbl> <dbl> <chr>      
 1 passenger   844   -30.6 -50.5 passenger  
 2 passenger   844   -29.6 -49.9 passenger  
 3 unspecified 844   -27.7 -51.9 passenger  
 4 passenger   844   -31.6 -50.4 passenger  
 5 passenger   844   -30.7 -51.0 passenger  
 6 passenger   844   -29.3 -52.0 passenger  
 7 passenger   844   -23.1 -41.2 passenger  
 8 passenger   844   -26.6 -40.3 passenger  
 9 passenger   844   -26.7 -40.2 passenger  
10 passenger   844   -23.4 -41.6 passenger  
11 unspecified 844   -23.1 -41.6 passenger  
12 passenger   844   -23.3 -41.6 passenger  
13 fishing     845   -31.6 -50.5 fishing    
14 unspecified 845   -24.6 -48.9 fishing    
15 fishing     845   -27.7 -51.5 fishing    
16 fishing     845   -31.5 -51.4 fishing    
17 fishing     845   -30.7 -50.4 fishing    
18 unspecified 845   -28.3 -51.0 fishing    
19 fishing     845   -23.0 -41.5 fishing    
20 fishing     845   -27.6 -41.4 fishing    
21 unspecified 845   -26.9 -40.3 fishing    
22 fishing     845   -23.9 -41.7 fishing    
23 fishing     845   -24.1 -40.5 fishing    
24 fishing     845   -23.6 -41.4 fishing    
25 unspecified 825   -23.5 -40.4 unspecified
26 unspecified 825   -22.9 -42.5 unspecified
27 unspecified 825   -22.1 -41.5 unspecified 

可以先将 "unspecified" 替换成 NA,再按 id 分组,用同一组内的已知值填充这些 NA(fill 默认取前面的值),最后把剩余的 NA 换回 "unspecified"。

library(tidyverse)

df %>%
  # Treat "unspecified" as missing so it can be filled from known rows.
  mutate(new_colum = na_if(levels, "unspecified")) %>%
  group_by(id) %>%
  # "downup" fills from the previous known row first and then from the next
  # one, so an id whose first rows are "unspecified" still gets its category
  # (the default "down" would leave those leading rows unfilled).
  fill(new_colum, .direction = "downup") %>%
  ungroup() %>%
  # ids with no known category at all are still NA here; restore the label.
  replace_na(list(new_colum = "unspecified"))
# A tibble: 27 × 5
   levels      id      lat  long new_colum  
   <fct>       <fct> <dbl> <dbl> <fct>      
 1 passenger   844   -30.6 -50.5 passenger  
 2 passenger   844   -29.6 -49.9 passenger  
 3 unspecified 844   -27.7 -51.9 passenger  
 4 passenger   844   -31.6 -50.4 passenger  
 5 passenger   844   -30.7 -51.0 passenger  
 6 passenger   844   -29.3 -52.0 passenger  
 7 passenger   844   -23.1 -41.2 passenger  
 8 passenger   844   -26.6 -40.3 passenger  
 9 passenger   844   -26.7 -40.2 passenger  
10 passenger   844   -23.4 -41.6 passenger  
11 unspecified 844   -23.1 -41.6 passenger  
12 passenger   844   -23.3 -41.6 passenger  
13 fishing     845   -31.6 -50.5 fishing    
14 unspecified 845   -24.6 -48.9 fishing    
15 fishing     845   -27.7 -51.5 fishing    
16 fishing     845   -31.5 -51.4 fishing    
17 fishing     845   -30.7 -50.4 fishing    
18 unspecified 845   -28.3 -51.0 fishing    
19 fishing     845   -23.0 -41.5 fishing    
20 fishing     845   -27.6 -41.4 fishing    
21 unspecified 845   -26.9 -40.3 fishing    
22 fishing     845   -23.9 -41.7 fishing    
23 fishing     845   -24.1 -40.5 fishing    
24 fishing     845   -23.6 -41.4 fishing    
25 unspecified 825   -23.5 -40.4 unspecified
26 unspecified 825   -22.9 -42.5 unspecified
27 unspecified 825   -22.1 -41.5 unspecified