创建从多个选定列中提取和标记检测到的字符串模式的列

Create columns that extract and flag a detected string pattern from multiple selected columns

我有下面的数据框,其中每个观测值都有一串不同对象的键值对 (k:v),各个对象之间用 `\n` 分隔:

# Example data: one row per observation. `summary` holds newline-separated
# "key: value" pairs, one pair per object.
df <- tibble(
  col = c("a", "b", "c", "d"),
  summary = c(
    "Oranges: 1\nApples: 1\nPeaches: 6\nBananas: 2",
    "Apples: 2\nBananas: 1",
    "Oranges: 4\nPeaches: 2\nBananas: 5",
    "Peaches: 2"
  )
)

我将这些值分开如下:

# Split `summary` into up to four "key: value" columns. Rows with fewer than
# four pairs are padded with NA on the right; stating `fill = "right"`
# explicitly avoids the "Expected 4 pieces" warning that the default emits.
df_sep <- df %>%
  separate(
    summary,
    into = c("col1", "col2", "col3", "col4"),
    sep = "\n",
    fill = "right"
  )

我想使用 dplyr 获得以下输出,如果适用,可以使用 `across` 和 `any`:

# A tibble: 4 x 10
  col   summary                                       Apples_flag Bananas_flag oranges_flag Peaches_flag_value Apples_value Bananas_value oranges_value Peaches_value
  <chr> <chr>                                               <dbl>        <dbl>        <dbl>              <dbl>        <dbl>         <dbl>         <dbl>         <dbl>
1 b     Apples: 2, Bananas: 1                                   1            1            0                  0            2             1             0             0
2 a     Oranges: 1, Apples: 1, Peaches: 6, Bananas: 2           1            1            1                  1            1             2             1             6
3 c     Oranges: 4, Peaches: 2, Bananas: 5                      0            1            1                  1            0             5             4             2
4 d     Peaches: 2                                              0            0            0                  1            0             0             0             2

这是 tidyverse 中的一种可能性,它依赖于旋转数据:

df %>%
  # Split the newline-separated "key: value" pairs into up to four columns,
  # padding short rows with NA on the right.
  separate(
    summary,
    c("col1", "col2", "col3", "col4"),
    sep = "\n",
    fill = "right"
  ) %>%
  # Reshape to long form: one row per (col, pair); drop the NA padding.
  gather(dummy, value, -col) %>%
  filter(!is.na(value)) %>%
  group_by(col) %>%
  transmute(
    col,
    # Rebuild the per-observation summary as a comma-separated string.
    summary = paste(value, collapse = ", "),
    # Strip the ": <digits>" part to get the fruit name. The backslash must be
    # doubled: "\d" is not a valid escape in an R string literal.
    fruit = gsub(": \\d+", "", value),
    n = parse_number(value)
  ) %>%
  arrange(fruit) %>%
  pivot_wider(names_from = fruit, values_from = n, values_fill = 0) %>%
  # `col` is still a grouping column here, so across() leaves it out.
  # sign() maps a positive count to 1 and 0 to 0, stored as `<fruit>_flag`.
  mutate(across(-summary, list(flag = sign)))


# A tibble: 4 x 10
# Groups:   col [4]
  col   summary                                       Apples Bananas oranges Peaches Apples_flag Bananas_flag oranges_flag Peaches_flag
  <chr> <chr>                                          <dbl>   <dbl>   <dbl>   <dbl>       <dbl>        <dbl>        <dbl>        <dbl>
1 b     Apples: 2, Bananas: 1                              2       1       0       0           1            1            0            0
2 a     oranges: 1, Apples: 1, Peaches: 6, Bananas: 2      1       2       1       6           1            1            1            1
3 c     oranges: 4, Peaches: 2, Bananas: 5                 0       5       4       2           0            1            1            1
4 d     Peaches: 2                                         0       0       0       2           0            0            0            1

下面是 tidyverse 中另一个可能的解决方案,利用 `c_across`、`any`、`str_detect` 和 `str_match`。

标记部分:对 `matches()` 选中的各列应用 `str_detect`,只要其中任意一列(`any`)匹配,就用一元 `+` 把逻辑值转换为 1,并写入相应的标志列。

它通过使用 str_match 应用正则表达式来提取值,并将其添加到适当的列中。

df %>%
  # Work on a copy so the original `summary` column survives separate().
  mutate(summary_clone = summary) %>%
  separate(
    summary_clone,
    c("col1", "col2", "col3", "col4"),
    sep = "\n",
    fill = "right"
  ) %>%
  # c_across() and the `[2]` indexing below operate on one row at a time.
  rowwise() %>%
  # Flagging: 1 if any of col1..col4 mentions the fruit. The patterns are
  # case-sensitive, so they must use the same capitalisation as the data
  # ("Oranges", not "oranges").
  mutate(
    Apples_flag = +any(str_detect(c_across(matches("col[1-4]")), "Apples")),
    Oranges_flag = +any(str_detect(c_across(matches("col[1-4]")), "Oranges")),
    Bananas_flag = +any(str_detect(c_across(matches("col[1-4]")), "Bananas")),
    Peaches_flag = +any(str_detect(c_across(matches("col[1-4]")), "Peaches"))
  ) %>%
  mutate(summary = str_replace_all(summary, "\n", " ")) %>%
  # Parse matched values: element 2 of the str_match() result is the first
  # capture group. Backslashes are doubled because "\d" is not a valid escape
  # in an R string literal.
  mutate(
    Apples_value = str_match(summary, "Apples: (\\d+)")[2],
    Oranges_value = str_match(summary, "Oranges: (\\d+)")[2],
    Bananas_value = str_match(summary, "Bananas: (\\d+)")[2],
    Peaches_value = str_match(summary, "Peaches: (\\d+)")[2]
  ) %>%
  select(-matches("col[1-4]")) %>%
  # Convert the flag and value columns to numeric, using 0 where nothing
  # matched. Selecting by name avoids depending on column positions.
  mutate(across(matches("_(flag|value)$"), .fns = ~ replace_na(as.numeric(.), 0)))

# A tibble: 4 x 10
# Rowwise: 
 col   summary                                    Apples_flag Oranges_flag Bananas_flag Peaches_flag Apples_value Oranges_value Bananas_value Peaches_value
 <chr> <chr>                                            <dbl>        <dbl>        <dbl>        <dbl>        <dbl>         <dbl>         <dbl>         <dbl>
1 a     Oranges: 1 Apples: 1 Peaches: 6 Bananas: 2           1            1            1            1            1             0             2             6
2 b     Apples: 2 Bananas: 1                                 1            0            1            0            2             0             1             0
3 c     Oranges: 4 Peaches: 2 Bananas: 5                     0            1            1            1            0             0             5             2
4 d     Peaches: 2                                           0            0            0            0            0             0             0             0 

另一种方式是:

# Build one `<fruit>_value` and one `<fruit>_flag` column per fruit, then
# attach them to the original data by `col`.
df %>%
  left_join(
    df %>%
      # One row per "key: value" pair.
      separate_rows(summary, sep = "\n") %>%
      # The default separator splits on non-alphanumeric characters;
      # convert = TRUE turns the value into a number.
      separate(summary, c("name", "value"), convert = TRUE) %>%
      # id_cols is named explicitly: passing it by position is deprecated.
      pivot_wider(
        id_cols = col,
        names_from = name,
        values_from = value,
        names_glue = "{name}_value",
        values_fill = 0
      ) %>%
      # Drop the "_value" suffix before appending "_flag", so the columns are
      # named `Apples_flag` rather than `Apples_value_flag`.
      mutate(
        across(-col, sign, .names = "{sub('_value$', '', .col)}_flag")
      ),
    by = "col"
  )