将从文本列中提取出的列表拆分为多个数值列

Separate list extracted from text column into multiple numeric columns

我有一个文本列,我想将其数字分成几列:

# Example data: a single character column `text`; every entry holds six
# KEY=value pairs (DH, SZ, RM, FO, GP, BC), where a value is either a digit
# or the literal text "NA".
df <- data.frame(
  text = c(
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=1, SZ=0, RM=0, FO=NA, GP=0, BC=0))",
    "((i: DH=1, SZ=0, RM=NA, FO=0, GP=0, BC=0))",
    "((i: DH=1, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=1, FO=0, GP=0, BC=1))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=1, GP=0, BC=0))",
    "((i: DH=0, SZ=1, RM=0, FO=0, GP=0, BC=1))",
    "((i: DH=0, SZ=NA, RM=0, FO=0, GP=0, BC=0))",
    "((i: DH=0, SZ=0, RM=0, FO=0, GP=0, BC=0))"
  ),
  stringsAsFactors = FALSE
)

我可以用 str_extract_all 提取出这些数字,但无法用 tidyr 的 separate 函数把它们拆分成数值列:
library(stringr)
library(tidyr)
library(dplyr)
# The question's attempt, unchanged apart from the regex escape. It still
# yields the malformed columns shown in the output below: str_extract_all()
# returns a list column, so separate() is not handed plain strings to split.
df %>%
  # The backslash must be doubled ("\\d") inside an R string literal;
  # a bare "\d" is an unrecognized escape and the script fails to parse.
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)")) %>%
  separate(col = text,
           into = c("DH", "SZ", "RM", "FO", "GP", "BC"),
           sep = ", ",
           convert = TRUE)
      DH   SZ   RM   FO  GP   BC
1  c("0"  "0"  "0"  "0" "0" "0")
2  c("0"  "0"  "0"  "0" "0" "0")
3  c("1"  "0"  "0" "NA" "0" "0")
4  c("1"  "0" "NA"  "0" "0" "0")
5  c("1"  "1"  "0"  "0" "0" "0")
6  c("0"  "1"  "0"  "0" "0" "0")
7  c("0"  "0"  "1"  "0" "0" "1")
8  c("0"  "1"  "0"  "0" "0" "0")
9  c("0"  "1"  "0"  "0" "0" "0")
10 c("0"  "1"  "0"  "1" "0" "0")
11 c("0"  "1"  "0"  "0" "0" "1")
12 c("0" "NA"  "0"  "0" "0" "0")
13 c("0"  "0"  "0"  "0" "0" "0")

我的直觉是,这是因为 str_extract_all 返回的是列表,而 separate 无法处理列表列。这里的解决方案是什么?

这正是 simplify = TRUE 参数的用途:它会把输出强制转换为字符矩阵,而不是列表。

# simplify = TRUE makes str_extract_all() return a character matrix (one row
# per input string) instead of a list, so `text` becomes a matrix column.
# Note the doubled backslash in "\\d": a bare "\d" is not a valid escape in
# an R string literal. TRUE is spelled out because T can be reassigned.
df %>%
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)", simplify = TRUE))
   text.1 text.2 text.3 text.4 text.5 text.6
1       0      0      0      0      0      0
2       0      0      0      0      0      0
3       1      0      0     NA      0      0
4       1      0     NA      0      0      0
5       1      1      0      0      0      0
6       0      1      0      0      0      0
7       0      0      1      0      0      1
8       0      1      0      0      0      0
9       0      1      0      0      0      0
10      0      1      0      1      0      0
11      0      1      0      0      0      1
12      0     NA      0      0      0      0
13      0      0      0      0      0      0

您可以使用 unnest_wider 代替 separate,然后再用一些 tidyverse 函数做类型转换和重命名。注意:您会收到“NAs introduced by coercion”(强制转换引入了 NA)的警告,因为字符串 "NA" 被 as.numeric 转换成了缺失值,但这在您的情况下正是预期的结果。

# Spread the extracted values into one column per match, convert them to
# numeric, then give the columns their real names.
df %>%
  # "\\d": the backslash must be doubled inside an R string literal.
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)")) %>%
  # The extracted vectors are unnamed, so names_sep is supplied to obtain
  # predictable column names (text_1 ... text_6) instead of relying on
  # automatic name repair.
  unnest_wider(text, names_sep = "_") %>%
  # as.numeric() turns the string "NA" into a missing value and warns about
  # coercion; that warning is expected here.
  mutate(across(starts_with("text_"), as.numeric)) %>%
  # The lambda ignores the old names and returns the six target names, so it
  # only works when exactly six columns are selected.
  rename_with(.fn = ~ c("DH", "SZ", "RM", "FO", "GP", "BC"),
              .cols = starts_with("text_"))

给出:

# A tibble: 13 x 6
      DH    SZ    RM    FO    GP    BC
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     0     0     0     0     0     0
 2     0     0     0     0     0     0
 3     1     0     0    NA     0     0
 4     1     0    NA     0     0     0
 5     1     1     0     0     0     0
 6     0     1     0     0     0     0
 7     0     0     1     0     0     1
 8     0     1     0     0     0     0
 9     0     1     0     0     0     0
10     0     1     0     1     0     0
11     0     1     0     0     0     1
12     0    NA     0     0     0     0
13     0     0     0     0     0     0

另一种方法(针对列的重命名)是直接给列表中的每个向量设置 names 属性,这样就省去了事后重命名的麻烦:

library(tidyverse)
# Name each extracted vector before widening, so unnest_wider() creates the
# final column names directly and no rename step is needed.
df %>%
  # "\\d": the backslash must be doubled inside an R string literal.
  mutate(text = str_extract_all(text, "(?<==)\\d|NA(?=,)"),
         # Assumes every row yields exactly six matches, as in the example data.
         text = map(text, setNames, c("DH", "SZ", "RM", "FO", "GP", "BC"))) %>%
  unnest_wider(text) %>%
  # as.numeric() turns the string "NA" into a missing value (with a coercion
  # warning, which is expected here).
  mutate(across(c("DH", "SZ", "RM", "FO", "GP", "BC"), as.numeric))