将字符串列转换为特定 data.frame

Transforming string column to specific data.frame

期望输出

需要以下输出

# Target result: five columns, one row per record. v4 and v5 are character
# columns because they mix numeric values with the "Absent" marker.
df2 <- data.frame(
  v1 = c(1100001, 1100002, 1100003, 1100004, 1100005),
  v2 = c("A R", "W R", "A K", "M", "A C"),
  v3 = c("P", "G P", "G P", "P", "P"),
  v4 = c("110", "161", "129", "132", "Absent"),
  v5 = c("55", "80.5", "64.5", "66", "Absent")
)
df2

       v1  v2  v3     v4     v5
1 1100001 A R   P    110     55
2 1100002 W R G P    161   80.5
3 1100003 A K G P    129   64.5
4 1100004   M   P    132     66
5 1100005 A C   P Absent Absent

这是我的原始 data.frame

# Raw input: a single character column whose fields are separated by
# irregular runs of spaces.
df1 <- data.frame(
  value = c(
    "1100001     A R                P             110    55",
    "1100002     W R                 G P 161    80.5",
    "1100003     A K                  G P 129    64.5",
    "1100004     M                      P             132    66",
    "1100005     A C                     P             Absent    Absent"
  ),
  stringsAsFactors = FALSE
)

df1

                                                              value
1            1100001     A R                P             110    55
2                   1100002     W R                 G P 161    80.5
3                  1100003     A K                  G P 129    64.5
4        1100004     M                      P             132    66
5 1100005     A C                     P             Absent    Absent

我使用了 splitstackshape 包中的 cSplit 函数,但无法得到所需的输出。如有任何提示,不胜感激。

library(splitstackshape)
# Split "value" on every single space. This also breaks apart multi-word
# fields such as "A R", which is why the resulting columns are misaligned.
df1 |>
  cSplit(splitCols = "value", sep = " ")

   value_1 value_2 value_3 value_4 value_5 value_6 value_7
1: 1100001       A       R       P     110      55      NA
2: 1100002       W       R       G       P     161    80.5
3: 1100003       A       K       G       P     129    64.5
4: 1100004       M       P     132      66      NA      NA
5: 1100005       A       P  Absent  Absent      NA      NA

我们假设新字段在以下两种情况之后开始:两个或多个空格之后,或者单个空格之后紧跟一个数字(该数字即下一个字段的开头)。先用逗号替换这些分隔符,然后使用 read.table 并指定 sep = "," 读取。

# Turn every field separator into a comma, then parse the result as CSV.
# A separator is either a run of two or more spaces, or a single space that
# is immediately followed by a digit (the digit starts the next field).
# The backslash must be doubled: "\d" is not a valid escape in an R string.
df1$value |>
  gsub(pattern = "  +| (?=\\d)", replacement = ",", perl = TRUE) |>
  # read.table(text = ) opens and closes its own text connection, so no
  # connection is left open behind.
  (\(csv_lines) read.table(text = csv_lines, sep = ","))()

得到如下 data.frame:

       V1  V2  V3     V4     V5
1 1100001 A R   P    110     55
2 1100002 W R G P    161   80.5
3 1100003 A K G P    129   64.5
4 1100004   M   P    132     66
5 1100005 A C   P Absent Absent

会话日志

> df1 <-
+   structure(list(value = c(
+ "1100001     A R                P             110    55", 
+ "1100002     W R                 G P 161    80.5", 
+ "1100003     A K                  G P 129    64.5", 
+ "1100004     M                      P             132    66",
+ "1100005     A C                     P             Absent    Absent"
+ )), row.names = c(NA, -5L), class = c("data.frame")
+ )
> 
> df2 <-
+   data.frame(
+     v1 = c(1100001, 1100002, 1100003, 1100004, 1100005)
+   , v2 = c("A R", "W R", "A K", "M", "A C")
+   , v3 = c("P", "G P", "G P", "P", "P")
+   , v4 = c(110, 161, 129, 132, "Absent")
+   , v5 = c(55, 80.5, 64.5, 66,  "Absent")
+     )
> 
> df2a <- df1$value |>
+   gsub(pattern = "  +| (?=\\d)", replacement = ",", perl = TRUE) |>
+   textConnection(name = "") |>
+   read.table(sep = ",")
> 
> all(df2 == df2a)
[1] TRUE

1. 假设分隔规则是:a) 多于一个空格;b) 字母后跟单个空格再跟数字时,在该空格处拆分

2. 我们确保所有将成为变量的字段都由多个空格分隔(使用 gsub 将字母和数字之间的单个空格替换为两个空格)

3. 然后我们使用 tidyr::separate,以两个或多个空格作为分隔符,将字符串拆分为多个变量

library(dplyr)
library(tidyr)
# Step 1: wherever a letter is followed by a single space and then a digit,
#         widen that space to two spaces so every field boundary is 2+ spaces.
#         Back-references must be written "\\1"/"\\2"; a bare "\1" is an octal
#         escape and would insert a control character instead.
#         [A-Za-z] is used because [A-z] also matches [ \ ] ^ _ and backtick.
# Step 2: split on runs of two or more spaces into columns v1..v5.
df1 %>%
  mutate(value = gsub("([A-Za-z]) ([0-9])", "\\1  \\2", value)) %>%
  separate(value, into = paste0("v", 1:5), sep = "[ ]{2,}")

Returns:

       v1  v2  v3     v4     v5
1 1100001 A R   P    110     55
2 1100002 W R G P    161   80.5
3 1100003 A K G P    129   64.5
4 1100004   M   P    132     66
5 1100005 A C   P Absent Absent

编辑

关于原始示例中未包含的新约束(数字和字母之间只有一个space):

建议的解决方案:

我们只需用“反向”的正则表达式重复一次添加额外空格的命令。这样,数字和字母之间的任何单个空格都会多出一个空格,随后就会被 separate 调用拆分。

# Widen single-space field boundaries in both directions (letter -> digit and
# digit -> letter) to two spaces, then split on runs of two or more spaces
# into columns v1..v5.
# Back-references must be written "\\1"/"\\2"; a bare "\1" is an octal escape
# and would insert a control character instead. [A-Za-z] is used because
# [A-z] also matches [ \ ] ^ _ and backtick.
df1 %>%
  mutate(value = gsub("([A-Za-z]) ([0-9])", "\\1  \\2", value)) %>%
  mutate(value = gsub("([0-9]) ([A-Za-z])", "\\1  \\2", value)) %>%
  separate(value, into = paste0("v", 1:5), sep = "[ ]{2,}")