提取r中分号之间的字符
Extract characters between semicolons in r
正在尝试提取分号之间的数据并将该数据放入新列中。
这是一些数据
# Sample input: four semicolon-delimited strings plus a running row number.
df <- data.frame(
  data = c("a;;c;d", "a;b;;d", "a;;;d", "a;b;;;"),
  num = 1:4
)
这是我到目前为止从 S.O 中收集到的内容。
# Pull individual fields out of the semicolon-delimited `data` column with
# str_extract() (stringr), which returns only the first match of each pattern.
res <- df %>%
mutate(
# colA: leading run of non-";" characters that is followed by a ";".
colA = str_extract(data, "^[^;]*(?=;)"),
# colB: first (possibly empty) run of non-";" characters sitting between two ";".
colB = str_extract(data, "(?<=;)[^;]*(?=;)"),
# colC: the two stacked lookbehinds are zero-width and test the same position,
# so this pattern is equivalent to colB's and yields the same first match.
colC = str_extract(data, "(?<=;)(?<=;)[^;]*(?=;)"),
# colD: run of non-";" characters between the last ";" and the end of the string.
colD = str_extract(data, "(?<=;)[^;]*$")
)
它几乎可以满足我的要求,但 colC 的结果与 colB 相同。我不太了解正则表达式,因此如果能给出解决方案并加以解释,我将不胜感激。
基础 R
# Read every string in df$data as one line of semicolon-separated text
# (no header row), then append the parsed fields to df as extra columns.
fields <- read.csv2(text = df$data, header = FALSE)
cbind(df, fields)
#     data num V1 V2 V3 V4 V5
# 1 a;;c;d   1  a     c  d NA
# 2 a;b;;d   2  a  b     d NA
# 3  a;;;d   3  a        d NA
# 4 a;b;;;   4  a  b       NA
dplyr
library(dplyr)
# Parse the `data` column as semicolon-separated text and let mutate()
# splice the columns of the resulting data.frame into df.
mutate(df, read.csv2(text = data, header = FALSE))
#     data num V1 V2 V3 V4 V5
# 1 a;;c;d   1  a     c  d NA
# 2 a;b;;d   2  a  b     d NA
# 3  a;;;d   3  a        d NA
# 4 a;b;;;   4  a  b       NA
之所以不需要显式赋值也能生效,是因为 mutate(以及 summarize)可以直接接受一个命名的 list(data.frame 正是其中一种特殊且兼容的情况)。
除了 r2evans 给出的基础 R 和 dplyr 解法之外,还可以使用:
data.table
library(data.table)
# Rebuild the sample as a data.table, then split `data` on ";" and assign the
# pieces by reference to four new columns.
split_cols <- c("ColA", "ColB", "ColC", "ColD")
df <- data.table(
  data = c("a;;c;d", "a;b;;d", "a;;;d", "a;b;;;"),
  num = 1:4
)
df[, (split_cols) := tstrsplit(data, ";")]
df
     data num ColA ColB ColC ColD
1: a;;c;d   1    a         c    d
2: a;b;;d   2    a    b         d
3:  a;;;d   3    a              d
4: a;b;;;   4    a    b
另一个解决方案,使用tidyr::separate
:
library(tidyverse)
# Rebuild the sample data, then split `data` on ";" into cola..cold;
# extra = "drop" discards any pieces beyond the fourth.
df <- data.frame(
  data = c("a;;c;d", "a;b;;d", "a;;;d", "a;b;;;"),
  num = 1:4
)
new_names <- str_c("col", letters[1:4])
separate(df, data, into = new_names, sep = ";", extra = "drop")
#>   cola colb colc cold num
#> 1    a         c    d   1
#> 2    a    b         d   2
#> 3    a              d   3
#> 4    a    b             4
正在尝试提取分号之间的数据并将该数据放入新列中。
这是一些数据
# Sample input: four semicolon-delimited strings plus a running row number.
df <- data.frame(
  data = c("a;;c;d", "a;b;;d", "a;;;d", "a;b;;;"),
  num = 1:4
)
这是我到目前为止从 S.O 中收集到的内容。
# Pull individual fields out of the semicolon-delimited `data` column with
# str_extract() (stringr), which returns only the first match of each pattern.
res <- df %>%
mutate(
# colA: leading run of non-";" characters that is followed by a ";".
colA = str_extract(data, "^[^;]*(?=;)"),
# colB: first (possibly empty) run of non-";" characters sitting between two ";".
colB = str_extract(data, "(?<=;)[^;]*(?=;)"),
# colC: the two stacked lookbehinds are zero-width and test the same position,
# so this pattern is equivalent to colB's and yields the same first match.
colC = str_extract(data, "(?<=;)(?<=;)[^;]*(?=;)"),
# colD: run of non-";" characters between the last ";" and the end of the string.
colD = str_extract(data, "(?<=;)[^;]*$")
)
它几乎可以满足我的要求,但 colC 的结果与 colB 相同。我不太了解正则表达式,因此如果能给出解决方案并加以解释,我将不胜感激。
基础 R
# Read every string in df$data as one line of semicolon-separated text
# (no header row), then append the parsed fields to df as extra columns.
fields <- read.csv2(text = df$data, header = FALSE)
cbind(df, fields)
#     data num V1 V2 V3 V4 V5
# 1 a;;c;d   1  a     c  d NA
# 2 a;b;;d   2  a  b     d NA
# 3  a;;;d   3  a        d NA
# 4 a;b;;;   4  a  b       NA
dplyr
library(dplyr)
# Parse the `data` column as semicolon-separated text and let mutate()
# splice the columns of the resulting data.frame into df.
mutate(df, read.csv2(text = data, header = FALSE))
#     data num V1 V2 V3 V4 V5
# 1 a;;c;d   1  a     c  d NA
# 2 a;b;;d   2  a  b     d NA
# 3  a;;;d   3  a        d NA
# 4 a;b;;;   4  a  b       NA
之所以不需要显式赋值也能生效,是因为 mutate(以及 summarize)可以直接接受一个命名的 list(data.frame 正是其中一种特殊且兼容的情况)。
除了 r2evans 给出的基础 R 和 dplyr 解法之外,还可以使用:
data.table
library(data.table)
# Rebuild the sample as a data.table, then split `data` on ";" and assign the
# pieces by reference to four new columns.
split_cols <- c("ColA", "ColB", "ColC", "ColD")
df <- data.table(
  data = c("a;;c;d", "a;b;;d", "a;;;d", "a;b;;;"),
  num = 1:4
)
df[, (split_cols) := tstrsplit(data, ";")]
df
     data num ColA ColB ColC ColD
1: a;;c;d   1    a         c    d
2: a;b;;d   2    a    b         d
3:  a;;;d   3    a              d
4: a;b;;;   4    a    b
另一个解决方案,使用tidyr::separate
:
library(tidyverse)
# Rebuild the sample data, then split `data` on ";" into cola..cold;
# extra = "drop" discards any pieces beyond the fourth.
df <- data.frame(
  data = c("a;;c;d", "a;b;;d", "a;;;d", "a;b;;;"),
  num = 1:4
)
new_names <- str_c("col", letters[1:4])
separate(df, data, into = new_names, sep = ";", extra = "drop")
#>   cola colb colc cold num
#> 1    a         c    d   1
#> 2    a    b         d   2
#> 3    a              d   3
#> 4    a    b             4