用于提取和拆分的 stringr
stringr for extract and split
我有一堆看起来像这样的行:
# Sample data: one row per person holding the name, a "Place (City, ST)"
# string, and the gender. Filled row by row into a 2 x 3 character matrix.
people <- matrix(
  c(
    "Joe Smith", "Highland (Baltimore, MD)", "Male",
    "Jane Davis", "Trinity (Albany, NY)", "Female"
  ),
  ncol = 3,
  byrow = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)
我正在使用的正则表达式模式是:
# Regex for the "City, ST" part: a run of word characters, a comma, one
# whitespace character, then any two characters. Backslashes are doubled
# because a single backslash starts an escape sequence inside an R string
# literal ("\w" and "\s" are not valid R escapes and fail to parse).
cut <- "\\w*,\\s.."
该正则表达式模式基本上将第二列缩减为仅包含 "Baltimore, MD" 和 "Albany, NY",也就是括号内的所有内容。
然后我想使用 str_split 将城市和州放入两个单独的列中,因此最终输出将如下所示:
[,1] [,2] [,3] [,4]
[1,] "Joe Smith" "Highland (Baltimore, MD)" "Male"
[2,] "Jane Davis" "Trinity (Albany, NY)" "Female"
1 2 3 4
1 Joe Smith Baltimore MD Male
2 Jane Davis Albany NY Female
我就是想不通。
library(tidyverse)
# Keep only the text inside the parentheses of V2 ("City, ST"), then split it
# into City and State. The separator also consumes whitespace after the comma
# so State does not start with a space. Regex backslashes are doubled because
# "\(" and "\1" are not valid escapes in an R string literal.
people %>%
  as.data.frame() %>%
  mutate(V2 = sub(".*\\((.*)\\).*", "\\1", V2)) %>%
  separate(V2, into = c("City", "State"), sep = ",\\s*")
V1 City State V3
1 Joe Smith Baltimore MD Male
2 Jane Davis Albany NY Female
# Sample data: name, "Place (City, ST)", gender; converted to a data frame
# whose columns are auto-named X1, X2, X3.
people <- matrix(
  c(
    "Joe Smith", "Highland (Baltimore, MD)", "Male",
    "Jane Davis", "Trinity (Albany, NY)", "Female"
  ),
  ncol = 3,
  byrow = TRUE
)
people <- data.frame(people)

# Split X2 at the first space; the second piece is "(City, ST)".
# NOTE: this assumes the place name before the parenthesis is a single word.
in_parens <- stringr::str_split_fixed(people$X2, " ", n = 2)[, 2]

# Remove both parentheses, leaving "City, ST".
city_state <- gsub("[()]", "", in_parens)

# Split at the comma and trim, so the state is "MD" rather than " MD".
parts <- trimws(stringr::str_split_fixed(city_state, ",", n = 2))

# Append city/state to the original columns, then drop the raw X2 column.
res <- data.frame(people, parts)
names(res) <- c("name1", "name2", "name3", "name4", "name5")
res$name2 <- NULL
res
我们可以用 base R
# Strip the leading "Place (" and the trailing ")" so each element becomes
# "City, ST". Backslashes are doubled because "\S", "\s" and "\(" are not
# valid escapes in an R string literal.
city_state <- gsub("^\\S+\\s+\\(|\\)", "", people[, 2])

# Parse the comma-separated pairs into a two-column character matrix.
city_state <- as.matrix(read.csv(text = city_state, sep = ",", header = FALSE))

# Put name, city, state and gender side by side and trim stray whitespace
# (the state keeps a leading space after the comma split).
res <- trimws(cbind(people[, 1], city_state, people[, 3]))
colnames(res) <- NULL
res
# [,1] [,2] [,3] [,4]
#[1,] "Joe Smith" "Baltimore" "MD" "Male"
#[2,] "Jane Davis" "Albany" "NY" "Female"
与上面的答案类似,不过这个方法使用 extract(),而不是 mutate() + sub() + separate() 的组合:
library(tidyverse)
# Capture the city and the state from inside the parentheses of V2 in one
# step: group 1 is everything up to ", ", group 2 is what follows it.
# Backslashes are doubled because "\(" is not a valid escape in an R string.
# extract() is namespace-qualified so another attached package exporting the
# same name (e.g. magrittr) cannot mask it.
people %>%
  as.data.frame() %>%
  tidyr::extract(V2, into = c("City", "State"), regex = ".*\\((.*), (.*)\\)")
# V1 City State V3
# 1 Joe Smith Baltimore MD Male
# 2 Jane Davis Albany NY Female
您还可以使用我的 "splitstackshape" 包中的 cSplit:
library(splitstackshape)
# Reduce V2 to the text inside the parentheses, then let cSplit() break it
# into V2_1 / V2_2 at the comma. Backslashes are doubled because "\(" and
# "\1" are not valid escapes in an R string literal.
people_dt <- as.data.table(people)[, V2 := gsub(".*\\((.*)\\)", "\\1", V2)]
cSplit(people_dt, "V2", ",")
# V1 V3 V2_1 V2_2
# 1: Joe Smith Male Baltimore MD
# 2: Jane Davis Female Albany NY
我有一堆看起来像这样的行:
# Sample data: one row per person holding the name, a "Place (City, ST)"
# string, and the gender. Filled row by row into a 2 x 3 character matrix.
people <- matrix(
  c(
    "Joe Smith", "Highland (Baltimore, MD)", "Male",
    "Jane Davis", "Trinity (Albany, NY)", "Female"
  ),
  ncol = 3,
  byrow = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)
我正在使用的正则表达式模式是:
# Regex for the "City, ST" part: a run of word characters, a comma, one
# whitespace character, then any two characters. Backslashes are doubled
# because a single backslash starts an escape sequence inside an R string
# literal ("\w" and "\s" are not valid R escapes and fail to parse).
cut <- "\\w*,\\s.."
该正则表达式模式基本上将第二列缩减为仅包含 "Baltimore, MD" 和 "Albany, NY",也就是括号内的所有内容。
然后我想使用 str_split 将城市和州放入两个单独的列中,因此最终输出将如下所示:
[,1] [,2] [,3] [,4]
[1,] "Joe Smith" "Highland (Baltimore, MD)" "Male"
[2,] "Jane Davis" "Trinity (Albany, NY)" "Female"
1 2 3 4
1 Joe Smith Baltimore MD Male
2 Jane Davis Albany NY Female
我就是想不通。
library(tidyverse)
# Keep only the text inside the parentheses of V2 ("City, ST"), then split it
# into City and State. The separator also consumes whitespace after the comma
# so State does not start with a space. Regex backslashes are doubled because
# "\(" and "\1" are not valid escapes in an R string literal.
people %>%
  as.data.frame() %>%
  mutate(V2 = sub(".*\\((.*)\\).*", "\\1", V2)) %>%
  separate(V2, into = c("City", "State"), sep = ",\\s*")
V1 City State V3
1 Joe Smith Baltimore MD Male
2 Jane Davis Albany NY Female
# Sample data: name, "Place (City, ST)", gender; converted to a data frame
# whose columns are auto-named X1, X2, X3.
people <- matrix(
  c(
    "Joe Smith", "Highland (Baltimore, MD)", "Male",
    "Jane Davis", "Trinity (Albany, NY)", "Female"
  ),
  ncol = 3,
  byrow = TRUE
)
people <- data.frame(people)

# Split X2 at the first space; the second piece is "(City, ST)".
# NOTE: this assumes the place name before the parenthesis is a single word.
in_parens <- stringr::str_split_fixed(people$X2, " ", n = 2)[, 2]

# Remove both parentheses, leaving "City, ST".
city_state <- gsub("[()]", "", in_parens)

# Split at the comma and trim, so the state is "MD" rather than " MD".
parts <- trimws(stringr::str_split_fixed(city_state, ",", n = 2))

# Append city/state to the original columns, then drop the raw X2 column.
res <- data.frame(people, parts)
names(res) <- c("name1", "name2", "name3", "name4", "name5")
res$name2 <- NULL
res
我们可以用 base R
# Strip the leading "Place (" and the trailing ")" so each element becomes
# "City, ST". Backslashes are doubled because "\S", "\s" and "\(" are not
# valid escapes in an R string literal.
city_state <- gsub("^\\S+\\s+\\(|\\)", "", people[, 2])

# Parse the comma-separated pairs into a two-column character matrix.
city_state <- as.matrix(read.csv(text = city_state, sep = ",", header = FALSE))

# Put name, city, state and gender side by side and trim stray whitespace
# (the state keeps a leading space after the comma split).
res <- trimws(cbind(people[, 1], city_state, people[, 3]))
colnames(res) <- NULL
res
# [,1] [,2] [,3] [,4]
#[1,] "Joe Smith" "Baltimore" "MD" "Male"
#[2,] "Jane Davis" "Albany" "NY" "Female"
与上面的答案类似,不过这个方法使用 extract(),而不是 mutate() + sub() + separate() 的组合:
library(tidyverse)
# Capture the city and the state from inside the parentheses of V2 in one
# step: group 1 is everything up to ", ", group 2 is what follows it.
# Backslashes are doubled because "\(" is not a valid escape in an R string.
# extract() is namespace-qualified so another attached package exporting the
# same name (e.g. magrittr) cannot mask it.
people %>%
  as.data.frame() %>%
  tidyr::extract(V2, into = c("City", "State"), regex = ".*\\((.*), (.*)\\)")
# V1 City State V3
# 1 Joe Smith Baltimore MD Male
# 2 Jane Davis Albany NY Female
您还可以使用我的 "splitstackshape" 包中的 cSplit:
library(splitstackshape)
# Reduce V2 to the text inside the parentheses, then let cSplit() break it
# into V2_1 / V2_2 at the comma. Backslashes are doubled because "\(" and
# "\1" are not valid escapes in an R string literal.
people_dt <- as.data.table(people)[, V2 := gsub(".*\\((.*)\\)", "\\1", V2)]
cSplit(people_dt, "V2", ",")
# V1 V3 V2_1 V2_2
# 1: Joe Smith Male Baltimore MD
# 2: Jane Davis Female Albany NY