R正则表达式在特定位置的字符之间获取字符串
R regex to fetch strings between characters at specific positions
我在R
.
中有一些字符串数据如下
# Example data: six rows; each GKT value is a ", "-separated list of
# "key:value" pairs whose values may themselves contain ":" or ",".
DT <- structure(
  list(
    ID = as.numeric(1:6),
    GKT = c(
      "G1:GRST, G45:KRPT",
      "G48932:KD56",
      "G7764:MGI45, K7786:IRE4R, K45:TG45",
      "K4512:3345, G51:56:34, K22:45I67",
      "K678:RT,IG, G123:TGIF, G33:IG56",
      "T4534:K456"
    )
  ),
  names = c("ID", "GKT"),
  class = "data.frame",
  row.names = c(NA, 6L)
)
DT
ID GKT
1 1 G1:GRST, G45:KRPT
2 2 G48932:KD56
3 3 G7764:MGI45, K7786:IRE4R, K45:TG45
4 4 K4512:3345, G51:56:34, K22:45I67
5 5 K678:RT,IG, G123:TGIF, G33:IG56
6 6 T4534:K456
我想在 R
中使用 gsub
和 regex
从 DT$GKT
获得输出 out
。
# Desired result: for each row, only the keys that precede a ":" in GKT,
# joined by ", ".
out <- c(
  "G1, G45",
  "G48932",
  "G7764, K7786, K45",
  "K4512, G51, K22",
  "K678, G123, G33",
  "T4534"
)
DT[["out"]] <- out
DT
ID GKT out
1 1 G1:GRST, G45:KRPT G1, G45
2 2 G48932:KD56 G48932
3 3 G7764:MGI45, K7786:IRE4R, K45:TG45 G7764, K7786, K45
4 4 K4512:3345, G51:56:34, K22:45I67 K4512, G51, K22
5 5 K678:RT,IG, G123:TGIF, G33:IG56 K678, G123, G33
6 6 T4534:K456 T4534
我试过gsub(x=DT$GKT, pattern = "(:)(.*)(, |\b)", replacement="")
,但它只获取第一个实例。
# First attempt. The word boundary must be written "\\b": inside an R string
# literal a single "\b" is the backspace character, not a regex assertion.
# The greedy ".*" still swallows everything after the first ":" of each
# string, which is why only the first key per row survives.
gsub(x = DT$GKT, pattern = "(:)(.*)(, |\\b)", replacement = "")
[1] "G1" "G48932" "G7764" "K4512" "K678" "T4534"
您可以使用前瞻 (?=
) 检查 :
并仅捕获第一组
# Extract every run of [A-Z0-9] that is immediately followed by ":".
# unlist() flattens the per-row matches into a single vector, and the "56"
# in "G51:56:34" is picked up as well because it also precedes a ":".
unlist(regmatches(
  DT$GKT,
  gregexpr("([A-Z0-9]+)(?=:)", DT$GKT, perl = TRUE)
))
# [1] "G1" "G45" "G48932" "G7764" "K7786" "K45" "K4512" "G51"
# [9] "56" "K22" "K678" "G123" "G33" "T4534"
编辑
您可以使用以下正则表达式替换...
# Delete each ":" together with the run of non-space characters after it,
# backing off to the last word boundary so the ", " separator is kept.
# Backslashes are doubled because "\S" is not a valid escape in an R string
# literal (the single-backslash form does not even parse).
DT$out <- gsub(":\\S+\\b", "", DT$GKT)
DT
# ID GKT out
# 1 1 G1:GRST, G45:KRPT G1, G45
# 2 2 G48932:KD56 G48932
# 3 3 G7764:MGI45, K7786:IRE4R, K45:TG45 G7764, K7786, K45
# 4 4 K4512:3345, G51:56:34, K22:45I67 K4512, G51, K22
# 5 5 K678:RT,IG, G123:TGIF, G33:IG56 K678, G123, G33
# 6 6 T4534:K456 T4534
使用 gsub
的另一个选项是使用后视
# At each ":" (confirmed by the lookahead), consume the ":" and the following
# run of [A-Z0-9,], stopping at a word boundary so a trailing "," is left in
# place. "\\b" needs the doubled backslash: a single "\b" in an R string is
# the backspace character and the pattern would never match.
DT$out <- gsub("(?=:)(.[A-Z0-9,]+)(?=\\b)", "", DT$GKT, perl = TRUE)
DT
# ID GKT out
# 1 1 G1:GRST, G45:KRPT G1, G45
# 2 2 G48932:KD56 G48932
# 3 3 G7764:MGI45, K7786:IRE4R, K45:TG45 G7764, K7786, K45
# 4 4 K4512:3345, G51:56:34, K22:45I67 K4512, G51, K22
# 5 5 K678:RT,IG, G123:TGIF, G33:IG56 K678, G123, G33
# 6 6 T4534:K456 T4534
我在R
.
# Example data: six rows; each GKT value is a ", "-separated list of
# "key:value" pairs whose values may themselves contain ":" or ",".
DT <- structure(
  list(
    ID = as.numeric(1:6),
    GKT = c(
      "G1:GRST, G45:KRPT",
      "G48932:KD56",
      "G7764:MGI45, K7786:IRE4R, K45:TG45",
      "K4512:3345, G51:56:34, K22:45I67",
      "K678:RT,IG, G123:TGIF, G33:IG56",
      "T4534:K456"
    )
  ),
  names = c("ID", "GKT"),
  class = "data.frame",
  row.names = c(NA, 6L)
)
DT
ID GKT
1 1 G1:GRST, G45:KRPT
2 2 G48932:KD56
3 3 G7764:MGI45, K7786:IRE4R, K45:TG45
4 4 K4512:3345, G51:56:34, K22:45I67
5 5 K678:RT,IG, G123:TGIF, G33:IG56
6 6 T4534:K456
我想在 R
中使用 gsub
和 regex
从 DT$GKT
获得输出 out
。
# Desired result: for each row, only the keys that precede a ":" in GKT,
# joined by ", ".
out <- c(
  "G1, G45",
  "G48932",
  "G7764, K7786, K45",
  "K4512, G51, K22",
  "K678, G123, G33",
  "T4534"
)
DT[["out"]] <- out
DT
ID GKT out
1 1 G1:GRST, G45:KRPT G1, G45
2 2 G48932:KD56 G48932
3 3 G7764:MGI45, K7786:IRE4R, K45:TG45 G7764, K7786, K45
4 4 K4512:3345, G51:56:34, K22:45I67 K4512, G51, K22
5 5 K678:RT,IG, G123:TGIF, G33:IG56 K678, G123, G33
6 6 T4534:K456 T4534
我试过gsub(x=DT$GKT, pattern = "(:)(.*)(, |\b)", replacement="")
,但它只获取第一个实例。
# First attempt. The word boundary must be written "\\b": inside an R string
# literal a single "\b" is the backspace character, not a regex assertion.
# The greedy ".*" still swallows everything after the first ":" of each
# string, which is why only the first key per row survives.
gsub(x = DT$GKT, pattern = "(:)(.*)(, |\\b)", replacement = "")
[1] "G1" "G48932" "G7764" "K4512" "K678" "T4534"
您可以使用前瞻 (?=
) 检查 :
并仅捕获第一组
# Extract every run of [A-Z0-9] that is immediately followed by ":".
# unlist() flattens the per-row matches into a single vector, and the "56"
# in "G51:56:34" is picked up as well because it also precedes a ":".
unlist(regmatches(
  DT$GKT,
  gregexpr("([A-Z0-9]+)(?=:)", DT$GKT, perl = TRUE)
))
# [1] "G1" "G45" "G48932" "G7764" "K7786" "K45" "K4512" "G51"
# [9] "56" "K22" "K678" "G123" "G33" "T4534"
编辑
您可以使用以下正则表达式替换...
# Delete each ":" together with the run of non-space characters after it,
# backing off to the last word boundary so the ", " separator is kept.
# Backslashes are doubled because "\S" is not a valid escape in an R string
# literal (the single-backslash form does not even parse).
DT$out <- gsub(":\\S+\\b", "", DT$GKT)
DT
# ID GKT out
# 1 1 G1:GRST, G45:KRPT G1, G45
# 2 2 G48932:KD56 G48932
# 3 3 G7764:MGI45, K7786:IRE4R, K45:TG45 G7764, K7786, K45
# 4 4 K4512:3345, G51:56:34, K22:45I67 K4512, G51, K22
# 5 5 K678:RT,IG, G123:TGIF, G33:IG56 K678, G123, G33
# 6 6 T4534:K456 T4534
使用 gsub
的另一个选项是使用后视
# At each ":" (confirmed by the lookahead), consume the ":" and the following
# run of [A-Z0-9,], stopping at a word boundary so a trailing "," is left in
# place. "\\b" needs the doubled backslash: a single "\b" in an R string is
# the backspace character and the pattern would never match.
DT$out <- gsub("(?=:)(.[A-Z0-9,]+)(?=\\b)", "", DT$GKT, perl = TRUE)
DT
# ID GKT out
# 1 1 G1:GRST, G45:KRPT G1, G45
# 2 2 G48932:KD56 G48932
# 3 3 G7764:MGI45, K7786:IRE4R, K45:TG45 G7764, K7786, K45
# 4 4 K4512:3345, G51:56:34, K22:45I67 K4512, G51, K22
# 5 5 K678:RT,IG, G123:TGIF, G33:IG56 K678, G123, G33
# 6 6 T4534:K456 T4534