如何在R中的字符串中的某个位置之后添加双引号
How to add double quotes after a certain position in a string in R
我有一个 data.table,其中有很多行在 R 中看起来像这样:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id=UL1 protein_id=ABV71500.1
NCBINCC GenBank gene 1009 1120 . - . gene_id=UL4 protein_id=ABV71520
NCBINCC GenBank gene 1135 1200 . - . gene_id=UL6 protein_id=ABV71525
是否有一种简单的方法,在字符串 gene_id= 和 protein_id= 之后的值两侧添加双引号,使引号只括住各个基因和蛋白质的 ID,如以下输出:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
我已经看到过在 shell 中实现这一点的方法,但想知道是否有办法在 R 中也这样做。谢谢。
我们可以使用 str_replace,通过正则表达式环视(lookaround)匹配 =,捕获其后包括 . 在内的字母数字字符,并将其替换为加了双引号的反向引用 (\1):
library(stringr)
library(dplyr)
# Wrap the value after "=" in columns V9 and V10 in double quotes.
#   (?<=\\=)        lookbehind: match only directly after a literal "="
#   ([[:alnum:].]+) capture the run of alphanumerics and dots that follows
#   "\\1"           re-insert the captured value between double quotes
# Backslashes must be doubled inside R string literals: a single "\=" is a
# parse error and a single "\1" is the control character U+0001.
df1 <- df1 %>%
  mutate(across(c(V9, V10),
    ~ str_replace(., "(?<=\\=)([[:alnum:].]+)", '"\\1"')))
-输出
# Print the data frame; the commented lines below show the expected result.
df1
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
#1 NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
#2 NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
#3 NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
或者使用 base R 中的相应方法:
# Base R equivalent: quote the value after "=" in each listed column.
# perl = TRUE is required because the pattern uses a lookbehind, (?<=...).
# Backslashes are doubled so the regex engine receives "\=" and "\1";
# a single "\=" in an R string literal is a parse error.
nm1 <- c("V9", "V10")
df1[nm1] <- lapply(df1[nm1], function(x) {
  sub("(?<=\\=)([[:alnum:].]+)", '"\\1"', x, perl = TRUE)
})
数据
# Reproducible example data: three records whose V9/V10 columns hold
# unquoted key=value strings.
df1 <- data.frame(
  V1 = rep("NCBINCC", 3L),
  V2 = rep("GenBank", 3L),
  V3 = rep("gene", 3L),
  V4 = c(331L, 1009L, 1135L),
  V5 = c(1008L, 1120L, 1200L),
  V6 = rep(".", 3L),
  V7 = rep("-", 3L),
  V8 = rep(".", 3L),
  V9 = c("gene_id=UL1", "gene_id=UL4", "gene_id=UL6"),
  V10 = c("protein_id=ABV71500.1", "protein_id=ABV71520", "protein_id=ABV71525"),
  stringsAsFactors = FALSE
)
我会使用 mutate 和 stringr:
# library() errors immediately if a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(dplyr)
library(stringr)
# Quote the value part of each key=value string in V9 and V10.
myTable %>%
  mutate(across(c(V9, V10),
    function(x) {
      # Key part: everything up to and including the FIRST '='.
      # [^=]+ (rather than a greedy .+) keeps the split point consistent
      # with secondHalf below when the value itself contains '='.
      firstHalf <- str_extract(x, "^[^=]+=")
      # Value part: everything after the first '='.
      secondHalf <- str_extract(x, "(?<==).*$")
      # Add quotes to secondHalf
      newSecondHalf <- paste0("\"", secondHalf, "\"")
      # Glue it all back together and spit it out
      paste0(firstHalf, newSecondHalf)
    }))
假设有一个名为 mydatatable 的 data.table,我使用了 gsub 和 paste0。
library(dplyr)
# For V9 and V10: turn every "=" into '="' (opening quote), then append the
# closing quote to the end of the string.
mydatatable <- mutate(
  mydatatable,
  across(
    c(V9, V10),
    function(column) {
      opened <- gsub("=", '="', column)
      paste0(opened, '"')
    }
  )
)
如果您不想依赖额外的包,可以尝试在 lapply 中使用 sub。
# Base R only: quote everything after the first "=" in V9 and V10.
# lapply() supplies each column as sub()'s `x`; `pattern` and `replacement`
# are spelled out in full instead of relying on partial argument matching.
# "\\1" is the backreference to the captured value; a single "\1" in an R
# string literal is the control character U+0001, and "\=" is a parse error.
v <- c("V9", "V10")
d[v] <- lapply(d[v], sub, pattern = "=(.*)", replacement = "=\"\\1\"")
d
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
# 1 NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
# 2 NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
# 3 NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
数据
# Example data parsed from an inline whitespace-separated table.
# header = TRUE (not T: `T` is an ordinary variable that can be reassigned).
d <- read.table(header = TRUE, text = 'V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id=UL1 protein_id=ABV71500.1
NCBINCC GenBank gene 1009 1120 . - . gene_id=UL4 protein_id=ABV71520
NCBINCC GenBank gene 1135 1200 . - . gene_id=UL6 protein_id=ABV71525')
我有一个 data.table,其中有很多行在 R 中看起来像这样:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id=UL1 protein_id=ABV71500.1
NCBINCC GenBank gene 1009 1120 . - . gene_id=UL4 protein_id=ABV71520
NCBINCC GenBank gene 1135 1200 . - . gene_id=UL6 protein_id=ABV71525
是否有一种简单的方法,在字符串 gene_id= 和 protein_id= 之后的值两侧添加双引号,使引号只括住各个基因和蛋白质的 ID,如以下输出:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
我已经看到过在 shell 中实现这一点的方法,但想知道是否有办法在 R 中也这样做。谢谢。
我们可以使用 str_replace,通过正则表达式环视(lookaround)匹配 =,捕获其后包括 . 在内的字母数字字符,并将其替换为加了双引号的反向引用 (\1):
library(stringr)
library(dplyr)
# Wrap the value after "=" in columns V9 and V10 in double quotes.
#   (?<=\\=)        lookbehind: match only directly after a literal "="
#   ([[:alnum:].]+) capture the run of alphanumerics and dots that follows
#   "\\1"           re-insert the captured value between double quotes
# Backslashes must be doubled inside R string literals: a single "\=" is a
# parse error and a single "\1" is the control character U+0001.
df1 <- df1 %>%
  mutate(across(c(V9, V10),
    ~ str_replace(., "(?<=\\=)([[:alnum:].]+)", '"\\1"')))
-输出
# Print the data frame; the commented lines below show the expected result.
df1
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
#1 NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
#2 NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
#3 NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
或者使用 base R 中的相应方法:
# Base R equivalent: quote the value after "=" in each listed column.
# perl = TRUE is required because the pattern uses a lookbehind, (?<=...).
# Backslashes are doubled so the regex engine receives "\=" and "\1";
# a single "\=" in an R string literal is a parse error.
nm1 <- c("V9", "V10")
df1[nm1] <- lapply(df1[nm1], function(x) {
  sub("(?<=\\=)([[:alnum:].]+)", '"\\1"', x, perl = TRUE)
})
数据
# Reproducible example data: three records whose V9/V10 columns hold
# unquoted key=value strings.
df1 <- data.frame(
  V1 = rep("NCBINCC", 3L),
  V2 = rep("GenBank", 3L),
  V3 = rep("gene", 3L),
  V4 = c(331L, 1009L, 1135L),
  V5 = c(1008L, 1120L, 1200L),
  V6 = rep(".", 3L),
  V7 = rep("-", 3L),
  V8 = rep(".", 3L),
  V9 = c("gene_id=UL1", "gene_id=UL4", "gene_id=UL6"),
  V10 = c("protein_id=ABV71500.1", "protein_id=ABV71520", "protein_id=ABV71525"),
  stringsAsFactors = FALSE
)
我会使用 mutate 和 stringr:
# library() errors immediately if a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(dplyr)
library(stringr)
# Quote the value part of each key=value string in V9 and V10.
myTable %>%
  mutate(across(c(V9, V10),
    function(x) {
      # Key part: everything up to and including the FIRST '='.
      # [^=]+ (rather than a greedy .+) keeps the split point consistent
      # with secondHalf below when the value itself contains '='.
      firstHalf <- str_extract(x, "^[^=]+=")
      # Value part: everything after the first '='.
      secondHalf <- str_extract(x, "(?<==).*$")
      # Add quotes to secondHalf
      newSecondHalf <- paste0("\"", secondHalf, "\"")
      # Glue it all back together and spit it out
      paste0(firstHalf, newSecondHalf)
    }))
假设有一个名为 mydatatable 的 data.table,我使用了 gsub 和 paste0。
library(dplyr)
# For V9 and V10: turn every "=" into '="' (opening quote), then append the
# closing quote to the end of the string.
mydatatable <- mutate(
  mydatatable,
  across(
    c(V9, V10),
    function(column) {
      opened <- gsub("=", '="', column)
      paste0(opened, '"')
    }
  )
)
如果您不想依赖额外的包,可以尝试在 lapply 中使用 sub。
# Base R only: quote everything after the first "=" in V9 and V10.
# lapply() supplies each column as sub()'s `x`; `pattern` and `replacement`
# are spelled out in full instead of relying on partial argument matching.
# "\\1" is the backreference to the captured value; a single "\1" in an R
# string literal is the control character U+0001, and "\=" is a parse error.
v <- c("V9", "V10")
d[v] <- lapply(d[v], sub, pattern = "=(.*)", replacement = "=\"\\1\"")
d
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
# 1 NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
# 2 NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
# 3 NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
数据
# Example data parsed from an inline whitespace-separated table.
# header = TRUE (not T: `T` is an ordinary variable that can be reassigned).
d <- read.table(header = TRUE, text = 'V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id=UL1 protein_id=ABV71500.1
NCBINCC GenBank gene 1009 1120 . - . gene_id=UL4 protein_id=ABV71520
NCBINCC GenBank gene 1135 1200 . - . gene_id=UL6 protein_id=ABV71525')