提取特定词
Extract specific word
我有这个对象列表:
dput(head(annotations))
structure(list(X1 = c("KQ415659.1", "KQ415659.1", "KQ415659.1",
"KQ415659.1", "KQ415659.1", "KQ415659.1"), X2 = c("Genbank",
"Genbank", "Genbank", "Genbank", "Genbank", "Genbank"), X3 = c("exon",
"exon", "exon", "exon", "exon", "exon"), X4 = c(2986, 8779, 12123,
14982, 15303, 15780), X5 = c(3040, 8886, 12182, 15050, 15387,
15844), X6 = c(".", ".", ".", ".", ".", "."), X7 = c("+", "+",
"+", "+", "+", "+"), X8 = c(".", ".", ".", ".", ".", "."), X9 = c("transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028242mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028243mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028244mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028245mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028246mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028247mg\";"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
我想从所有字符串中提取以"ICBIM"开头并以"mg"结尾的单词。
# A tibble: 6 x 1
X9
<chr>
1 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028241mg\";"
2 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028242mg\";"
3 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028243mg\";"
4 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028244mg\";"
5 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028245mg\";"
6 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028246mg\";"
starts with "ICBIM" and ends with "mg"
ICBIM.+?mg
您可以在 sub 中使用此正则表达式:
# Keep only the word that starts with "ICBIM" and ends with "mg".
# Backslashes must be doubled inside an R string literal: "\\b" is the regex
# word boundary and "\\1" is the first capture group. A single-backslash "\b"
# is a backspace character and "\1" is the control character U+0001, so the
# pattern would match nothing and sub() would return its input unchanged.
sub(".*\\b(ICBIM.+mg)\\b.*", "\\1", annotations$X9)
# [1] "ICBIM_22028242mg" "ICBIM_22028243mg" "ICBIM_22028244mg" "ICBIM_22028245mg"
# [5] "ICBIM_22028246mg" "ICBIM_22028247mg"
这里,\b 表示单词边界(word boundary)。
R 中的正则表达式匹配有点奇怪。有 7 个函数(grep、grepl、sub、gsub、regexpr、gregexpr、regexec)做相关的事情,但其中没有一个直接返回实际匹配到的子字符串。最接近的做法是先用(例如)regexpr 得到匹配位置和长度,再把结果传给 regmatches 以获取字符串。
所以我会这样做:
# Two sample attribute strings in the same format as annotations$X9.
your_data <- c(
  "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028241mg\";",
  "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028242mg\";"
)
# One word that starts with "ICBIM" and ends with "mg". Limiting the middle to
# word characters (\\w) stops the match at the end of that word; a greedy ".*"
# would run on to the last "mg" anywhere later in the string.
gene_pattern <- "\\bICBIM\\w*mg\\b"
# regexpr() gives the position and length of the first match in each element.
matches <- regexpr(gene_pattern, your_data)
# regmatches() converts those positions into the matched substrings; elements
# without a match are dropped from the result.
regmatches(your_data, matches)
[1] "ICBIM_22028241mg" "ICBIM_22028242mg"
假设所有行的格式完全相同,也可以不使用正则表达式:
# Fixed-position extraction: in the sample data the gene name occupies
# characters 51 to 66 of every X9 string, so no regular expression is needed.
# This only works while every row has exactly the same layout.
substr(annotations[["X9"]], start = 51, stop = 66)
# [1] "ICBIM_22028242mg" "ICBIM_22028243mg" "ICBIM_22028244mg" "ICBIM_22028245mg" "ICBIM_22028246mg" "ICBIM_22028247mg"
# Parse each X9 string as a fixed-width record: the first 50 characters are the
# prefix and the next 16 are the gene name; anything after that is ignored.
# textConnection() returns an already-open connection, which read.fwf() does
# not close, so bind it to a name and close it explicitly to avoid leaking it.
x9_con <- textConnection(annotations$X9)
x9_fields <- read.fwf(x9_con, widths = c(50, 16), stringsAsFactors = FALSE)
close(x9_con)
x9_fields[, 2]
# [1] "ICBIM_22028242mg" "ICBIM_22028243mg" "ICBIM_22028244mg" "ICBIM_22028245mg" "ICBIM_22028246mg" "ICBIM_22028247mg"
我有这个对象列表:
dput(head(annotations))
structure(list(X1 = c("KQ415659.1", "KQ415659.1", "KQ415659.1",
"KQ415659.1", "KQ415659.1", "KQ415659.1"), X2 = c("Genbank",
"Genbank", "Genbank", "Genbank", "Genbank", "Genbank"), X3 = c("exon",
"exon", "exon", "exon", "exon", "exon"), X4 = c(2986, 8779, 12123,
14982, 15303, 15780), X5 = c(3040, 8886, 12182, 15050, 15387,
15844), X6 = c(".", ".", ".", ".", ".", "."), X7 = c("+", "+",
"+", "+", "+", "+"), X8 = c(".", ".", ".", ".", ".", "."), X9 = c("transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028242mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028243mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028244mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028245mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028246mg\";",
"transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028247mg\";"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
我想从所有字符串中提取以"ICBIM"开头并以"mg"结尾的单词。
# A tibble: 6 x 1
X9
<chr>
1 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028241mg\";"
2 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028242mg\";"
3 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028243mg\";"
4 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028244mg\";"
5 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028245mg\";"
6 "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028246mg\";"
starts with "ICBIM" and ends with "mg"
ICBIM.+?mg
您可以在 sub 中使用此正则表达式:
# Keep only the word that starts with "ICBIM" and ends with "mg".
# Backslashes must be doubled inside an R string literal: "\\b" is the regex
# word boundary and "\\1" is the first capture group. A single-backslash "\b"
# is a backspace character and "\1" is the control character U+0001, so the
# pattern would match nothing and sub() would return its input unchanged.
sub(".*\\b(ICBIM.+mg)\\b.*", "\\1", annotations$X9)
# [1] "ICBIM_22028242mg" "ICBIM_22028243mg" "ICBIM_22028244mg" "ICBIM_22028245mg"
# [5] "ICBIM_22028246mg" "ICBIM_22028247mg"
这里,\b 表示单词边界(word boundary)。
R 中的正则表达式匹配有点奇怪。有 7 个函数(grep、grepl、sub、gsub、regexpr、gregexpr、regexec)做相关的事情,但其中没有一个直接返回实际匹配到的子字符串。最接近的做法是先用(例如)regexpr 得到匹配位置和长度,再把结果传给 regmatches 以获取字符串。
所以我会这样做:
# Two sample attribute strings in the same format as annotations$X9.
your_data <- c(
  "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028241mg\";",
  "transcript_id \"rna0\"; gene_id \"gene0\"; gene_name \"ICBIM_22028242mg\";"
)
# One word that starts with "ICBIM" and ends with "mg". Limiting the middle to
# word characters (\\w) stops the match at the end of that word; a greedy ".*"
# would run on to the last "mg" anywhere later in the string.
gene_pattern <- "\\bICBIM\\w*mg\\b"
# regexpr() gives the position and length of the first match in each element.
matches <- regexpr(gene_pattern, your_data)
# regmatches() converts those positions into the matched substrings; elements
# without a match are dropped from the result.
regmatches(your_data, matches)
[1] "ICBIM_22028241mg" "ICBIM_22028242mg"
假设所有行的格式完全相同,也可以不使用正则表达式:
# Fixed-position extraction: in the sample data the gene name occupies
# characters 51 to 66 of every X9 string, so no regular expression is needed.
# This only works while every row has exactly the same layout.
substr(annotations[["X9"]], start = 51, stop = 66)
# [1] "ICBIM_22028242mg" "ICBIM_22028243mg" "ICBIM_22028244mg" "ICBIM_22028245mg" "ICBIM_22028246mg" "ICBIM_22028247mg"
# Parse each X9 string as a fixed-width record: the first 50 characters are the
# prefix and the next 16 are the gene name; anything after that is ignored.
# textConnection() returns an already-open connection, which read.fwf() does
# not close, so bind it to a name and close it explicitly to avoid leaking it.
x9_con <- textConnection(annotations$X9)
x9_fields <- read.fwf(x9_con, widths = c(50, 16), stringsAsFactors = FALSE)
close(x9_con)
x9_fields[, 2]
# [1] "ICBIM_22028242mg" "ICBIM_22028243mg" "ICBIM_22028244mg" "ICBIM_22028245mg" "ICBIM_22028246mg" "ICBIM_22028247mg"