从 tab delim 文件中提取最长的序列
Extracting the longest sequence from the tab delim file
我有 tab delim 文件,其中包含以下信息
>fasta
>ss_23_122_0_1
MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS
>ss_23_167_0_1
WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW
>ss_23_167_0_1
MAASDASDWEPWERIWERIWER
>ss_23_167_0_1
QWEKCKLSDOIEOWIOWEUWWEUWEZURZEWURZUWEUZUQZUWZUE
>ss_45_201_0_1
HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER
>ss_45_201_0_1
ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE
>ss_89_10_0_2
NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP
对于像ss_45_201_0_1
和ss_23_167_0_1
这样的id有多个条目,我想只保留那些长度最大的条目。我想得到如下输出:
>fasta
>ss_23_122_0_1
MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS
>ss_23_167_0_1
WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW
>ss_45_201_0_1
HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER
>ss_89_10_0_2
NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP
我在 R 中尝试了以下代码,但它失败了
Unique(fasta)
谁能指导一下。对于那些具有多个不同长度条目的相同 ID,我怎样才能只获得最长的序列。
也许有更优雅的方式...
l <-list(ss_23_122_0_1 = "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS",
ss_23_167_0_1 = "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW",
ss_23_167_0_1 = "MAASDASDWEPWERIWERIWER",
ss_23_167_0_1 = "QWEKCKLSDOIEOWIOWEUWWEUWEZURZEWURZUWEUZUQZUWZUE",
ss_45_201_0_1 = "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER",
ss_45_201_0_1 = "ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE",
ss_89_10_0_2 = "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP")
res <- split(l, names(l))
ind <- lapply(split(sapply(l, nchar), names(l)), which.max)
Map(function(x, y) x[y], res, ind)
$ss_23_122_0_1
$ss_23_122_0_1$ss_23_122_0_1
[1] "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS"
$ss_23_167_0_1
$ss_23_167_0_1$ss_23_167_0_1
[1] "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW"
$ss_45_201_0_1
$ss_45_201_0_1$ss_45_201_0_1
[1] "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER"
$ss_89_10_0_2
$ss_89_10_0_2$ss_89_10_0_2
[1] "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"
这里有三个选项可供考虑。
选项 1:基础 R
取消列出列表,在其上使用 nchar
,然后使用 ave
计算出要保留的值。
x <- nchar(unlist(l))
l[as.logical(ave(x, names(x), FUN = function(x) x == max(x)))]
# $ss_23_122_0_1
# [1] "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS"
#
# $ss_23_167_0_1
# [1] "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW"
#
# $ss_45_201_0_1
# [1] "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER"
#
# $ss_89_10_0_2
# [1] "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"
选项 2:"data.table"
使用 "reshape2" 中的 melt
创建一个 data.frame
。使用 rank
和 nchar
进行子集化。 (我使用 rank 而不是 ==
,这样我就不必使用 nchar
两次——还没有检查比较效率。)
library(data.table)
library(reshape2)
as.data.table(melt(l))[, Rnk := rank(nchar(as.character(value))),
by = L1][Rnk == 1]
# value L1 Rnk
# 1: MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS ss_23_122_0_1 1
# 2: MAASDASDWEPWERIWERIWER ss_23_167_0_1 1
# 3: ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE ss_45_201_0_1 1
# 4: NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP ss_89_10_0_2 1
选项 3:"dplyr"
与 "data.table" 类似的方法。
library(dplyr)
library(reshape2)
melt(l) %>%
group_by(L1) %>%
mutate(Rnk = dense_rank(nchar(as.character(value)))) %>%
filter(Rnk == 1)
# Source: local data frame [4 x 3]
# Groups: L1
#
# value L1 Rnk
# 1 MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS ss_23_122_0_1 1
# 2 MAASDASDWEPWERIWERIWER ss_23_167_0_1 1
# 3 ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE ss_45_201_0_1 1
# 4 NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP ss_89_10_0_2 1
我有 tab delim 文件,其中包含以下信息
>fasta
>ss_23_122_0_1
MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS
>ss_23_167_0_1
WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW
>ss_23_167_0_1
MAASDASDWEPWERIWERIWER
>ss_23_167_0_1
QWEKCKLSDOIEOWIOWEUWWEUWEZURZEWURZUWEUZUQZUWZUE
>ss_45_201_0_1
HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER
>ss_45_201_0_1
ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE
>ss_89_10_0_2
NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP
对于像ss_45_201_0_1
和ss_23_167_0_1
这样的id有多个条目,我想只保留那些长度最大的条目。我想得到如下输出:
>fasta
>ss_23_122_0_1
MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS
>ss_23_167_0_1
WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW
>ss_45_201_0_1
HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER
>ss_89_10_0_2
NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP
我在 R 中尝试了以下代码,但它失败了
Unique(fasta)
谁能指导一下。对于那些具有多个不同长度条目的相同 ID,我怎样才能只获得最长的序列。
也许有更优雅的方式...
l <-list(ss_23_122_0_1 = "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS",
ss_23_167_0_1 = "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW",
ss_23_167_0_1 = "MAASDASDWEPWERIWERIWER",
ss_23_167_0_1 = "QWEKCKLSDOIEOWIOWEUWWEUWEZURZEWURZUWEUZUQZUWZUE",
ss_45_201_0_1 = "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER",
ss_45_201_0_1 = "ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE",
ss_89_10_0_2 = "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP")
res <- split(l, names(l))
ind <- lapply(split(sapply(l, nchar), names(l)), which.max)
Map(function(x, y) x[y], res, ind)
$ss_23_122_0_1
$ss_23_122_0_1$ss_23_122_0_1
[1] "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS"
$ss_23_167_0_1
$ss_23_167_0_1$ss_23_167_0_1
[1] "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW"
$ss_45_201_0_1
$ss_45_201_0_1$ss_45_201_0_1
[1] "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER"
$ss_89_10_0_2
$ss_89_10_0_2$ss_89_10_0_2
[1] "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"
这里有三个选项可供考虑。
选项 1:基础 R
取消列出列表,在其上使用 nchar
,然后使用 ave
计算出要保留的值。
x <- nchar(unlist(l))
l[as.logical(ave(x, names(x), FUN = function(x) x == max(x)))]
# $ss_23_122_0_1
# [1] "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS"
#
# $ss_23_167_0_1
# [1] "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW"
#
# $ss_45_201_0_1
# [1] "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER"
#
# $ss_89_10_0_2
# [1] "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"
选项 2:"data.table"
使用 "reshape2" 中的 melt
创建一个 data.frame
。使用 rank
和 nchar
进行子集化。 (我使用 rank 而不是 ==
,这样我就不必使用 nchar
两次——还没有检查比较效率。)
library(data.table)
library(reshape2)
as.data.table(melt(l))[, Rnk := rank(nchar(as.character(value))),
by = L1][Rnk == 1]
# value L1 Rnk
# 1: MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS ss_23_122_0_1 1
# 2: MAASDASDWEPWERIWERIWER ss_23_167_0_1 1
# 3: ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE ss_45_201_0_1 1
# 4: NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP ss_89_10_0_2 1
选项 3:"dplyr"
与 "data.table" 类似的方法。
library(dplyr)
library(reshape2)
melt(l) %>%
group_by(L1) %>%
mutate(Rnk = dense_rank(nchar(as.character(value)))) %>%
filter(Rnk == 1)
# Source: local data frame [4 x 3]
# Groups: L1
#
# value L1 Rnk
# 1 MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS ss_23_122_0_1 1
# 2 MAASDASDWEPWERIWERIWER ss_23_167_0_1 1
# 3 ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE ss_45_201_0_1 1
# 4 NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP ss_89_10_0_2 1