用于处理基因型数据的 R 代码
R code to work on genotype data
我有这个数据叫做 mydf
。
我需要将 REF
和 ALT
列中的字母(DNA 字母)与 colnames(x)
("A","T","G","C"
) 进行匹配,并将相应的数值粘贴在一起作为 "REF,ALT"
。
但是,有些行在 TYPE
列中有 "snp:+[0-9]"
和 "flat$"
。
现在 "flat$"
行我想要:
- sum 来自
"snp:+[0-9]"
个相应 "start"
id 的 ALT
值,包括扁平线本身,如果
ALT
个字母是唯一的(请参阅 curly 中包含的脚本
一条直线的括号)
- 粘贴那个
ALT
值再次作为"REF,ALT"
(REF
值将是
对于具有相同起始 ID 的 "snp:+[0-9]"
和 "flat$"
相同)
- 得到输出如结果所示。
我已经为一条扁平线完成了此操作,但我需要帮助制作 flatcase
的功能,以便它对所有扁平线执行相同的操作。
如何为 flatcase
创建一个函数来执行此操作?
Code
normalCase <- function(x, ns) {
ref.idx <- which(ns == "REF")
ref.allele <- x[ref.idx]
ref.count <- x[which(ns == ref.allele)]
alt.idx <- which(ns == "ALT")
alt.allele <- x[alt.idx]
alt.count <- x[which(ns == alt.allele)]
paste(ref.count, alt.count, sep=",")
}
flatcase??{
g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"TYPE"])
myt<-x[g,]
x[g,"ALT"]
unique(x[g,"ALT"])
c<-unique(x[g,"ALT"])
flat<-myt[grepl("flat$",myt[,"TYPE"]),]
c<-unique(x[g,"ALT"])
alt.count<- sum(as.numeric(flat[c]))
}
calculateAD <- function(x, mat, ns) {
if (grepl("flat$", x[which(ns == 'TYPE')])) {
flatCase(x, mat, ns)
} else {
normalCase(x, ns)
}
}
bamAD <- function(x) {
new.x <- cbind(x, apply(x, 1, calculateAD, x, colnames(x)))
colnames(new.x)[ncol(new.x)] <- "bam.AD"
new.x
}
我试过的flatCase函数是:
flatCase <- function(x, mat, ns) {
id.idx <- which(ns == 'start')
type.idx <- which(ns == 'TYPE')
ref.idx <- which(ns == 'REF')
alt.idx <- which(ns == 'ALT')
id <- x[id.idx]
#m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
#m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),]
#flat<-mat[grepl("flat$",mat[, type.idx]),]
ref.allele <- x[ref.idx]
ref.count<-x[which(ns == ref.allele)]
alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])])))
paste(ref.count, alt.count, sep=",")
}
mydf
x <- as.matrix(read.csv(text="start,A,T,G,C,REF,ALT,TYPE
chr20:5363934,95,29,14,59,C,T,snp
chr5:8529759,24,1,28,41,G,C,snp
chr14:9620689,65,49,41,96,T,G,snp
chr18:547375,94,1,51,67,G,C,snp
chr8:5952145,27,80,25,96,T,T,snp
chr14:8694382,68,94,26,30,A,A,snp
chr16:2530921,49,15,79,72,A,T,snp:2530921
chr16:2530921,49,15,79,72,A,G,snp:2530921
chr16:2530921,49,15,79,72,A,T,snp:2530921flat
chr16:2533924,42,13,19,52,G,T,snp:2533924flat
chr16:2543344,4,13,13,42,G,T,snp:2543344flat
chr16:2543344,4,23,13,42,G,A,snp:2543344
chr14:4214117,73,49,18,77,G,A,snp
chr4:7799768,36,28,1,16,C,A,snp
chr3:9141263,27,41,93,90,A,A,snp", stringsAsFactors=FALSE))
Result:
start A T G C REF ALT TYPE bam.AD
[1,] "chr20:5363934" "95" "29" "14" "59" "C" "T" "snp" "59,29"
[2,] "chr5:8529759" "24" " 1" "28" "41" "G" "C" "snp" "28,41"
[3,] "chr14:9620689" "65" "49" "41" "96" "T" "G" "snp" "49,41"
[4,] "chr18:547375" "94" " 1" "51" "67" "G" "C" "snp" "51,67"
[5,] "chr8:5952145" "27" "80" "25" "96" "T" "T" "snp" "80,80"
[6,] "chr14:8694382" "68" "94" "26" "30" "A" "A" "snp" "68,68"
[7,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921" "49,15"
[8,] "chr16:2530921" "49" "15" "79" "72" "A" "G" "snp:2530921" "49,79"
[9,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921flat" "49,94"
[10,] "chr16:2533924" "42" "13" "19" "52" "G" "T" "snp:2533924flat" "19,13"
[11,] "chr16:2543344" "42" "13" "13" "42" "G" "T" "snp:2543344flat" "13,55"
[12,] "chr16:2543344" "42" "23" "13" "42" "G" "A" "snp:2543344" "13,42"
[13,] "chr14:4214117" "73" "49" "18" "77" "G" "A" "snp" "18,73"
[14,] "chr4:7799768" "36" "28" " 1" "16" "C" "A" "snp" "16,36"
[15,] "chr3:9141263" "27" "41" "93" "90" "A" "A" "snp" "27,27"
这是一种矢量化的方法。
首先,请注意无论类型如何,REF 都是相同的。
我们可以通过使用 REF 作为矩阵中的坐标来快速查找它,例如第 1 行有 REF C,所以如果我们查找坐标 (1, "C"),我们会得到该行的 REF 值。
# the REFs are the same regardless of TYPE
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]
看看cbind(1:nrow(x), x[, 'REF'])
:这只是一个坐标列表(row number, REF)
,我们用它来查找参考编号。
然后我们对 ALT 做同样的事情:
alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
但是我们必须确保如果类型是 'flat',我们将所有其他 ALT 添加到 'flat' 行的 ALT(如您所说,只有唯一的)。
首先,找出哪些行是平的:
which.flat <- grep('flat$', x[, 'TYPE'])
接下来,对于每个平面行,查找具有相同 'start' 的其他行的 ALT(即 x[, 'start'] == x[i, 'start']
位),并排除具有重复 ALT 的行(即 x[, 'ALT'] != x[i, 'ALT']
位)。这里i
是当前平线的索引。将它们全部添加到扁平线的 ALT。 sapply
只是对每条平线进行矢量化处理。
# add the other alts to the alt of the 'flat' line.
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
function (i) {
sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
x[, 'ALT'] != x[i, 'ALT'] ]))
})
现在我们只是粘贴在一起:
x <- cbind(x, bam.AD=paste(ref, alt, sep=','))
除第 10 行外,结果与您的相同,我认为您犯了一个错误 - 只有一行带有 "chr16:2533924",其 ALT 为 "T"(值 13),所以bam.AD
是“19,13”(你有“19,42”,就好像 ALT 是 "A",但它不是)。
如果您必须坚持问题中的函数形式(非常慢且效率低下!),它与我所做的基本相同(因此为什么您可以在没有 apply
调用的情况下完成并跳过整个循环):
flatCase <- function(x, mat, ns) {
# 获取平行的 alt
alt <- as.numeric(x[x['ALT']])
# get the other rows with the same 'start' and different 'ALT'
xx <- mat[mat[, 'start'] == x['start'] & mat[, 'ALT'] != x['ALT'], ,drop=F]
if (nrow(xx) > 0) {
# grab all the alts as done before
rownames(xx) <- 1:nrow(xx)
alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'ALT'])]))
}
ref <- x[x['REF']]
return(paste(ref, alt, sep=','))
}
然而,如前所述,如果将其矢量化,上面的整个代码将减少到几行,而且速度更快:
newBamAD <- function (x) {
# the version above
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]
alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
which.flat <- grep('flat$', x[, 'TYPE'])
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
function (i) {
sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
x[, 'ALT'] != x[i, 'ALT'] ]))
})
cbind(x, bam.AD=paste(ref, alt, sep=','))
}
library(rbenchmark)
benchmark(
bamAD=bamAD(x),
newBamAD=newBamAD(x)
)
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 bamAD 100 0.082 3.905 0.072 0.004 0 0
# 2 newBamAD 100 0.021 1.000 0.020 0.000 0 0
矢量化版本几乎快 4 倍。
另一种方法:
# create dataframe
mydf <- as.data.frame(x, stringsAsFactors=FALSE)
# create temporary values based on REF and ALT
mydf$REFval <- diag(as.matrix(mydf[, mydf$REF]))
mydf$ALTval <- diag(as.matrix(mydf[, mydf$ALT]))
在下一步中,您说要对 ALT "if the ALT letters are unique" 求和,但没有指定如果 ALT 相同但值不同,则使用哪个值。这在您的样本数据集中无关紧要,因为值相同,因此在我下面的代码中,我假定使用最后一个 ALT 值。
# sum up ALT values for all start ID
require(dplyr)
mydfs <- mydf %>% group_by(start, ALT) %>%
summarize(ALTkeep=last(ALTval)) %>% # assume keep last one if same ALT
group_by(start) %>%
summarize(ALTflat=sum(as.numeric(ALTkeep)))
# merge back into main dataframe
mydf <- left_join(mydf, mydfs)
# select ALT value for bam.AD depending on "flat$" in TYPE
mydf$bam.AD <- with(mydf,
paste(REFval, ifelse(grepl("flat$", TYPE), ALTflat, ALTval), sep=","))
# optional clean up of temporary values
mydf <- mydf[, !(names(mydf) %in% c("REFval", "ALTval", "ALTflat"))]
如你所愿的输出
start A T G C REF ALT TYPE bam.AD
1 chr20:5363934 95 29 14 59 C T snp 59,29
2 chr5:8529759 24 1 28 41 G C snp 28,41
3 chr14:9620689 65 49 41 96 T G snp 49,41
4 chr18:547375 94 1 51 67 G C snp 51,67
5 chr8:5952145 27 80 25 96 T T snp 80,80
6 chr14:8694382 68 94 26 30 A A snp 68,68
7 chr16:2530921 49 15 79 72 A T snp:2530921 49,15
8 chr16:2530921 49 15 79 72 A G snp:2530921 49,79
9 chr16:2530921 49 15 79 72 A T snp:2530921flat 49,94
10 chr16:2533924 42 13 19 52 G T snp:2533924flat 19,13
11 chr16:2543344 4 13 13 42 G T snp:2543344flat 13,55
12 chr16:2543344 42 23 13 42 G A snp:2543344 13,42
13 chr14:4214117 73 49 18 77 G A snp 18,73
14 chr4:7799768 36 28 1 16 C A snp 16,36
15 chr3:9141263 27 41 93 90 A A snp 27,27
我有这个数据叫做 mydf
。
我需要将 REF
和 ALT
列中的字母(DNA 字母)与 colnames(x)
("A","T","G","C"
) 进行匹配,并将相应的数值粘贴在一起作为 "REF,ALT"
。
但是,有些行在 TYPE
列中有 "snp:+[0-9]"
和 "flat$"
。
现在 "flat$"
行我想要:
- sum 来自
"snp:+[0-9]"
个相应"start"
id 的ALT
值,包括扁平线本身,如果ALT
个字母是唯一的(请参阅 curly 中包含的脚本 一条直线的括号) - 粘贴那个
ALT
值再次作为"REF,ALT"
(REF
值将是 对于具有相同起始 ID 的"snp:+[0-9]"
和"flat$"
相同) - 得到输出如结果所示。
我已经为一条扁平线完成了此操作,但我需要帮助制作 flatcase
的功能,以便它对所有扁平线执行相同的操作。
如何为 flatcase
创建一个函数来执行此操作?
Code
normalCase <- function(x, ns) {
ref.idx <- which(ns == "REF")
ref.allele <- x[ref.idx]
ref.count <- x[which(ns == ref.allele)]
alt.idx <- which(ns == "ALT")
alt.allele <- x[alt.idx]
alt.count <- x[which(ns == alt.allele)]
paste(ref.count, alt.count, sep=",")
}
flatcase??{
g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"TYPE"])
myt<-x[g,]
x[g,"ALT"]
unique(x[g,"ALT"])
c<-unique(x[g,"ALT"])
flat<-myt[grepl("flat$",myt[,"TYPE"]),]
c<-unique(x[g,"ALT"])
alt.count<- sum(as.numeric(flat[c]))
}
calculateAD <- function(x, mat, ns) {
if (grepl("flat$", x[which(ns == 'TYPE')])) {
flatCase(x, mat, ns)
} else {
normalCase(x, ns)
}
}
bamAD <- function(x) {
new.x <- cbind(x, apply(x, 1, calculateAD, x, colnames(x)))
colnames(new.x)[ncol(new.x)] <- "bam.AD"
new.x
}
我试过的flatCase函数是:
flatCase <- function(x, mat, ns) {
id.idx <- which(ns == 'start')
type.idx <- which(ns == 'TYPE')
ref.idx <- which(ns == 'REF')
alt.idx <- which(ns == 'ALT')
id <- x[id.idx]
#m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
#m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),]
#flat<-mat[grepl("flat$",mat[, type.idx]),]
ref.allele <- x[ref.idx]
ref.count<-x[which(ns == ref.allele)]
alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])])))
paste(ref.count, alt.count, sep=",")
}
mydf
x <- as.matrix(read.csv(text="start,A,T,G,C,REF,ALT,TYPE
chr20:5363934,95,29,14,59,C,T,snp
chr5:8529759,24,1,28,41,G,C,snp
chr14:9620689,65,49,41,96,T,G,snp
chr18:547375,94,1,51,67,G,C,snp
chr8:5952145,27,80,25,96,T,T,snp
chr14:8694382,68,94,26,30,A,A,snp
chr16:2530921,49,15,79,72,A,T,snp:2530921
chr16:2530921,49,15,79,72,A,G,snp:2530921
chr16:2530921,49,15,79,72,A,T,snp:2530921flat
chr16:2533924,42,13,19,52,G,T,snp:2533924flat
chr16:2543344,4,13,13,42,G,T,snp:2543344flat
chr16:2543344,4,23,13,42,G,A,snp:2543344
chr14:4214117,73,49,18,77,G,A,snp
chr4:7799768,36,28,1,16,C,A,snp
chr3:9141263,27,41,93,90,A,A,snp", stringsAsFactors=FALSE))
Result:
start A T G C REF ALT TYPE bam.AD
[1,] "chr20:5363934" "95" "29" "14" "59" "C" "T" "snp" "59,29"
[2,] "chr5:8529759" "24" " 1" "28" "41" "G" "C" "snp" "28,41"
[3,] "chr14:9620689" "65" "49" "41" "96" "T" "G" "snp" "49,41"
[4,] "chr18:547375" "94" " 1" "51" "67" "G" "C" "snp" "51,67"
[5,] "chr8:5952145" "27" "80" "25" "96" "T" "T" "snp" "80,80"
[6,] "chr14:8694382" "68" "94" "26" "30" "A" "A" "snp" "68,68"
[7,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921" "49,15"
[8,] "chr16:2530921" "49" "15" "79" "72" "A" "G" "snp:2530921" "49,79"
[9,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921flat" "49,94"
[10,] "chr16:2533924" "42" "13" "19" "52" "G" "T" "snp:2533924flat" "19,13"
[11,] "chr16:2543344" "42" "13" "13" "42" "G" "T" "snp:2543344flat" "13,55"
[12,] "chr16:2543344" "42" "23" "13" "42" "G" "A" "snp:2543344" "13,42"
[13,] "chr14:4214117" "73" "49" "18" "77" "G" "A" "snp" "18,73"
[14,] "chr4:7799768" "36" "28" " 1" "16" "C" "A" "snp" "16,36"
[15,] "chr3:9141263" "27" "41" "93" "90" "A" "A" "snp" "27,27"
这是一种矢量化的方法。
首先,请注意无论类型如何,REF 都是相同的。 我们可以通过使用 REF 作为矩阵中的坐标来快速查找它,例如第 1 行有 REF C,所以如果我们查找坐标 (1, "C"),我们会得到该行的 REF 值。
# the REFs are the same regardless of TYPE
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]
看看cbind(1:nrow(x), x[, 'REF'])
:这只是一个坐标列表(row number, REF)
,我们用它来查找参考编号。
然后我们对 ALT 做同样的事情:
alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
但是我们必须确保如果类型是 'flat',我们将所有其他 ALT 添加到 'flat' 行的 ALT(如您所说,只有唯一的)。
首先,找出哪些行是平的:
which.flat <- grep('flat$', x[, 'TYPE'])
接下来,对于每个平面行,查找具有相同 'start' 的其他行的 ALT(即 x[, 'start'] == x[i, 'start']
位),并排除具有重复 ALT 的行(即 x[, 'ALT'] != x[i, 'ALT']
位)。这里i
是当前平线的索引。将它们全部添加到扁平线的 ALT。 sapply
只是对每条平线进行矢量化处理。
# add the other alts to the alt of the 'flat' line.
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
function (i) {
sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
x[, 'ALT'] != x[i, 'ALT'] ]))
})
现在我们只是粘贴在一起:
x <- cbind(x, bam.AD=paste(ref, alt, sep=','))
除第 10 行外,结果与您的相同,我认为您犯了一个错误 - 只有一行带有 "chr16:2533924",其 ALT 为 "T"(值 13),所以bam.AD
是“19,13”(你有“19,42”,就好像 ALT 是 "A",但它不是)。
如果您必须坚持问题中的函数形式(非常慢且效率低下!),它与我所做的基本相同(因此为什么您可以在没有 apply
调用的情况下完成并跳过整个循环):
flatCase <- function(x, mat, ns) { # 获取平行的 alt alt <- as.numeric(x[x['ALT']])
# get the other rows with the same 'start' and different 'ALT'
xx <- mat[mat[, 'start'] == x['start'] & mat[, 'ALT'] != x['ALT'], ,drop=F]
if (nrow(xx) > 0) {
# grab all the alts as done before
rownames(xx) <- 1:nrow(xx)
alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'ALT'])]))
}
ref <- x[x['REF']]
return(paste(ref, alt, sep=','))
}
然而,如前所述,如果将其矢量化,上面的整个代码将减少到几行,而且速度更快:
newBamAD <- function (x) {
# the version above
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]
alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
which.flat <- grep('flat$', x[, 'TYPE'])
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
function (i) {
sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
x[, 'ALT'] != x[i, 'ALT'] ]))
})
cbind(x, bam.AD=paste(ref, alt, sep=','))
}
library(rbenchmark)
benchmark(
bamAD=bamAD(x),
newBamAD=newBamAD(x)
)
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 bamAD 100 0.082 3.905 0.072 0.004 0 0
# 2 newBamAD 100 0.021 1.000 0.020 0.000 0 0
矢量化版本几乎快 4 倍。
另一种方法:
# create dataframe
mydf <- as.data.frame(x, stringsAsFactors=FALSE)
# create temporary values based on REF and ALT
mydf$REFval <- diag(as.matrix(mydf[, mydf$REF]))
mydf$ALTval <- diag(as.matrix(mydf[, mydf$ALT]))
在下一步中,您说要对 ALT "if the ALT letters are unique" 求和,但没有指定如果 ALT 相同但值不同,则使用哪个值。这在您的样本数据集中无关紧要,因为值相同,因此在我下面的代码中,我假定使用最后一个 ALT 值。
# sum up ALT values for all start ID
require(dplyr)
mydfs <- mydf %>% group_by(start, ALT) %>%
summarize(ALTkeep=last(ALTval)) %>% # assume keep last one if same ALT
group_by(start) %>%
summarize(ALTflat=sum(as.numeric(ALTkeep)))
# merge back into main dataframe
mydf <- left_join(mydf, mydfs)
# select ALT value for bam.AD depending on "flat$" in TYPE
mydf$bam.AD <- with(mydf,
paste(REFval, ifelse(grepl("flat$", TYPE), ALTflat, ALTval), sep=","))
# optional clean up of temporary values
mydf <- mydf[, !(names(mydf) %in% c("REFval", "ALTval", "ALTflat"))]
如你所愿的输出
start A T G C REF ALT TYPE bam.AD
1 chr20:5363934 95 29 14 59 C T snp 59,29
2 chr5:8529759 24 1 28 41 G C snp 28,41
3 chr14:9620689 65 49 41 96 T G snp 49,41
4 chr18:547375 94 1 51 67 G C snp 51,67
5 chr8:5952145 27 80 25 96 T T snp 80,80
6 chr14:8694382 68 94 26 30 A A snp 68,68
7 chr16:2530921 49 15 79 72 A T snp:2530921 49,15
8 chr16:2530921 49 15 79 72 A G snp:2530921 49,79
9 chr16:2530921 49 15 79 72 A T snp:2530921flat 49,94
10 chr16:2533924 42 13 19 52 G T snp:2533924flat 19,13
11 chr16:2543344 4 13 13 42 G T snp:2543344flat 13,55
12 chr16:2543344 42 23 13 42 G A snp:2543344 13,42
13 chr14:4214117 73 49 18 77 G A snp 18,73
14 chr4:7799768 36 28 1 16 C A snp 16,36
15 chr3:9141263 27 41 93 90 A A snp 27,27