如何转换基因分型数据
How to convert genotyping data
我有这个 dataframe
(维度约为 446664 × 234),名为 mydf
(提供 dput
)。此 dataframe
包含列 REF
和 ALT
.
REF
每行只有一个字母,但 ALT
可以有一个、两个甚至三个字母,用逗号 (",") 分隔,其余列(样本列)是我需要完成所有工作的列。
考虑到 REF
中的任何字母为 0,ALT
中的第一个字母分别为 1,第二个字母 2 和第三个字母 3,我需要创建一个函数以便:
我可以用字母替换所有样本列(即除 REF 和 ALT 之外的列)中的数字;
如果出现“./.”,则用 NA/NA 填充,然后去掉“/”,使每个单元格得到一对字母。
最后,我需要转置 (transpose
) 所有样本列,如 result
所示。谢谢!
# Reproducible toy data set: REF and ALT allele columns followed by one genotype
# column per sample (X860, X861, X862). Genotypes are "a/b" index pairs and
# "./." marks a missing call. Factor levels are spelled out so the object
# matches the original dput() output exactly.
mydf <- data.frame(
  REF = factor(
    c("A", "T", "G", "C", "G"),
    levels = c("A", "C", "G", "T")
  ),
  ALT = factor(
    c("C,G", "C,G", "A", "G,T", "A"),
    levels = c("A", "A,C", "A,G", "A,T", "C", "C,G", "C,T", "G", "G,T", "T")
  ),
  X860 = factor(
    c("./.", "0/1", "0/0", "./.", "./."),
    levels = c("./.", "0/0", "0/1", "0/2", "1/1")
  ),
  X861 = factor(
    c("./.", "1/2", "0/0", "./.", "./."),
    levels = c("./.", "0/0", "0/1", "0/2", "1/1", "1/2")
  ),
  X862 = factor(
    c("2/2", "0/1", "./.", "0/0", "./."),
    levels = c("./.", "0/0", "0/1", "0/2", "1/1", "2/2")
  )
)
预期输出:
X860 NANA TC GG NANA NANA
X861 NANA CG GG NANA NANA
X862 GG TC NANA CC NANA
我写出了下面的方案,但不太确定它的性能如何:
# Allele lookup table: element i is c(REF, ALT1, ALT2, ...) for row i of mydf,
# so a genotype code k selects position k + 1.
# NOTE(review): this assignment masks base R's built-in `letters` constant.
letters <- strsplit(paste(mydf$REF, mydf$ALT, sep = ","), split = ",")
# Genotype columns only (REF and ALT dropped), transposed so each sample is a row.
values <- t(mydf[, seq(3, ncol(mydf))])
# Number of columns of `values` (one per row of mydf), cached for the loops.
nbval <- ncol(values)
# Scratch vectors: one allele pair, and one decoded result per column of `values`.
chars <- character(2)
ret <- character(nbval)
# Decode every genotype into a pair of allele letters, one result row per sample.
# Reads the globals `values` (samples x positions character matrix), `letters`
# (per-position allele lookup list) and `nbval` (number of positions).
# Returns a character matrix with sample names as row names. A missing allele
# (".") is rendered as the string "NA", so "./." becomes "NANA".
# vapply() is used instead of sapply() so the result shape is fixed even when
# there are no samples; seq_len() avoids the 1:0 trap when nbval is 0.
t(vapply(
  rownames(values),
  function(sample_id) {
    # One c(code1, code2) pair per position for this sample.
    pairs <- strsplit(values[sample_id, ], "/", fixed = TRUE)
    # Codes are 0-based (0 = REF) while R indexes from 1, hence the + 1L.
    decode <- function(code, alleles) {
      if (identical(code, ".")) "NA" else alleles[as.integer(code) + 1L]
    }
    vapply(
      seq_len(nbval),
      function(i) {
        paste0(
          decode(pairs[[i]][1], letters[[i]]),
          decode(pairs[[i]][2], letters[[i]])
        )
      },
      character(1)
    )
  },
  character(nbval)
))
样本数据输出:
[,1] [,2] [,3] [,4] [,5]
X860 "NANA" "TC" "GG" "NANA" "NANA"
X861 "NANA" "CG" "GG" "NANA" "NANA"
X862 "GG" "TC" "NANA" "CC" "NANA"
根据评论更新后的函数版本(其余代码保持不变):
# Decode every genotype into a pair of allele letters, one result row per sample.
# Variant that renders a fully missing call ("./.") as a single "NA".
# Reads the globals `values` (samples x positions character matrix), `letters`
# (per-position allele lookup list) and `nbval` (number of positions).
# Returns a character matrix with sample names as row names.
# vapply() keeps the result shape fixed even when there are no samples, and an
# NA cell is treated as a missing call instead of making `if` fail.
t(vapply(
  rownames(values),
  function(sample_id) {
    calls <- values[sample_id, ]
    # One c(code1, code2) pair per position for this sample.
    pairs <- strsplit(calls, "/", fixed = TRUE)
    # Codes are 0-based (0 = REF) while R indexes from 1, hence the + 1L.
    # A half-missing call such as "./1" still yields "NA" for the "." side.
    decode <- function(code, alleles) {
      if (identical(code, ".")) "NA" else alleles[as.integer(code) + 1L]
    }
    vapply(
      seq_len(nbval),
      function(i) {
        if (is.na(calls[i]) || calls[i] == "./.") {
          return("NA")
        }
        paste0(
          decode(pairs[[i]][1], letters[[i]]),
          decode(pairs[[i]][2], letters[[i]])
        )
      },
      character(1)
    )
  },
  character(nbval)
))
输出:
[,1] [,2] [,3] [,4] [,5]
X860 "NA" "TC" "GG" "NA" "NA"
X861 "NA" "CG" "GG" "NA" "NA"
X862 "GG" "TC" "NA" "CC" "NA"
这是我的尝试:
# REF and ALT joined per row, e.g. "A,C,G"; genotype code k refers to the
# (k + 1)-th allele of that string.
refalt <- paste(mydf$REF, mydf$ALT, sep = ",")
# Split once up front instead of once per sample and per row.
alleles <- strsplit(refalt, ",", fixed = TRUE)
# Every column after REF and ALT is a sample; nothing is hard-coded to 3:5.
sample_cols <- names(mydf)[-(1:2)]
# One result row per sample (row names are the sample names), one column per
# row of mydf. Missing alleles come out as the string "NA" via paste0().
t(
  vapply(sample_cols, function(sample_col) {
    x <- as.character(mydf[[sample_col]])
    geno <- strsplit(x, "/", fixed = TRUE)
    vapply(seq_along(x), function(ix) {
      idx <- geno[[ix]][1:2]
      # Turn "." into NA explicitly so as.integer() raises no coercion warning.
      idx[idx %in% "."] <- NA
      idx <- as.integer(idx) + 1L
      # Here one might prefer a real NA over "NANA"; kept as in the question.
      paste0(alleles[[ix]][idx[1]], alleles[[ix]][idx[2]])
    }, character(1))
  }, character(nrow(mydf)))
)
# Output
#      [,1]   [,2] [,3]   [,4]   [,5]
# X860 "NANA" "TC" "GG"   "NANA" "NANA"
# X861 "NANA" "CG" "GG"   "NANA" "NANA"
# X862 "GG"   "TC" "NANA" "CC"   "NANA"
我有这个 dataframe
(维度约为 446664 × 234),名为 mydf
(提供 dput
)。此 dataframe
包含列 REF
和 ALT
.
REF
每行只有一个字母,但 ALT
可以有一个、两个甚至三个字母,用逗号 (",") 分隔,其余列(样本列)是我需要完成所有工作的列。
考虑到 REF
中的任何字母为 0,ALT
中的第一个字母分别为 1,第二个字母 2 和第三个字母 3,我需要创建一个函数以便:
我可以用字母替换所有样本列(即除 REF 和 ALT 之外的列)中的数字;
如果出现“./.”,则用 NA/NA 填充,然后去掉“/”,使每个单元格得到一对字母。
最后,我需要转置 (
transpose
) 所有样本列,如 result
所示。谢谢!
mydf<- structure(list(REF = structure(c(1L, 4L, 3L, 2L, 3L), .Label = c("A", "C", "G", "T"), class = "factor"), ALT = structure(c(6L, 6L, 1L, 9L, 1L), .Label = c("A", "A,C", "A,G", "A,T", "C", "C,G", "C,T", "G", "G,T", "T"), class = "factor"), X860 = structure(c(1L, 3L, 2L, 1L, 1L), .Label = c("./.", "0/0", "0/1", "0/2", "1/1" ), class = "factor"), X861 = structure(c(1L, 6L, 2L, 1L, 1L), .Label = c("./.", "0/0", "0/1", "0/2", "1/1", "1/2"), class = "factor"), X862 = structure(c(6L, 3L, 1L, 2L, 1L), .Label = c("./.", "0/0", "0/1", "0/2", "1/1", "2/2"), class = "factor")), .Names = c("REF", "ALT", "X860", "X861", "X862"), row.names = c(NA, -5L), class = "data.frame")
预期输出:
X860 NANA TC GG NANA NANA
X861 NANA CG GG NANA NANA
X862 GG TC NANA CC NANA
我写出了下面的方案,但不太确定它的性能如何:
# Allele lookup table: element i is c(REF, ALT1, ALT2, ...) for row i of mydf,
# so a genotype code k selects position k + 1.
# NOTE(review): this assignment masks base R's built-in `letters` constant.
letters <- strsplit(paste(mydf$REF, mydf$ALT, sep = ","), split = ",")
# Genotype columns only (REF and ALT dropped), transposed so each sample is a row.
values <- t(mydf[, seq(3, ncol(mydf))])
# Number of columns of `values` (one per row of mydf), cached for the loops.
nbval <- ncol(values)
# Scratch vectors: one allele pair, and one decoded result per column of `values`.
chars <- character(2)
ret <- character(nbval)
# Decode every genotype into a pair of allele letters, one result row per sample.
# Reads the globals `values` (samples x positions character matrix), `letters`
# (per-position allele lookup list) and `nbval` (number of positions).
# Returns a character matrix with sample names as row names. A missing allele
# (".") is rendered as the string "NA", so "./." becomes "NANA".
# vapply() is used instead of sapply() so the result shape is fixed even when
# there are no samples; seq_len() avoids the 1:0 trap when nbval is 0.
t(vapply(
  rownames(values),
  function(sample_id) {
    # One c(code1, code2) pair per position for this sample.
    pairs <- strsplit(values[sample_id, ], "/", fixed = TRUE)
    # Codes are 0-based (0 = REF) while R indexes from 1, hence the + 1L.
    decode <- function(code, alleles) {
      if (identical(code, ".")) "NA" else alleles[as.integer(code) + 1L]
    }
    vapply(
      seq_len(nbval),
      function(i) {
        paste0(
          decode(pairs[[i]][1], letters[[i]]),
          decode(pairs[[i]][2], letters[[i]])
        )
      },
      character(1)
    )
  },
  character(nbval)
))
样本数据输出:
[,1] [,2] [,3] [,4] [,5]
X860 "NANA" "TC" "GG" "NANA" "NANA"
X861 "NANA" "CG" "GG" "NANA" "NANA"
X862 "GG" "TC" "NANA" "CC" "NANA"
根据评论更新后的函数版本(其余代码保持不变):
# Decode every genotype into a pair of allele letters, one result row per sample.
# Variant that renders a fully missing call ("./.") as a single "NA".
# Reads the globals `values` (samples x positions character matrix), `letters`
# (per-position allele lookup list) and `nbval` (number of positions).
# Returns a character matrix with sample names as row names.
# vapply() keeps the result shape fixed even when there are no samples, and an
# NA cell is treated as a missing call instead of making `if` fail.
t(vapply(
  rownames(values),
  function(sample_id) {
    calls <- values[sample_id, ]
    # One c(code1, code2) pair per position for this sample.
    pairs <- strsplit(calls, "/", fixed = TRUE)
    # Codes are 0-based (0 = REF) while R indexes from 1, hence the + 1L.
    # A half-missing call such as "./1" still yields "NA" for the "." side.
    decode <- function(code, alleles) {
      if (identical(code, ".")) "NA" else alleles[as.integer(code) + 1L]
    }
    vapply(
      seq_len(nbval),
      function(i) {
        if (is.na(calls[i]) || calls[i] == "./.") {
          return("NA")
        }
        paste0(
          decode(pairs[[i]][1], letters[[i]]),
          decode(pairs[[i]][2], letters[[i]])
        )
      },
      character(1)
    )
  },
  character(nbval)
))
输出:
[,1] [,2] [,3] [,4] [,5]
X860 "NA" "TC" "GG" "NA" "NA"
X861 "NA" "CG" "GG" "NA" "NA"
X862 "GG" "TC" "NA" "CC" "NA"
这是我的尝试:
# REF and ALT joined per row, e.g. "A,C,G"; genotype code k refers to the
# (k + 1)-th allele of that string.
refalt <- paste(mydf$REF, mydf$ALT, sep = ",")
# Split once up front instead of once per sample and per row.
alleles <- strsplit(refalt, ",", fixed = TRUE)
# Every column after REF and ALT is a sample; nothing is hard-coded to 3:5.
sample_cols <- names(mydf)[-(1:2)]
# One result row per sample (row names are the sample names), one column per
# row of mydf. Missing alleles come out as the string "NA" via paste0().
t(
  vapply(sample_cols, function(sample_col) {
    x <- as.character(mydf[[sample_col]])
    geno <- strsplit(x, "/", fixed = TRUE)
    vapply(seq_along(x), function(ix) {
      idx <- geno[[ix]][1:2]
      # Turn "." into NA explicitly so as.integer() raises no coercion warning.
      idx[idx %in% "."] <- NA
      idx <- as.integer(idx) + 1L
      # Here one might prefer a real NA over "NANA"; kept as in the question.
      paste0(alleles[[ix]][idx[1]], alleles[[ix]][idx[2]])
    }, character(1))
  }, character(nrow(mydf)))
)
# Output
#      [,1]   [,2] [,3]   [,4]   [,5]
# X860 "NANA" "TC" "GG"   "NANA" "NANA"
# X861 "NANA" "CG" "GG"   "NANA" "NANA"
# X862 "GG"   "TC" "NANA" "CC"   "NANA"