比较两个数据框中的不匹配项,当某一行的不匹配比例超过 50% 时将整行重新赋值
Compare two data frames for mismatches and reassign the whole row if more than 50% of its values mismatch
我想替换下面两个数据框之间的不匹配项,并且已经能够生成一个把不匹配项替换掉的新数据框。现在我想找一种更高效的做法,例如使用 ifelse 或 data.table 包:
# Reference genotypes: one row per SNP, one column per animal.
dfA <- data.frame(
  animal1 = c("AA", "TT", "AG", "CA"),
  animal2 = c("AA", "TB", "AG", "CA"),
  animal3 = c("AA", "TT", "AG", "CA"),
  row.names = c("snp1", "snp2", "snp3", "snp4"),
  stringsAsFactors = FALSE
)
# > dfA
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      TT
# snp3      AG      AG      AG
# snp4      CA      CA      CA
# Genotypes to compare against dfA; same SNP rows and animal columns.
dfB <- data.frame(
  animal1 = c("AA", "TT", "AG", "CA"),
  animal2 = c("AA", "TB", "AG", "DF"),
  animal3 = c("AA", "TB", "AG", "DF"),
  row.names = c("snp1", "snp2", "snp3", "snp4"),
  stringsAsFactors = FALSE
)
# > dfB
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      TB
# snp3      AG      AG      AG
# snp4      CA      DF      DF
当某一行(即某个 snp)中超过 50% 的值不匹配时,我会把该 snp 的所有列都设为 "00":
# Build dfC one SNP (row) at a time from dfA and dfB:
#   * a cell of dfA that differs from the same cell of dfB becomes "00";
#   * a row in which more than 50% of the cells differ becomes "00" entirely.
# dfA itself is not modified; each row is edited on a local copy.
dfC <- do.call(rbind, lapply(rownames(dfA), function(x) {
  row_a <- dfA[x, ]
  # Column positions where the two rows disagree. which() drops NA
  # comparisons, so a missing value never counts as a mismatch.
  mismatch_idx <- which(row_a != dfB[x, ])
  mismatch_fraction <- length(mismatch_idx) / ncol(dfA)
  if (mismatch_fraction > 0.5) {
    # Too many disagreements: discard the whole SNP.
    row_a[] <- "00"
  } else {
    row_a[mismatch_idx] <- "00"
  }
  row_a
}))
data.frame(dfC)
# > data.frame(dfC)
# animal1 animal2 animal3
# snp1 AA AA AA
# snp2 TT TB 00
# snp3 AG AG AG
# snp4 00 00 00
下面的代码可以完成其中一部分(把各个不匹配的值替换为 00),但这只是部分解决方案:我还需要把最后一行(snp4)整行替换为 00:
# Cell-wise replacement only: keep dfA's value where it equals dfB's,
# otherwise write "00". local() keeps the helper matrices out of the workspace.
local({
  mat_a <- as.matrix(dfA)
  same <- mat_a == as.matrix(dfB)
  as.data.frame(ifelse(same, mat_a, "00"))
})
# animal1 animal2 animal3
# snp1 AA AA AA
# snp2 TT TB 00
# snp3 AG AG AG
# snp4 CA 00 00
这可以实现您的 50% 规则:
# Vectorised version of the 50% rule: build a logical mismatch mask, widen it
# to whole rows where more than half of the cells differ, then overwrite the
# masked cells in a single assignment.
dfA.m <- as.matrix(dfA)
dfB.m <- as.matrix(dfB)
mm <- (dfA.m != dfB.m) # mismatches
# (row, col) positions of the individual mismatches, derived from the mask so
# the comparison is evaluated only once.
i.arr <- which(mm, arr.ind = TRUE)
# Rows with more than 50% mismatches are flagged in full.
mm[rowSums(mm) > ncol(dfA.m) / 2, ] <- TRUE
# Apply the mask to a copy so dfA.m and dfB.m stay untouched.
dfC.m <- dfA.m
dfC.m[mm] <- "00"
as.data.frame(dfC.m, stringsAsFactors = FALSE)
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      00
# snp3      AG      AG      AG
# snp4      00      00      00
我想替换下面两个数据框之间的不匹配项,并且已经能够生成一个把不匹配项替换掉的新数据框。现在我想找一种更高效的做法,例如使用 ifelse 或 data.table 包:
# Reference genotypes: one row per SNP, one column per animal.
dfA <- data.frame(
  animal1 = c("AA", "TT", "AG", "CA"),
  animal2 = c("AA", "TB", "AG", "CA"),
  animal3 = c("AA", "TT", "AG", "CA"),
  row.names = c("snp1", "snp2", "snp3", "snp4"),
  stringsAsFactors = FALSE
)
# > dfA
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      TT
# snp3      AG      AG      AG
# snp4      CA      CA      CA
# Genotypes to compare against dfA; same SNP rows and animal columns.
dfB <- data.frame(
  animal1 = c("AA", "TT", "AG", "CA"),
  animal2 = c("AA", "TB", "AG", "DF"),
  animal3 = c("AA", "TB", "AG", "DF"),
  row.names = c("snp1", "snp2", "snp3", "snp4"),
  stringsAsFactors = FALSE
)
# > dfB
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      TB
# snp3      AG      AG      AG
# snp4      CA      DF      DF
当某一行(即某个 snp)中超过 50% 的值不匹配时,我会把该 snp 的所有列都设为 "00":
# Build dfC one SNP (row) at a time from dfA and dfB:
#   * a cell of dfA that differs from the same cell of dfB becomes "00";
#   * a row in which more than 50% of the cells differ becomes "00" entirely.
# dfA itself is not modified; each row is edited on a local copy.
dfC <- do.call(rbind, lapply(rownames(dfA), function(x) {
  row_a <- dfA[x, ]
  # Column positions where the two rows disagree. which() drops NA
  # comparisons, so a missing value never counts as a mismatch.
  mismatch_idx <- which(row_a != dfB[x, ])
  mismatch_fraction <- length(mismatch_idx) / ncol(dfA)
  if (mismatch_fraction > 0.5) {
    # Too many disagreements: discard the whole SNP.
    row_a[] <- "00"
  } else {
    row_a[mismatch_idx] <- "00"
  }
  row_a
}))
data.frame(dfC)
# > data.frame(dfC)
# animal1 animal2 animal3
# snp1 AA AA AA
# snp2 TT TB 00
# snp3 AG AG AG
# snp4 00 00 00
下面的代码可以完成其中一部分(把各个不匹配的值替换为 00),但这只是部分解决方案:我还需要把最后一行(snp4)整行替换为 00:
# Cell-wise replacement only: keep dfA's value where it equals dfB's,
# otherwise write "00". local() keeps the helper matrices out of the workspace.
local({
  mat_a <- as.matrix(dfA)
  same <- mat_a == as.matrix(dfB)
  as.data.frame(ifelse(same, mat_a, "00"))
})
# animal1 animal2 animal3
# snp1 AA AA AA
# snp2 TT TB 00
# snp3 AG AG AG
# snp4 CA 00 00
这可以实现您的 50% 规则:
# Matrix copies of the two data frames, compared cell by cell below.
dfA.m <- as.matrix(dfA)
dfB.m <- as.matrix(dfB)
# Row/column positions of the cells where the two matrices differ.
# NOTE(review): i.arr is not used in the lines visible here - confirm whether
# later code needs it.
i.arr <- which(dfA.m != dfB.m, arr.ind=TRUE)
mm <- (dfA.m != dfB.m) # mismatches
# Flag every cell of a row when more than half of that row's cells mismatch.
# NOTE(review): the mask is not applied in the visible lines; presumably a step
# such as dfA.m[mm] <- "00" follows - confirm.
mm[rowSums(mm) > ncol(dfA.m)/2, ] <- TRUE