如何在 R 中比较两个数据框/数据表并提取数据?
How to compare two data frames/tables and extract data in R?
为了尝试提取下面两个数据框之间的不匹配项,我已经设法创建了一个新的数据框,其中不匹配项被替换了。
我现在需要的是一个不匹配的列表:
# First table to compare: rows are named snp1..snp4, columns animal1..animal3,
# and every cell is a two-letter character code.
dfA <- data.frame(
  animal1 = c("AA", "TT", "AG", "CA"),
  animal2 = c("AA", "TB", "AG", "CA"),
  animal3 = c("AA", "TT", "AG", "CA"),
  row.names = c("snp1", "snp2", "snp3", "snp4"),
  stringsAsFactors = FALSE
)
# > dfA
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      TT
# snp3      AG      AG      AG
# snp4      CA      CA      CA

# Second table with the same row and column names; it differs from dfA in
# three cells (snp2/animal3, snp4/animal2, snp4/animal3).
dfB <- data.frame(
  animal1 = c("AA", "TT", "AG", "CA"),
  animal2 = c("AA", "TB", "AG", "DF"),
  animal3 = c("AA", "TB", "AG", "DF"),
  row.names = c("snp1", "snp2", "snp3", "snp4"),
  stringsAsFactors = FALSE
)
# > dfB
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      TB
# snp3      AG      AG      AG
# snp4      CA      DF      DF
为了澄清不匹配,这里将它们标记为 00:
# animal1 animal2 animal3
# snp1 AA AA AA
# snp2 TT TB 00
# snp3 AG AG AG
# snp4 CA 00 00
我需要以下输出:
# Desired result, written as a data-frame literal: three rows and four factor
# columns (snpname, animalname, alleledfA, alleledfB). Each column stores
# integer codes that index into its .Label vector. The value is not assigned
# to a name; its printed form is shown in the comment lines below.
structure(list(snpname = structure(c(1L, 2L, 2L), .Label = c("snp2", "snp4"), class = "factor"), animalname = structure(c(2L, 1L, 2L), .Label = c("animal2", "animal3"), class = "factor"), alleledfA = structure(c(2L, 1L, 1L), .Label = c("CA", "TT"), class = "factor"), alleledfB = structure(c(2L, 1L, 1L), .Label = c("DF", "TB"), class = "factor")), .Names = c("snpname", "animalname", "alleledfA", "alleledfB"), class = "data.frame", row.names = c(NA, -3L))
# snpname animalname alleledfA alleledfB
#1 snp2 animal3 TT TB
#2 snp4 animal2 CA DF
#3 snp4 animal3 CA DF
到目前为止,我一直尝试从用来把不匹配项替换为零的 lapply 函数中提取额外的数据,但没有成功。我还尝试编写一个 ifelse 函数,同样没有成功。希望你们能帮帮我!
最终这段代码将在维度为 100K x 1000 的数据集上运行,因此效率高是一个加分项。
这是一个使用矩阵数组索引(which 的 arr.ind 参数)的解决方案:
# Positions of every cell where the two tables disagree. With arr.ind = TRUE,
# which() returns a two-column matrix: column 1 is the row index, column 2 is
# the column index of each mismatch.
i.arr <- which(dfA != dfB, arr.ind = TRUE)

# One output row per mismatch: the row name, the column name, and the value
# found at that position in each table (looked up by matrix indexing).
data.frame(
  snp = rownames(dfA)[i.arr[, 1]],
  animal = colnames(dfA)[i.arr[, 2]],
  A = dfA[i.arr],
  B = dfB[i.arr]
)
#    snp  animal  A  B
# 1 snp4 animal2 CA DF
# 2 snp2 animal3 TT TB
# 3 snp4 animal3 CA DF
这个问题带有 data.table 标签,所以这是我使用这个包的尝试。第一步是将行名转换为列(因为 data.table 不支持行名);然后用 rbind 合并两个数据集并为每个数据集设置一个 id,再转换为长格式,找出具有多个唯一值的位置,最后转换回宽格式:
library(data.table)
# Convert both tables to data.table; keep.rownames = TRUE moves the row names
# into a leading column, referred to below as `rn`.
setDT(dfA, keep.rownames = TRUE)
setDT(dfB, keep.rownames = TRUE)

# 1. rbind(..., idcol = TRUE) stacks the tables and adds a leading `.id`
#    column identifying the source table.
# 2. melt() reshapes to long format, keeping columns 1:2 (`.id`, `rn`) as ids.
# 3. The grouped filter keeps only (rn, variable) pairs with more than one
#    distinct value, i.e. the mismatching cells.
# 4. dcast() spreads the surviving values back out, one column per source.
dcast(
  melt(
    rbind(dfA, dfB, idcol = TRUE),
    id.vars = 1:2
  )[, if (uniqueN(value) > 1L) .SD, by = .(rn, variable)],
  rn + variable ~ .id,
  value.var = "value"
)
#      rn variable  1  2
# 1: snp2  animal3 TT TB
# 2: snp4  animal2 CA DF
# 3: snp4  animal3 CA DF
这也可以通过 dplyr/tidyr,使用与 @David Arenburg 的帖子类似的方法来完成:
library(dplyr)
library(tidyr)
# List every cell that differs between dfA and dfB using dplyr/tidyr.
#
# NOTE(review): assumes dfA and dfB are still plain data frames carrying their
# row names (i.e. they have not already been converted with
# setDT(..., keep.rownames = TRUE)) -- confirm when running the file in order.
#
# Steps:
# 1. as_tibble(rownames = "rowname") moves the row names into a "rowname"
#    column (replaces the deprecated add_rownames()).
# 2. pivot_longer() reshapes to one row per (rowname, Var) cell per table
#    (replaces the superseded gather()).
# 3. Groups whose two values differ are kept; within each group the first row
#    comes from dfA and the second from dfB, so id = 1:2 labels the source.
#    1:2 also fails loudly if a group does not have exactly two rows.
# 4. ungroup() so the result does not stay grouped, then pivot_wider() puts the
#    two values side by side (replaces the superseded spread()).
bind_rows(
  as_tibble(dfA, rownames = "rowname"),
  as_tibble(dfB, rownames = "rowname")
) %>%
  pivot_longer(-rowname, names_to = "Var", values_to = "Val") %>%
  group_by(rowname, Var) %>%
  filter(n_distinct(Val) > 1) %>%
  mutate(id = 1:2) %>%
  ungroup() %>%
  pivot_wider(names_from = id, values_from = Val)
# Expected mismatches (values only; the exact tibble header depends on the
# installed package versions):
#   rowname Var     1  2
# 1 snp2    animal3 TT TB
# 2 snp4    animal2 CA DF
# 3 snp4    animal3 CA DF
为了尝试提取下面两个数据框之间的不匹配项,我已经设法创建了一个新的数据框,其中不匹配项被替换了。
我现在需要的是一个不匹配的列表:
# First table to compare: rows are named snp1..snp4, columns animal1..animal3,
# and every cell is a two-letter character code.
dfA <- data.frame(
  animal1 = c("AA", "TT", "AG", "CA"),
  animal2 = c("AA", "TB", "AG", "CA"),
  animal3 = c("AA", "TT", "AG", "CA"),
  row.names = c("snp1", "snp2", "snp3", "snp4"),
  stringsAsFactors = FALSE
)
# > dfA
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      TT
# snp3      AG      AG      AG
# snp4      CA      CA      CA

# Second table with the same row and column names; it differs from dfA in
# three cells (snp2/animal3, snp4/animal2, snp4/animal3).
dfB <- data.frame(
  animal1 = c("AA", "TT", "AG", "CA"),
  animal2 = c("AA", "TB", "AG", "DF"),
  animal3 = c("AA", "TB", "AG", "DF"),
  row.names = c("snp1", "snp2", "snp3", "snp4"),
  stringsAsFactors = FALSE
)
# > dfB
#      animal1 animal2 animal3
# snp1      AA      AA      AA
# snp2      TT      TB      TB
# snp3      AG      AG      AG
# snp4      CA      DF      DF
为了澄清不匹配,这里将它们标记为 00:
# animal1 animal2 animal3
# snp1 AA AA AA
# snp2 TT TB 00
# snp3 AG AG AG
# snp4 CA 00 00
我需要以下输出:
# Desired result, written as a data-frame literal: three rows and four factor
# columns (snpname, animalname, alleledfA, alleledfB). Each column stores
# integer codes that index into its .Label vector. The value is not assigned
# to a name; its printed form is shown in the comment lines below.
structure(list(snpname = structure(c(1L, 2L, 2L), .Label = c("snp2", "snp4"), class = "factor"), animalname = structure(c(2L, 1L, 2L), .Label = c("animal2", "animal3"), class = "factor"), alleledfA = structure(c(2L, 1L, 1L), .Label = c("CA", "TT"), class = "factor"), alleledfB = structure(c(2L, 1L, 1L), .Label = c("DF", "TB"), class = "factor")), .Names = c("snpname", "animalname", "alleledfA", "alleledfB"), class = "data.frame", row.names = c(NA, -3L))
# snpname animalname alleledfA alleledfB
#1 snp2 animal3 TT TB
#2 snp4 animal2 CA DF
#3 snp4 animal3 CA DF
到目前为止,我一直尝试从用来把不匹配项替换为零的 lapply 函数中提取额外的数据,但没有成功。我还尝试编写一个 ifelse 函数,同样没有成功。希望你们能帮帮我!
最终这段代码将在维度为 100K x 1000 的数据集上运行,因此效率高是一个加分项。
这是一个使用矩阵数组索引(which 的 arr.ind 参数)的解决方案:
# Positions of every cell where the two tables disagree. With arr.ind = TRUE,
# which() returns a two-column matrix: column 1 is the row index, column 2 is
# the column index of each mismatch.
i.arr <- which(dfA != dfB, arr.ind = TRUE)

# One output row per mismatch: the row name, the column name, and the value
# found at that position in each table (looked up by matrix indexing).
data.frame(
  snp = rownames(dfA)[i.arr[, 1]],
  animal = colnames(dfA)[i.arr[, 2]],
  A = dfA[i.arr],
  B = dfB[i.arr]
)
#    snp  animal  A  B
# 1 snp4 animal2 CA DF
# 2 snp2 animal3 TT TB
# 3 snp4 animal3 CA DF
这个问题带有 data.table 标签,所以这是我使用这个包的尝试。第一步是将行名转换为列(因为 data.table 不支持行名);然后用 rbind 合并两个数据集并为每个数据集设置一个 id,再转换为长格式,找出具有多个唯一值的位置,最后转换回宽格式:
library(data.table)
# Convert both tables to data.table; keep.rownames = TRUE moves the row names
# into a leading column, referred to below as `rn`.
setDT(dfA, keep.rownames = TRUE)
setDT(dfB, keep.rownames = TRUE)

# 1. rbind(..., idcol = TRUE) stacks the tables and adds a leading `.id`
#    column identifying the source table.
# 2. melt() reshapes to long format, keeping columns 1:2 (`.id`, `rn`) as ids.
# 3. The grouped filter keeps only (rn, variable) pairs with more than one
#    distinct value, i.e. the mismatching cells.
# 4. dcast() spreads the surviving values back out, one column per source.
dcast(
  melt(
    rbind(dfA, dfB, idcol = TRUE),
    id.vars = 1:2
  )[, if (uniqueN(value) > 1L) .SD, by = .(rn, variable)],
  rn + variable ~ .id,
  value.var = "value"
)
#      rn variable  1  2
# 1: snp2  animal3 TT TB
# 2: snp4  animal2 CA DF
# 3: snp4  animal3 CA DF
这也可以通过 dplyr/tidyr,使用与 @David Arenburg 的帖子类似的方法来完成:
library(dplyr)
library(tidyr)
# List every cell that differs between dfA and dfB using dplyr/tidyr.
#
# NOTE(review): assumes dfA and dfB are still plain data frames carrying their
# row names (i.e. they have not already been converted with
# setDT(..., keep.rownames = TRUE)) -- confirm when running the file in order.
#
# Steps:
# 1. as_tibble(rownames = "rowname") moves the row names into a "rowname"
#    column (replaces the deprecated add_rownames()).
# 2. pivot_longer() reshapes to one row per (rowname, Var) cell per table
#    (replaces the superseded gather()).
# 3. Groups whose two values differ are kept; within each group the first row
#    comes from dfA and the second from dfB, so id = 1:2 labels the source.
#    1:2 also fails loudly if a group does not have exactly two rows.
# 4. ungroup() so the result does not stay grouped, then pivot_wider() puts the
#    two values side by side (replaces the superseded spread()).
bind_rows(
  as_tibble(dfA, rownames = "rowname"),
  as_tibble(dfB, rownames = "rowname")
) %>%
  pivot_longer(-rowname, names_to = "Var", values_to = "Val") %>%
  group_by(rowname, Var) %>%
  filter(n_distinct(Val) > 1) %>%
  mutate(id = 1:2) %>%
  ungroup() %>%
  pivot_wider(names_from = id, values_from = Val)
# Expected mismatches (values only; the exact tibble header depends on the
# installed package versions):
#   rowname Var     1  2
# 1 snp2    animal3 TT TB
# 2 snp4    animal2 CA DF
# 3 snp4    animal3 CA DF