检查一个数据框中的两列在 R 中是否具有相同的值 [除了 NA's]
Check whether two columns in one dataframe have the same values [aside from NA's] in R
我有 1 个数据框,j:
Chr|Pos|A0|A1|rsID|Beta-A1|P|info|maf|se|rsid
1|16021|C|T|NA|0.410|0.26|0.842|0.01|NA|rs1163602158
1|17483|C|T|rs845637483|-0.356|0.32|0.856|0.01|NA|rs845637483
1|19250|T|C|NA|-0.255|0.54|0.812|0.01|NA|rs7465843777
1|39402|T|TCAA|NA|-0.873|0.37|0.821|0.01|NA|rs2746475333
1|39883|G|C|NA|0.195|0.59|0.808|0.01|NA|rs2726463882
我想检查 rsID 和 rsid 这两列在每一行上的值是否相同(不考虑前一列 rsID 中的 NA)
所以我能做到
table(ifelse(j$rsID==j$rsid,"Yes","No"))
No Yes
701232 18207968
我能做到
table(is.na(j$rsID))
FALSE TRUE
18909200 2550533
table(is.na(j$rsid))
FALSE
21459733
所以我可以看到有 701232 行不匹配,但这些不匹配不可能全部是由 NA 造成的,因为 NA 的数量 (2550533) 比不匹配的行数还要多?
是否有更好/更简洁的方法来执行此操作,以便我对此有更好的了解?
谢谢
可以先删除这两列中含有 NA 的行,然后筛选出两列取值不相等的行:
library(dplyr)
library(tidyr)

# Discard the rows where either ID is missing, then keep only the rows whose
# two IDs disagree (use `==` instead of `!=` to keep the agreeing rows).
j %>%
  drop_na(rsID, rsid) %>%
  filter(rsID != rsid)
我们可以使用base R
# Keep only the rows where both ID columns are present, then tabulate whether
# the two IDs agree. local() keeps the helper variable out of the workspace.
local({
  id_cols <- j[c("rsID", "rsid")]
  both_present <- id_cols[complete.cases(id_cols), ]
  table(ifelse(both_present$rsID == both_present$rsid, "Yes", "No"))
})
另一个 dplyr 选项
# For each row, pool the two IDs, discard missing values and look for a
# repeated value: anyDuplicated() returns the index of the first repeat
# (2 when both IDs are present and equal) and 0 when there is none.
# NOTE(review): rowwise() evaluates the expression once per row, which may be
# slow on a very large table -- confirm this is acceptable for your data.
j %>%
  rowwise() %>%
  mutate(
    duplicate = ifelse(
      anyDuplicated(na.omit(c(rsid, rsID))) > 1,
      "Yes",
      "No"
    )
  ) %>%
  count(duplicate)
输出
# A tibble: 2 x 2
# Rowwise:
duplicate n
<chr> <int>
1 No 4
2 Yes 1
# Attach dplyr, suppressing the conflict warnings and start-up messages
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)

# Demo data only -- skip this step if `j` already exists in your session
j <- tibble(
  Chr = rep(1, 5),
  Pos = c(16021, 17483, 19250, 39402, 39883),
  A0 = c("C", "C", "T", "T", "G"),
  A1 = c("T", "T", "C", "TCAA", "C"),
  rsID = c(NA, "rs845637483", NA, NA, NA),
  `Beta-A1` = c(0.41, -0.356, -0.255, -0.873, 0.195),
  P = c(0.26, 0.32, 0.54, 0.37, 0.59),
  info = c(0.842, 0.856, 0.812, 0.821, 0.808),
  maf = rep(0.01, 5),
  se = rep(NA, 5),
  rsid = c(
    "rs1163602158", "rs845637483", "rs7465843777",
    "rs2746475333", "rs2726463882"
  )
)

# Label each row "Yes" when the two IDs are equal and "No" otherwise; rows
# where the comparison is NA (an ID is missing) also get "No" through the
# `missing` argument. Then tally the labels.
j %>%
  mutate(is_same = if_else(rsid == rsID, "Yes", "No", missing = "No")) %>%
  count(is_same)
#> # A tibble: 2 x 2
#> is_same n
#> <chr> <int>
#> 1 No 4
#> 2 Yes 1
我有 1 个数据框,j:
Chr|Pos|A0|A1|rsID|Beta-A1|P|info|maf|se|rsid
1|16021|C|T|NA|0.410|0.26|0.842|0.01|NA|rs1163602158
1|17483|C|T|rs845637483|-0.356|0.32|0.856|0.01|NA|rs845637483
1|19250|T|C|NA|-0.255|0.54|0.812|0.01|NA|rs7465843777
1|39402|T|TCAA|NA|-0.873|0.37|0.821|0.01|NA|rs2746475333
1|39883|G|C|NA|0.195|0.59|0.808|0.01|NA|rs2726463882
我想检查 rsID 和 rsid 这两列在每一行上的值是否相同(不考虑前一列 rsID 中的 NA)
所以我能做到
table(ifelse(j$rsID==j$rsid,"Yes","No"))
No Yes
701232 18207968
我能做到
table(is.na(j$rsID))
FALSE TRUE
18909200 2550533
table(is.na(j$rsid))
FALSE
21459733
所以我可以看到有 701232 行不匹配,但这些不匹配不可能全部是由 NA 造成的,因为 NA 的数量 (2550533) 比不匹配的行数还要多?
是否有更好/更简洁的方法来执行此操作,以便我对此有更好的了解?
谢谢
可以先删除这两列中含有 NA 的行,然后筛选出两列取值不相等的行:
library(dplyr)
library(tidyr)

# Discard the rows where either ID is missing, then keep only the rows whose
# two IDs disagree (use `==` instead of `!=` to keep the agreeing rows).
j %>%
  drop_na(rsID, rsid) %>%
  filter(rsID != rsid)
我们可以使用base R
# Keep only the rows where both ID columns are present, then tabulate whether
# the two IDs agree. local() keeps the helper variable out of the workspace.
local({
  id_cols <- j[c("rsID", "rsid")]
  both_present <- id_cols[complete.cases(id_cols), ]
  table(ifelse(both_present$rsID == both_present$rsid, "Yes", "No"))
})
另一个 dplyr 选项
# For each row, pool the two IDs, discard missing values and look for a
# repeated value: anyDuplicated() returns the index of the first repeat
# (2 when both IDs are present and equal) and 0 when there is none.
# NOTE(review): rowwise() evaluates the expression once per row, which may be
# slow on a very large table -- confirm this is acceptable for your data.
j %>%
  rowwise() %>%
  mutate(
    duplicate = ifelse(
      anyDuplicated(na.omit(c(rsid, rsID))) > 1,
      "Yes",
      "No"
    )
  ) %>%
  count(duplicate)
输出
# A tibble: 2 x 2
# Rowwise:
duplicate n
<chr> <int>
1 No 4
2 Yes 1
# Attach dplyr, suppressing the conflict warnings and start-up messages
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)

# Demo data only -- skip this step if `j` already exists in your session
j <- tibble(
  Chr = rep(1, 5),
  Pos = c(16021, 17483, 19250, 39402, 39883),
  A0 = c("C", "C", "T", "T", "G"),
  A1 = c("T", "T", "C", "TCAA", "C"),
  rsID = c(NA, "rs845637483", NA, NA, NA),
  `Beta-A1` = c(0.41, -0.356, -0.255, -0.873, 0.195),
  P = c(0.26, 0.32, 0.54, 0.37, 0.59),
  info = c(0.842, 0.856, 0.812, 0.821, 0.808),
  maf = rep(0.01, 5),
  se = rep(NA, 5),
  rsid = c(
    "rs1163602158", "rs845637483", "rs7465843777",
    "rs2746475333", "rs2726463882"
  )
)

# Label each row "Yes" when the two IDs are equal and "No" otherwise; rows
# where the comparison is NA (an ID is missing) also get "No" through the
# `missing` argument. Then tally the labels.
j %>%
  mutate(is_same = if_else(rsid == rsID, "Yes", "No", missing = "No")) %>%
  count(is_same)
#> # A tibble: 2 x 2
#> is_same n
#> <chr> <int>
#> 1 No 4
#> 2 Yes 1