两个 df R 值之间的对应关系
Correspondance between values in two df R
我有两个df要对抗。我的第一个 df 是“sum”
> head(sum)
File_pdb Res1 Chain1 Res2 Chain2
1: 7LD1_CM GLN 81 M ASN 501 C
2: 7LD1_CM TYR 128 M PHE 377 C
3: 7LD1_CM ILE 78 M SER 375 C
4: 7LD1_CM ASN 76 M ALA 372 C
5: 7LD1_CM THR 20 M TYR 369 C
6: 7LD1_CM ARG 408 C LEU 131 M
第二个是“mut”
> head(mut)
RefAA MutAA LineagesCount
1 VAL 3 GLY 3 1
2 LEU 5 PHE 5 2
3 LEU 8 VAL 8 1
4 SER 13 ILE 13 2
5 LEU 18 PHE 18 5
6 THR 20 ILE 20 1
我必须检查 sum$res1 和 sum$res2 中是否有等于 mut$refAA 的值。如果是这样,我需要将整行 mut$refAA 添加到 sum$res1 或 sum$res2 附近。
举个例子:
File_pdb Res1 Chain1 Res2 Chain2 RefAA MutAA LineagesCount
1: 7LD1_CM GLN 81 M ASN 501 C
2: 7LD1_CM TYR 128 M PHE 377 C
3: 7LD1_CM ILE 78 M SER 375 C
4: 7LD1_CM ASN 76 M ALA 372 C
5: 7LD1_CM THR 20 M TYR 369 C THR 20 ILE 20 1
6: 7LD1_CM ARG 408 C LEU 131 M
我该怎么做?我正在尝试使用合并和连接功能,但我不是很有经验,所以我需要多练习。有人能帮我吗?谢谢!
我不得不稍微修复一下数据,以便轻松导入数据。然后你可以试试 tidyverse
library(tidyverse)
SUM %>%
mutate(index = 1:n()) %>%
pivot_longer(c(Res1, Res2)) %>%
left_join(mutate(MUT, value=RefAA), by = "value") %>%
group_by(index) %>%
fill(MutAA, RefAA, LineagesCount, .direction = "downup") %>%
ungroup() %>%
pivot_wider(names_from = name, values_from = value, values_fn = toString) %>%
mutate(which_Res = ifelse(RefAA == Res1, "Res1", "Res2"))
# A tibble: 6 x 10
File_pdb Chain1 Chain2 index RefAA MutAA LineagesCount Res1 Res2 which_Res
<chr> <chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr>
1 7LD1_CM M C 1 NA NA NA GLN81 ASN501 NA
2 7LD1_CM M C 2 NA NA NA TYR128 PHE377 NA
3 7LD1_CM M C 3 NA NA NA ILE78 SER375 NA
4 7LD1_CM M C 4 NA NA NA ASN76 ALA372 NA
5 7LD1_CM M C 5 THR20 ILE20 1 THR20 TYR369 Res1
6 7LD1_CM C M 6 NA NA NA ARG408 LEU131 NA
数据
SUM <- read.table(text = " File_pdb Res1 Chain1 Res2 Chain2
1: 7LD1_CM GLN81 M ASN501 C
2: 7LD1_CM TYR128 M PHE377 C
3: 7LD1_CM ILE78 M SER375 C
4: 7LD1_CM ASN76 M ALA372 C
5: 7LD1_CM THR20 M TYR369 C
6: 7LD1_CM ARG408 C LEU131 M")
SUM
MUT <- read.table(text = " RefAA MutAA LineagesCount
1 VAL3 GLY3 1
2 LEU5 PHE5 2
3 LEU8 VAL8 1
4 SER13 ILE13 2
5 LEU18 PHE18 5
6 THR20 ILE20 1")
希望这会有所帮助
do.call(
dplyr::coalesce,
lapply(
c("Res1", "Res2"),
function(x) merge(SUM, MUT, by.x = x, by.y = "RefAA", all.x = TRUE)
)
)
这给出了
Res1 File_pdb Chain1 Res2 Chain2 MutAA LineagesCount
1 ARG408 7LD1_CM C LEU131 M <NA> NA
2 ASN76 7LD1_CM M ALA372 C <NA> NA
3 GLN81 7LD1_CM M ASN501 C <NA> NA
4 ILE78 7LD1_CM M SER375 C <NA> NA
5 THR20 7LD1_CM M TYR369 C ILE20 1
6 TYR128 7LD1_CM M PHE377 C <NA> NA
数据
> dput(SUM)
structure(list(File_pdb = c("7LD1_CM", "7LD1_CM", "7LD1_CM",
"7LD1_CM", "7LD1_CM", "7LD1_CM"), Res1 = c("GLN81", "TYR128",
"ILE78", "ASN76", "THR20", "ARG408"), Chain1 = c("M", "M", "M",
"M", "M", "C"), Res2 = c("ASN501", "PHE377", "SER375", "ALA372",
"TYR369", "LEU131"), Chain2 = c("C", "C", "C", "C", "C", "M")), class = "data.frame", row.names = c("1:",
"2:", "3:", "4:", "5:", "6:"))
> dput(MUT)
structure(list(RefAA = c("VAL3", "LEU5", "LEU8", "SER13", "LEU18",
"THR20"), MutAA = c("GLY3", "PHE5", "VAL8", "ILE13", "PHE18",
"ILE20"), LineagesCount = c(1L, 2L, 1L, 2L, 5L, 1L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
我有两个df要对抗。我的第一个 df 是“sum”
> head(sum)
File_pdb Res1 Chain1 Res2 Chain2
1: 7LD1_CM GLN 81 M ASN 501 C
2: 7LD1_CM TYR 128 M PHE 377 C
3: 7LD1_CM ILE 78 M SER 375 C
4: 7LD1_CM ASN 76 M ALA 372 C
5: 7LD1_CM THR 20 M TYR 369 C
6: 7LD1_CM ARG 408 C LEU 131 M
第二个是“mut”
> head(mut)
RefAA MutAA LineagesCount
1 VAL 3 GLY 3 1
2 LEU 5 PHE 5 2
3 LEU 8 VAL 8 1
4 SER 13 ILE 13 2
5 LEU 18 PHE 18 5
6 THR 20 ILE 20 1
我必须检查 sum$res1 和 sum$res2 中是否有等于 mut$refAA 的值。如果是这样,我需要将整行 mut$refAA 添加到 sum$res1 或 sum$res2 附近。
举个例子:
File_pdb Res1 Chain1 Res2 Chain2 RefAA MutAA LineagesCount
1: 7LD1_CM GLN 81 M ASN 501 C
2: 7LD1_CM TYR 128 M PHE 377 C
3: 7LD1_CM ILE 78 M SER 375 C
4: 7LD1_CM ASN 76 M ALA 372 C
5: 7LD1_CM THR 20 M TYR 369 C THR 20 ILE 20 1
6: 7LD1_CM ARG 408 C LEU 131 M
我该怎么做?我正在尝试使用合并和连接功能,但我不是很有经验,所以我需要多练习。有人能帮我吗?谢谢!
我不得不稍微修复一下数据,以便轻松导入数据。然后你可以试试 tidyverse
library(tidyverse)
SUM %>%
mutate(index = 1:n()) %>%
pivot_longer(c(Res1, Res2)) %>%
left_join(mutate(MUT, value=RefAA), by = "value") %>%
group_by(index) %>%
fill(MutAA, RefAA, LineagesCount, .direction = "downup") %>%
ungroup() %>%
pivot_wider(names_from = name, values_from = value, values_fn = toString) %>%
mutate(which_Res = ifelse(RefAA == Res1, "Res1", "Res2"))
# A tibble: 6 x 10
File_pdb Chain1 Chain2 index RefAA MutAA LineagesCount Res1 Res2 which_Res
<chr> <chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr>
1 7LD1_CM M C 1 NA NA NA GLN81 ASN501 NA
2 7LD1_CM M C 2 NA NA NA TYR128 PHE377 NA
3 7LD1_CM M C 3 NA NA NA ILE78 SER375 NA
4 7LD1_CM M C 4 NA NA NA ASN76 ALA372 NA
5 7LD1_CM M C 5 THR20 ILE20 1 THR20 TYR369 Res1
6 7LD1_CM C M 6 NA NA NA ARG408 LEU131 NA
数据
SUM <- read.table(text = " File_pdb Res1 Chain1 Res2 Chain2
1: 7LD1_CM GLN81 M ASN501 C
2: 7LD1_CM TYR128 M PHE377 C
3: 7LD1_CM ILE78 M SER375 C
4: 7LD1_CM ASN76 M ALA372 C
5: 7LD1_CM THR20 M TYR369 C
6: 7LD1_CM ARG408 C LEU131 M")
SUM
MUT <- read.table(text = " RefAA MutAA LineagesCount
1 VAL3 GLY3 1
2 LEU5 PHE5 2
3 LEU8 VAL8 1
4 SER13 ILE13 2
5 LEU18 PHE18 5
6 THR20 ILE20 1")
希望这会有所帮助
do.call(
dplyr::coalesce,
lapply(
c("Res1", "Res2"),
function(x) merge(SUM, MUT, by.x = x, by.y = "RefAA", all.x = TRUE)
)
)
这给出了
Res1 File_pdb Chain1 Res2 Chain2 MutAA LineagesCount
1 ARG408 7LD1_CM C LEU131 M <NA> NA
2 ASN76 7LD1_CM M ALA372 C <NA> NA
3 GLN81 7LD1_CM M ASN501 C <NA> NA
4 ILE78 7LD1_CM M SER375 C <NA> NA
5 THR20 7LD1_CM M TYR369 C ILE20 1
6 TYR128 7LD1_CM M PHE377 C <NA> NA
数据
> dput(SUM)
structure(list(File_pdb = c("7LD1_CM", "7LD1_CM", "7LD1_CM",
"7LD1_CM", "7LD1_CM", "7LD1_CM"), Res1 = c("GLN81", "TYR128",
"ILE78", "ASN76", "THR20", "ARG408"), Chain1 = c("M", "M", "M",
"M", "M", "C"), Res2 = c("ASN501", "PHE377", "SER375", "ALA372",
"TYR369", "LEU131"), Chain2 = c("C", "C", "C", "C", "C", "M")), class = "data.frame", row.names = c("1:",
"2:", "3:", "4:", "5:", "6:"))
> dput(MUT)
structure(list(RefAA = c("VAL3", "LEU5", "LEU8", "SER13", "LEU18",
"THR20"), MutAA = c("GLY3", "PHE5", "VAL8", "ILE13", "PHE18",
"ILE20"), LineagesCount = c(1L, 2L, 1L, 2L, 5L, 1L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))