使用 dplyr 逐行比较两个列表类型的列
Compare two list-type columns row by row with dplyr
我有以下问题:
我想根据两列之间的差异在数据框中创建一个新列,其中每一行都是一个字符串向量:
我的代码:
# Reproduction of the problem: build a small data frame and try to compute,
# per row, the observed mutations that are not among the expected ones.
library(dplyr) # v.1.0.7
# One sequence id per row; expected and observed mutations are stored as
# comma-separated strings.
seqs <- c("seq1","seq2","seq3","seq4","seq5")
expect_mut <- c("S:T20N,S:D614G","S:T20N,S:D614G","S:T20N,N:G204R,N:G80R", "N:G204R, S:D614G", "N:G204R, S:D614G")
observed_mut <- c("S:T20N","S:D164G","S:T20N, N:G204R","S:D614G,N:G204R","S:D164G,S:T19I")
data_frame <- data.frame(seqs, expect_mut, observed_mut)
# strsplit() returns a list, so after this step both columns are list columns
# holding one character vector per row.
data_frame <- data_frame %>%
mutate(expect_mut = strsplit(as.character(expect_mut), ","),
observed_mut = strsplit(as.character(observed_mut), ",")) %>%
group_by(seqs) %>%
# NOTE: setdiff() is applied to the list columns themselves rather than to the
# character vectors stored inside them, so the individual strings are never
# compared with each other.
mutate(diff_mut = setdiff(observed_mut, expect_mut))
我的期望:
| seqs | expect_mut | observed_mut | diff_mut |
| ----- | ---------------------------------- | ----------------------- | ------------ |
| seq1 | c("S:T20N", "S:D614G") | S:T20N | |
| seq2 | c("S:T20N", "S:D614G") | S:D164G | S:D164G |
| seq3 | c("S:T20N", "N:G204R", "N:G80R") | c("S:T20N", " N:G204R") | |
| seq4 | c("N:G204R", "S:D614G") | c("N:G204R", "S:D614G") | |
| seq5 | c("N:G204R", "S:D614G") | c("S:D164G", "S:T19I") | c("S:D164G", "S:T19I") |
实际返回的结果:
| seqs | expect_mut | observed_mut | diff_mut |
| ----- | ---------------------------------- | ----------------------- | ------------ |
| seq1 | c("S:T20N", "S:D614G") | S:T20N | S:T20N |
| seq2 | c("S:T20N", "S:D614G") | S:D164G | S:D164G |
| seq3 | c("S:T20N", "N:G204R", "N:G80R") | c("S:T20N", " N:G204R") | c("S:T20N", " N:G204R") |
| seq4 | c("N:G204R", "S:D614G") | c("N:G204R", "S:D614G") | c("N:G204R", "S:D614G") |
| seq5 | c("N:G204R", "S:D614G") | c("S:D164G", "S:T19I") | c("S:D164G", "S:T19I") |
结果基本上只是把 observed_mut 的值原样复制到了 diff_mut 列……
由于在 `strsplit` 之后两列都是 `list`,因此使用 `map2` 循环遍历两列中相对应的 `list` 元素:
library(dplyr)
library(purrr)

# Per-row set difference between observed and expected mutations.
# Assumes `data_frame` still holds the original comma-separated strings
# (i.e. the data.frame as first built, before any strsplit() step).
#
# The separator pattern consumes the whitespace around each comma because the
# input mixes "a,b" and "a, b". Splitting on a bare "," would leave elements
# such as " N:G204R" and " S:D614G" with a leading space, and setdiff() would
# then report them as different from "N:G204R" and "S:D614G".
data_frame %>%
  mutate(
    expect_mut = strsplit(trimws(as.character(expect_mut)), "\\s*,\\s*"),
    observed_mut = strsplit(trimws(as.character(observed_mut)), "\\s*,\\s*")
  ) %>%
  # Both columns are list columns now, so walk them in parallel and keep the
  # observed mutations that are absent from the expected ones.
  mutate(diff_mut = map2(observed_mut, expect_mut, setdiff)) %>%
  as_tibble()

# Output:
#> # A tibble: 5 × 4
#>   seqs  expect_mut observed_mut diff_mut
#>   <chr> <list>     <list>       <list>
#> 1 seq1  <chr [2]>  <chr [1]>    <chr [0]>
#> 2 seq2  <chr [2]>  <chr [1]>    <chr [1]>
#> 3 seq3  <chr [3]>  <chr [2]>    <chr [0]>
#> 4 seq4  <chr [2]>  <chr [2]>    <chr [0]>
#> 5 seq5  <chr [2]>  <chr [2]>    <chr [2]>
或者,如果使用 `group_by` 方法(假设 'seqs' 中的所有元素互不相同),则用 `[[` 提取每组的第一个列表元素:
# Per-row set difference using one group per `seqs` value.
# Assumes `data_frame` still holds the original comma-separated strings and
# that every value of `seqs` is unique, so each group contains exactly one row.
#
# The separator pattern consumes the whitespace around each comma because the
# input mixes "a,b" and "a, b". Splitting on a bare "," would leave elements
# such as " N:G204R" and " S:D614G" with a leading space, and setdiff() would
# then report them as different from "N:G204R" and "S:D614G".
data_frame %>%
  mutate(
    expect_mut = strsplit(trimws(as.character(expect_mut)), "\\s*,\\s*"),
    observed_mut = strsplit(trimws(as.character(observed_mut)), "\\s*,\\s*")
  ) %>%
  group_by(seqs) %>%
  # Within a group the list columns have length one: `[[1]]` pulls out the
  # character vectors, and list() wraps the result back into a list column.
  mutate(diff_mut = list(setdiff(observed_mut[[1]], expect_mut[[1]]))) %>%
  ungroup()

# Output:
#> # A tibble: 5 × 4
#>   seqs  expect_mut observed_mut diff_mut
#>   <chr> <list>     <list>       <list>
#> 1 seq1  <chr [2]>  <chr [1]>    <chr [0]>
#> 2 seq2  <chr [2]>  <chr [1]>    <chr [1]>
#> 3 seq3  <chr [3]>  <chr [2]>    <chr [0]>
#> 4 seq4  <chr [2]>  <chr [2]>    <chr [0]>
#> 5 seq5  <chr [2]>  <chr [2]>    <chr [2]>
注意:与 `group_by` 相比,使用 `rowwise` 可能更不容易出错(以防 'seqs' 中有重复项)。
我有以下问题:
我想根据两列之间的差异在数据框中创建一个新列,其中每一行都是一个字符串向量:
我的代码:
# Reproduction of the problem: build a small data frame and try to compute,
# per row, the observed mutations that are not among the expected ones.
library(dplyr) # v.1.0.7
# One sequence id per row; expected and observed mutations are stored as
# comma-separated strings.
seqs <- c("seq1","seq2","seq3","seq4","seq5")
expect_mut <- c("S:T20N,S:D614G","S:T20N,S:D614G","S:T20N,N:G204R,N:G80R", "N:G204R, S:D614G", "N:G204R, S:D614G")
observed_mut <- c("S:T20N","S:D164G","S:T20N, N:G204R","S:D614G,N:G204R","S:D164G,S:T19I")
data_frame <- data.frame(seqs, expect_mut, observed_mut)
# strsplit() returns a list, so after this step both columns are list columns
# holding one character vector per row.
data_frame <- data_frame %>%
mutate(expect_mut = strsplit(as.character(expect_mut), ","),
observed_mut = strsplit(as.character(observed_mut), ",")) %>%
group_by(seqs) %>%
# NOTE: setdiff() is applied to the list columns themselves rather than to the
# character vectors stored inside them, so the individual strings are never
# compared with each other.
mutate(diff_mut = setdiff(observed_mut, expect_mut))
我的期望:
| seqs | expect_mut | observed_mut | diff_mut |
| ----- | ---------------------------------- | ----------------------- | ------------ |
| seq1 | c("S:T20N", "S:D614G") | S:T20N | |
| seq2 | c("S:T20N", "S:D614G") | S:D164G | S:D164G |
| seq3 | c("S:T20N", "N:G204R", "N:G80R") | c("S:T20N", " N:G204R") | |
| seq4 | c("N:G204R", "S:D614G") | c("N:G204R", "S:D614G") | |
| seq5 | c("N:G204R", "S:D614G") | c("S:D164G", "S:T19I") | c("S:D164G", "S:T19I") |
实际返回的结果:
| seqs | expect_mut | observed_mut | diff_mut |
| ----- | ---------------------------------- | ----------------------- | ------------ |
| seq1 | c("S:T20N", "S:D614G") | S:T20N | S:T20N |
| seq2 | c("S:T20N", "S:D614G") | S:D164G | S:D164G |
| seq3 | c("S:T20N", "N:G204R", "N:G80R") | c("S:T20N", " N:G204R") | c("S:T20N", " N:G204R") |
| seq4 | c("N:G204R", "S:D614G") | c("N:G204R", "S:D614G") | c("N:G204R", "S:D614G") |
| seq5 | c("N:G204R", "S:D614G") | c("S:D164G", "S:T19I") | c("S:D164G", "S:T19I") |
结果基本上只是把 observed_mut 的值原样复制到了 diff_mut 列……
由于在 `strsplit` 之后两列都是 `list`,因此使用 `map2` 循环遍历两列中相对应的 `list` 元素:
library(dplyr)
library(purrr)

# Per-row set difference between observed and expected mutations.
# Assumes `data_frame` still holds the original comma-separated strings
# (i.e. the data.frame as first built, before any strsplit() step).
#
# The separator pattern consumes the whitespace around each comma because the
# input mixes "a,b" and "a, b". Splitting on a bare "," would leave elements
# such as " N:G204R" and " S:D614G" with a leading space, and setdiff() would
# then report them as different from "N:G204R" and "S:D614G".
data_frame %>%
  mutate(
    expect_mut = strsplit(trimws(as.character(expect_mut)), "\\s*,\\s*"),
    observed_mut = strsplit(trimws(as.character(observed_mut)), "\\s*,\\s*")
  ) %>%
  # Both columns are list columns now, so walk them in parallel and keep the
  # observed mutations that are absent from the expected ones.
  mutate(diff_mut = map2(observed_mut, expect_mut, setdiff)) %>%
  as_tibble()

# Output:
#> # A tibble: 5 × 4
#>   seqs  expect_mut observed_mut diff_mut
#>   <chr> <list>     <list>       <list>
#> 1 seq1  <chr [2]>  <chr [1]>    <chr [0]>
#> 2 seq2  <chr [2]>  <chr [1]>    <chr [1]>
#> 3 seq3  <chr [3]>  <chr [2]>    <chr [0]>
#> 4 seq4  <chr [2]>  <chr [2]>    <chr [0]>
#> 5 seq5  <chr [2]>  <chr [2]>    <chr [2]>
或者,如果使用 `group_by` 方法(假设 'seqs' 中的所有元素互不相同),则用 `[[` 提取每组的第一个列表元素:
# Per-row set difference using one group per `seqs` value.
# Assumes `data_frame` still holds the original comma-separated strings and
# that every value of `seqs` is unique, so each group contains exactly one row.
#
# The separator pattern consumes the whitespace around each comma because the
# input mixes "a,b" and "a, b". Splitting on a bare "," would leave elements
# such as " N:G204R" and " S:D614G" with a leading space, and setdiff() would
# then report them as different from "N:G204R" and "S:D614G".
data_frame %>%
  mutate(
    expect_mut = strsplit(trimws(as.character(expect_mut)), "\\s*,\\s*"),
    observed_mut = strsplit(trimws(as.character(observed_mut)), "\\s*,\\s*")
  ) %>%
  group_by(seqs) %>%
  # Within a group the list columns have length one: `[[1]]` pulls out the
  # character vectors, and list() wraps the result back into a list column.
  mutate(diff_mut = list(setdiff(observed_mut[[1]], expect_mut[[1]]))) %>%
  ungroup()

# Output:
#> # A tibble: 5 × 4
#>   seqs  expect_mut observed_mut diff_mut
#>   <chr> <list>     <list>       <list>
#> 1 seq1  <chr [2]>  <chr [1]>    <chr [0]>
#> 2 seq2  <chr [2]>  <chr [1]>    <chr [1]>
#> 3 seq3  <chr [3]>  <chr [2]>    <chr [0]>
#> 4 seq4  <chr [2]>  <chr [2]>    <chr [0]>
#> 5 seq5  <chr [2]>  <chr [2]>    <chr [2]>
注意:与 `group_by` 相比,使用 `rowwise` 可能更不容易出错(以防 'seqs' 中有重复项)。