使用 dplyr 逐行比较两个列表类型的列

Compare two columns of list-type rows with dplyr

我有以下问题:

我想根据两列之间的差异在数据框中创建一个新列,其中每一行都是一个字符串向量:

我的代码:

library(dplyr) # v.1.0.7

# Example input: one comma-separated mutation string per sequence id.
seqs <- c("seq1","seq2","seq3","seq4","seq5")
expect_mut <- c("S:T20N,S:D614G","S:T20N,S:D614G","S:T20N,N:G204R,N:G80R", "N:G204R, S:D614G", "N:G204R, S:D614G")
observed_mut <- c("S:T20N","S:D164G","S:T20N, N:G204R","S:D614G,N:G204R","S:D164G,S:T19I")

data_frame <- data.frame(seqs, expect_mut, observed_mut)
data_frame <- data_frame %>% 
  # strsplit() returns a list, so both columns become list-columns holding one
  # character vector per row. Splitting on "," keeps any blank that follows
  # the comma (e.g. " N:G204R").
  mutate(expect_mut = strsplit(as.character(expect_mut), ","),
         observed_mut = strsplit(as.character(observed_mut), ",")) %>%
  group_by(seqs) %>%
  # NOTE: setdiff() is handed the list-columns themselves, so it compares whole
  # list elements (entire character vectors) instead of the strings inside
  # them. This is the call the question is about: diff_mut ends up repeating
  # observed_mut whenever the two vectors are not identical.
  mutate(diff_mut = setdiff(observed_mut, expect_mut))

我的期望:

| seqs  |              expect_mut            |       observed_mut      |   diff_mut   |
| ----- | ---------------------------------- | ----------------------- | ------------ |
| seq1  | c("S:T20N", "S:D614G")             | S:T20N                  |              | 
| seq2  | c("S:T20N", "S:D614G")             | S:D164G                 | S:D164G      | 
| seq3  | c("S:T20N", "N:G204R", "N:G80R")   | c("S:T20N", " N:G204R") |              | 
| seq4  | c("N:G204R", "S:D614G")            | c("N:G204R", "S:D614G") |              | 
| seq5  | c("N:G204R", "S:D614G")            | c("S:D164G", "S:T19I")  | c("S:D164G", "S:T19I") | 

实际返回的结果:

| seqs  |              expect_mut            |       observed_mut      |   diff_mut   |
| ----- | ---------------------------------- | ----------------------- | ------------ |
| seq1  | c("S:T20N", "S:D614G")             | S:T20N                  | S:T20N       | 
| seq2  | c("S:T20N", "S:D614G")             | S:D164G                 | S:D164G      | 
| seq3  | c("S:T20N", "N:G204R", "N:G80R")   | c("S:T20N", " N:G204R") | c("S:T20N", " N:G204R") | 
| seq4  | c("N:G204R", "S:D614G")            | c("N:G204R", "S:D614G") | c("N:G204R", "S:D614G") | 
| seq5  | c("N:G204R", "S:D614G")            | c("S:D164G", "S:T19I")  | c("S:D164G", "S:T19I")  | 

基本上是将 observed_mut 的相同值返回到 diff_mut 列...

由于在 strsplit 之后两列都是 list,因此使用 map2 循环遍历相应的 list 元素

library(dplyr)
library(purrr)
# Build diff_mut as the per-row set difference observed_mut \ expect_mut.
# Assumes data_frame is still the unsplit data.frame(seqs, expect_mut,
# observed_mut), i.e. both mutation columns are plain strings -- TODO confirm.
#
# The split pattern ",\\s*" also consumes blanks after the comma, so a token
# such as " N:G204R" is no longer treated as different from "N:G204R". With
# this, seq3 and seq4 yield character(0); the printed output that follows was
# produced with a plain "," split and shows <chr [1]> for those two rows.
data_frame %>%
  mutate(
    expect_mut = strsplit(as.character(expect_mut), ",\\s*"),
    observed_mut = strsplit(as.character(observed_mut), ",\\s*"),
    # map2() walks both list-columns in parallel and applies setdiff() to each
    # pair of character vectors, returning a list-column.
    diff_mut = map2(observed_mut, expect_mut, setdiff)
  ) %>%
  as_tibble()

-输出

# A tibble: 5 × 4
  seqs  expect_mut observed_mut diff_mut 
  <chr> <list>     <list>       <list>   
1 seq1  <chr [2]>  <chr [1]>    <chr [0]>
2 seq2  <chr [2]>  <chr [1]>    <chr [1]>
3 seq3  <chr [3]>  <chr [2]>    <chr [1]>
4 seq4  <chr [2]>  <chr [2]>    <chr [1]>
5 seq5  <chr [2]>  <chr [2]>    <chr [2]>

或者,如果我们使用 group_by 方法(假设 'seqs' 中的所有元素都是不同的),则使用 `[[` 提取第一个列表元素:
# Grouped variant of the per-row set difference observed_mut \ expect_mut.
# Relies on every value of seqs being unique, so that each group is a single
# row and [[1]] picks that row's character vector.
# Assumes data_frame is still the unsplit data.frame(seqs, expect_mut,
# observed_mut) -- TODO confirm.
#
# The split pattern ",\\s*" also consumes blanks after the comma, so a token
# such as " S:D614G" is no longer treated as different from "S:D614G". With
# this, seq3 and seq4 yield character(0); the printed output that follows was
# produced with a plain "," split and shows <chr [1]> for those two rows.
data_frame %>%
  mutate(
    expect_mut = strsplit(as.character(expect_mut), ",\\s*"),
    observed_mut = strsplit(as.character(observed_mut), ",\\s*")
  ) %>%
  group_by(seqs) %>%
  # Wrap the result in list() so a difference of any length (including empty)
  # fits into a single list-column cell.
  mutate(diff_mut = list(setdiff(observed_mut[[1]], expect_mut[[1]]))) %>%
  ungroup()

-输出

# A tibble: 5 × 4
  seqs  expect_mut observed_mut diff_mut 
  <chr> <list>     <list>       <list>   
1 seq1  <chr [2]>  <chr [1]>    <chr [0]>
2 seq2  <chr [2]>  <chr [1]>    <chr [1]>
3 seq3  <chr [3]>  <chr [2]>    <chr [1]>
4 seq4  <chr [2]>  <chr [2]>    <chr [1]>
5 seq5  <chr [2]>  <chr [2]>    <chr [2]>

注意:与 group_by 相比,使用 rowwise 可能更不容易出错(以防 'seqs' 中有重复项)