在 summarize() 中提取匹配变量
extracting matching variable in summarize()
我有一个示例数据集
gene_name | motif_id | matched_sequence |
---|---|---|
A | y1 | CCC |
A | y2 | CCAAA |
A | y3 | AAG |
A | y3 | AT |
B | y1 | AAAA |
B | y4 | AAT |
C | y5 | AAGG |
并希望在 R 中得到如下形式的数据集:
gene_name | Node1 | Node2 | sequence | occurence |
---|---|---|---|---|
A | y1 | y2 | CCC, CCAAA | 2 |
A | y1 | y3 | CCC,AAG,AT | 3 |
A | y2 | y3 | CCAAA,AAG,AT | 3 |
B | y1 | y4 | AAAA,AAT | 2 |
对于每个 gene_name,我想列出其 motif_id 的所有两两组合(Node1 与 Node2,组合之间不重复),并给出每个组合对应的序列列表及其出现次数。
我试过了:
# Asker's attempt (kept as posted): collapse the sequences of every
# gene/motif pair, then pair up the motif ids within each gene.
data%>%
group_by(gene_name, motif_id) %>%
# One row per (gene_name, motif_id); that motif's sequences are joined by ",".
summarize(matched_sequence = paste0(matched_sequence, collapse = ",")) %>%
# summarize() drops the last grouping level by default, so n() here counts the
# rows (distinct motif ids) left per gene_name; genes with fewer than 2 are removed.
mutate(count = n()) %>% filter(count>=2) %>%
# combn() builds one (Node1, Node2) named vector per motif pair, but toString()
# collapses ALL sequences of the gene into one string that is repeated for
# every pair, and no occurrence column is computed anywhere -- which matches
# the problem described in the question.
summarize(motif_id = combn(motif_id, 2, function(x) list(setNames(x, c('Node1', 'Node2')))), matched_sequence = toString(matched_sequence),
.groups = 'keep') %>%
# Spread the named pair vectors into Node1 / Node2 columns.
tidyr::unnest_wider(motif_id)
但是无法得到 sequence 和 occurence 这两列。谁能给我一个建议?
我们按 'gene_name' 进行分组,仅保留 'motif_id' 中不同元素的数量(`n_distinct`)大于 1 的组。然后对 'motif_id' 的 `unique` 元素求两两组合(`combn`),通过提取与这些 'motif_id' 值匹配的 'matched_sequence' 创建 'sequence',用 `lengths` 取该 list 各元素的长度得到 'occurence',使用 `unnest_wider` 从 list 列创建各个 Node 列,最后通过 `paste`(此处为 `toString`)拼接 list 中的元素,将 'sequence' list 列转换为 character 列。
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
# Build one row per gene and unordered pair of distinct motif ids, together
# with every matched sequence belonging to either motif of the pair and the
# number of such sequences. Reads columns gene_name, motif_id and
# matched_sequence from `data`.
data %>%
  dplyr::group_by(gene_name) %>%
  # A pair needs at least two distinct motifs; this also keeps combn() from
  # being asked for 2 elements out of fewer than 2.
  dplyr::filter(dplyr::n_distinct(motif_id) > 1) %>%
  # summarise() returns one row per pair here. NOTE(review): dplyr >= 1.1.0
  # deprecates multi-row summaries in favour of reframe(); summarise() is kept
  # so the snippet still runs on older dplyr versions.
  dplyr::summarise(
    # Name each pair up front so unnest_wider() produces Node1 / Node2
    # directly instead of relying on auto-generated "...1" / "...2" names.
    Node = combn(
      unique(motif_id), 2,
      FUN = function(pair) setNames(pair, c("Node1", "Node2")),
      simplify = FALSE
    ),
    # All sequences of the current gene whose motif is one of the pair.
    sequence = purrr::map(Node, ~ matched_sequence[motif_id %in% .x]),
    occurence = lengths(sequence),
    .groups = "drop"
  ) %>%
  tidyr::unnest_wider(Node) %>%
  # Collapse each list of sequences into a single ", "-separated string.
  dplyr::mutate(sequence = purrr::map_chr(sequence, toString))
-输出
# A tibble: 4 × 5
gene_name Node1 Node2 sequence occurence
<chr> <chr> <chr> <chr> <int>
1 A y1 y2 CCC, CCAAA 2
2 A y1 y3 CCC, AAG, AT 3
3 A y2 y3 CCAAA, AAG, AT 3
4 B y1 y4 AAAA, AAT 2
数据
# Example input with seven rows: the gene name, the motif id found in that
# gene, and the sequence that was matched.
data <- data.frame(
  gene_name = c("A", "A", "A", "A", "B", "B", "C"),
  motif_id = c("y1", "y2", "y3", "y3", "y1", "y4", "y5"),
  matched_sequence = c("CCC", "CCAAA", "AAG", "AT", "AAAA", "AAT", "AAGG"),
  stringsAsFactors = FALSE
)
我有一个示例数据集
gene_name | motif_id | matched_sequence |
---|---|---|
A | y1 | CCC |
A | y2 | CCAAA |
A | y3 | AAG |
A | y3 | AT |
B | y1 | AAAA |
B | y4 | AAT |
C | y5 | AAGG |
并希望在 R 中得到如下形式的数据集:
gene_name | Node1 | Node2 | sequence | occurence |
---|---|---|---|---|
A | y1 | y2 | CCC, CCAAA | 2 |
A | y1 | y3 | CCC,AAG,AT | 3 |
A | y2 | y3 | CCAAA,AAG,AT | 3 |
B | y1 | y4 | AAAA,AAT | 2 |
对于每个 gene_name,我想列出其 motif_id 的所有两两组合(Node1 与 Node2,组合之间不重复),并给出每个组合对应的序列列表及其出现次数。
我试过了:
# Asker's attempt (kept as posted): collapse the sequences of every
# gene/motif pair, then pair up the motif ids within each gene.
data%>%
group_by(gene_name, motif_id) %>%
# One row per (gene_name, motif_id); that motif's sequences are joined by ",".
summarize(matched_sequence = paste0(matched_sequence, collapse = ",")) %>%
# summarize() drops the last grouping level by default, so n() here counts the
# rows (distinct motif ids) left per gene_name; genes with fewer than 2 are removed.
mutate(count = n()) %>% filter(count>=2) %>%
# combn() builds one (Node1, Node2) named vector per motif pair, but toString()
# collapses ALL sequences of the gene into one string that is repeated for
# every pair, and no occurrence column is computed anywhere -- which matches
# the problem described in the question.
summarize(motif_id = combn(motif_id, 2, function(x) list(setNames(x, c('Node1', 'Node2')))), matched_sequence = toString(matched_sequence),
.groups = 'keep') %>%
# Spread the named pair vectors into Node1 / Node2 columns.
tidyr::unnest_wider(motif_id)
但是无法得到 sequence 和 occurence 这两列。谁能给我一个建议?
我们按 'gene_name' 进行分组,仅保留 'motif_id' 中不同元素的数量(`n_distinct`)大于 1 的组。然后对 'motif_id' 的 `unique` 元素求两两组合(`combn`),通过提取与这些 'motif_id' 值匹配的 'matched_sequence' 创建 'sequence',用 `lengths` 取该 list 各元素的长度得到 'occurence',使用 `unnest_wider` 从 list 列创建各个 Node 列,最后通过 `paste`(此处为 `toString`)拼接 list 中的元素,将 'sequence' list 列转换为 character 列。
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
# Build one row per gene and unordered pair of distinct motif ids, together
# with every matched sequence belonging to either motif of the pair and the
# number of such sequences. Reads columns gene_name, motif_id and
# matched_sequence from `data`.
data %>%
  dplyr::group_by(gene_name) %>%
  # A pair needs at least two distinct motifs; this also keeps combn() from
  # being asked for 2 elements out of fewer than 2.
  dplyr::filter(dplyr::n_distinct(motif_id) > 1) %>%
  # summarise() returns one row per pair here. NOTE(review): dplyr >= 1.1.0
  # deprecates multi-row summaries in favour of reframe(); summarise() is kept
  # so the snippet still runs on older dplyr versions.
  dplyr::summarise(
    # Name each pair up front so unnest_wider() produces Node1 / Node2
    # directly instead of relying on auto-generated "...1" / "...2" names.
    Node = combn(
      unique(motif_id), 2,
      FUN = function(pair) setNames(pair, c("Node1", "Node2")),
      simplify = FALSE
    ),
    # All sequences of the current gene whose motif is one of the pair.
    sequence = purrr::map(Node, ~ matched_sequence[motif_id %in% .x]),
    occurence = lengths(sequence),
    .groups = "drop"
  ) %>%
  tidyr::unnest_wider(Node) %>%
  # Collapse each list of sequences into a single ", "-separated string.
  dplyr::mutate(sequence = purrr::map_chr(sequence, toString))
-输出
# A tibble: 4 × 5
gene_name Node1 Node2 sequence occurence
<chr> <chr> <chr> <chr> <int>
1 A y1 y2 CCC, CCAAA 2
2 A y1 y3 CCC, AAG, AT 3
3 A y2 y3 CCAAA, AAG, AT 3
4 B y1 y4 AAAA, AAT 2
数据
# Example input with seven rows: the gene name, the motif id found in that
# gene, and the sequence that was matched.
data <- data.frame(
  gene_name = c("A", "A", "A", "A", "B", "B", "C"),
  motif_id = c("y1", "y2", "y3", "y3", "y1", "y4", "y5"),
  matched_sequence = c("CCC", "CCAAA", "AAG", "AT", "AAAA", "AAT", "AAGG"),
  stringsAsFactors = FALSE
)