给定一个带坐标的表,查找片段中没有注释的区域
find regions in a segment without annotation given a table with coordinates
我正在尝试弄清楚如何在蛋白质中找到未注释的区域。
我有一个可以按蛋白质分组的表,其中包含蛋白质上的一维坐标;也就是说,同一条蛋白质上可以有多个片段(结构域),而两个片段之间通常(但不总是)有一段未注释的空白区域。
如果我有这个
# Example input (dput() output): a tibble with columns acc, domain, start,
# end, evalue and len; 6 rows, one per annotated domain.
structure(list(acc = c("AAF73695.1", "NP_224303.2", "WP_010882745.1",
"ABG85315.1", "ABG85315.1", "ABG85315.1"), domain = c("Pkinase",
"Pkinase", "Pkinase", "sCache_like", "HAMP", "PAS"), start = c(4,
4, 4, 30, 178, 242), end = c(287, 287, 287, 142, 231, 341), evalue = c(3.8e-41,
3.8e-41, 3.8e-41, 0.00064, 1.4e-09, 0.0038), len = c(800, 800,
800, 800, 800, 800)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L))
# A tibble: 6 x 6
acc domain start end evalue len
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 AAF73695.1 Pkinase 4 287 3.80e-41 800
2 NP_224303.2 Pkinase 4 287 3.80e-41 800
3 WP_010882745.1 Pkinase 4 287 3.80e-41 800
4 ABG85315.1 sCache_like 30 142 6.40e- 4 800
5 ABG85315.1 HAMP 178 231 1.40e- 9 800
6 ABG85315.1 PAS 242 341 3.80e- 3 800
鉴于此,我想得到:
# A tibble: 16 x 6
acc domain start end evalue len
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 AAF73695.1 empty 1 3 0. 800
2 AAF73695.1 Pkinase 4 287 3.80e-41 800
3 AAF73695.1 empty 288 800 0. 800
4 ABG85315.1 empty 1 29 0. 1000
5 ABG85315.1 sCache_like 30 142 6.40e- 4 1000
6 ABG85315.1 empty 143 177 0. 1000
7 ABG85315.1 HAMP 178 231 1.40e- 9 1000
8 ABG85315.1 empty 232 241 0. 1000
9 ABG85315.1 PAS 242 341 3.80e- 3 1000
10 ABG85315.1 empty 342 1000 0. 1000
11 NP_224303.2 empty 1 3 0. 300
12 NP_224303.2 Pkinase 4 287 3.80e-41 300
13 NP_224303.2 empty 288 300 0. 300
14 WP_010882745.1 empty 1 3 0. 300
15 WP_010882745.1 Pkinase 4 287 3.80e-41 300
16 WP_010882745.1 empty 288 300 0. 300
我正在用 tidyverse 尝试类似下面的做法:
# Attempt: build one "empty" row after each domain and append the original
# rows. Expects `df` to have columns acc, domain, start, end, evalue, len.
df %>%
  group_by(acc) %>%
  arrange(start, end) %>%
  mutate(
    domain = "empty",
    start = end + 1,
    # NOTE: mutate() evaluates its arguments sequentially, so `start` below is
    # already `end + 1` of the same row; lead(start - 1) therefore yields the
    # END of the next domain rather than the position before its start.
    # `len[1]` (was the unbalanced `len[1)`) closes the last gap at the
    # sequence length.
    end = lead(start - 1, default = len[1]),
    evalue = 0
  ) %>%
  ungroup() %>%
  distinct() %>%
  bind_rows(df) %>%
  arrange(acc, start, end)
但是我得到了一个错误的结果,这个:
# dput() output of the result the attempt above produced (12 rows, same
# columns as the input); reported by the asker as incorrect.
structure(list(acc = c("AAF73695.1", "AAF73695.1", "ABG85315.1",
"ABG85315.1", "ABG85315.1", "ABG85315.1", "ABG85315.1", "ABG85315.1",
"NP_224303.2", "NP_224303.2", "WP_010882745.1", "WP_010882745.1"
), domain = c("Pkinase", "empty", "sCache_like", "empty", "HAMP",
"empty", "PAS", "empty", "Pkinase", "empty", "Pkinase", "empty"
), start = c(4, 288, 30, 143, 178, 232, 242, 342, 4, 288, 4,
288), end = c(287, 800, 142, 231, 231, 341, 341, 800, 287, 800,
287, 800), evalue = c(3.8e-41, 0, 0.00064, 0, 1.4e-09, 0, 0.0038,
0, 3.8e-41, 0, 3.8e-41, 0), len = c(800, 800, 800, 800, 800,
800, 800, 800, 800, 800, 800, 800)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -12L))
# A tibble: 12 x 6
acc domain start end evalue len
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 AAF73695.1 Pkinase 4 287 3.80e-41 800
2 AAF73695.1 empty 288 800 0. 800
3 ABG85315.1 sCache_like 30 142 6.40e- 4 800
4 ABG85315.1 empty 143 231 0. 800
5 ABG85315.1 HAMP 178 231 1.40e- 9 800
6 ABG85315.1 empty 232 341 0. 800
7 ABG85315.1 PAS 242 341 3.80e- 3 800
8 ABG85315.1 empty 342 800 0. 800
9 NP_224303.2 Pkinase 4 287 3.80e-41 800
10 NP_224303.2 empty 288 800 0. 800
11 WP_010882745.1 Pkinase 4 287 3.80e-41 800
12 WP_010882745.1 empty 288 800 0. 800
感谢任何帮助。
提前致谢。
我认为这可以满足您的需求。它先创建一个摘要数据框,把所有可能的起点和终点作为列表列保存,然后用 `unnest` 将它们展开,再通过 `left_join` 按这些值把原始数据填回去(匹配不上的行则为 `NA`)。
library(tidyverse)

# Build, per `acc`, the full list of consecutive segments (annotated domains
# plus the gaps between them), then join the original annotations back on.
# Expects `df` to have columns acc, domain, start, end, evalue, len.
# NOTE(review): unnest() needs the start and end lists to have equal length
# within each acc; this appears to assume that no domain starts at position 1,
# ends at `len`, or overlaps another domain -- confirm for your data.
df2 <- df %>%
  group_by(acc) %>%
  summarise(
    # all possible starts: 1, each domain start, position after each domain end
    start2 = list(sort(unique(c(1, start, end + 1)))),
    # all possible ends: position before each domain start, each domain end, len
    end = list(sort(unique(c(start - 1, end, len))))
  ) %>%
  rename(start = start2) %>% # start2 avoided overwriting `start` in summarise
  unnest(c(start, end)) %>% # expand the list columns into rows
  # explicit keys instead of an implicit natural join; gap rows get NA
  left_join(df, by = c("acc", "start", "end")) %>%
  mutate(domain = replace_na(domain, "empty")) %>% # label the gaps
  group_by(acc) %>% # summarise() dropped the grouping
  fill(evalue, len, .direction = "downup") # fill NA values within each acc

df2
# A tibble: 16 x 6
# Groups: acc [4]
acc start end domain evalue len
<chr> <dbl> <dbl> <chr> <dbl> <dbl>
1 AAF73695.1 1 3 empty 3.80e-41 800
2 AAF73695.1 4 287 Pkinase 3.80e-41 800
3 AAF73695.1 288 800 empty 3.80e-41 800
4 ABG85315.1 1 29 empty 6.40e- 4 800
5 ABG85315.1 30 142 sCache_like 6.40e- 4 800
6 ABG85315.1 143 177 empty 6.40e- 4 800
7 ABG85315.1 178 231 HAMP 1.40e- 9 800
8 ABG85315.1 232 241 empty 1.40e- 9 800
9 ABG85315.1 242 341 PAS 3.80e- 3 800
10 ABG85315.1 342 800 empty 3.80e- 3 800
11 NP_224303.2 1 3 empty 3.80e-41 800
12 NP_224303.2 4 287 Pkinase 3.80e-41 800
13 NP_224303.2 288 800 empty 3.80e-41 800
14 WP_010882745.1 1 3 empty 3.80e-41 800
15 WP_010882745.1 4 287 Pkinase 3.80e-41 800
16 WP_010882745.1 288 800 empty 3.80e-41 800
我正在尝试弄清楚如何在蛋白质中找到未注释的区域。
我有一个可以按蛋白质分组的表,其中包含蛋白质上的一维坐标;也就是说,同一条蛋白质上可以有多个片段(结构域),而两个片段之间通常(但不总是)有一段未注释的空白区域。
如果我有这个
# Example input (dput() output): a tibble with columns acc, domain, start,
# end, evalue and len; 6 rows, one per annotated domain.
structure(list(acc = c("AAF73695.1", "NP_224303.2", "WP_010882745.1",
"ABG85315.1", "ABG85315.1", "ABG85315.1"), domain = c("Pkinase",
"Pkinase", "Pkinase", "sCache_like", "HAMP", "PAS"), start = c(4,
4, 4, 30, 178, 242), end = c(287, 287, 287, 142, 231, 341), evalue = c(3.8e-41,
3.8e-41, 3.8e-41, 0.00064, 1.4e-09, 0.0038), len = c(800, 800,
800, 800, 800, 800)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L))
# A tibble: 6 x 6
acc domain start end evalue len
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 AAF73695.1 Pkinase 4 287 3.80e-41 800
2 NP_224303.2 Pkinase 4 287 3.80e-41 800
3 WP_010882745.1 Pkinase 4 287 3.80e-41 800
4 ABG85315.1 sCache_like 30 142 6.40e- 4 800
5 ABG85315.1 HAMP 178 231 1.40e- 9 800
6 ABG85315.1 PAS 242 341 3.80e- 3 800
鉴于此,我想得到:
# A tibble: 16 x 6
acc domain start end evalue len
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 AAF73695.1 empty 1 3 0. 800
2 AAF73695.1 Pkinase 4 287 3.80e-41 800
3 AAF73695.1 empty 288 800 0. 800
4 ABG85315.1 empty 1 29 0. 1000
5 ABG85315.1 sCache_like 30 142 6.40e- 4 1000
6 ABG85315.1 empty 143 177 0. 1000
7 ABG85315.1 HAMP 178 231 1.40e- 9 1000
8 ABG85315.1 empty 232 241 0. 1000
9 ABG85315.1 PAS 242 341 3.80e- 3 1000
10 ABG85315.1 empty 342 1000 0. 1000
11 NP_224303.2 empty 1 3 0. 300
12 NP_224303.2 Pkinase 4 287 3.80e-41 300
13 NP_224303.2 empty 288 300 0. 300
14 WP_010882745.1 empty 1 3 0. 300
15 WP_010882745.1 Pkinase 4 287 3.80e-41 300
16 WP_010882745.1 empty 288 300 0. 300
我正在用 tidyverse 尝试类似下面的做法:
# Attempt: build one "empty" row after each domain and append the original
# rows. Expects `df` to have columns acc, domain, start, end, evalue, len.
df %>%
  group_by(acc) %>%
  arrange(start, end) %>%
  mutate(
    domain = "empty",
    start = end + 1,
    # NOTE: mutate() evaluates its arguments sequentially, so `start` below is
    # already `end + 1` of the same row; lead(start - 1) therefore yields the
    # END of the next domain rather than the position before its start.
    # `len[1]` (was the unbalanced `len[1)`) closes the last gap at the
    # sequence length.
    end = lead(start - 1, default = len[1]),
    evalue = 0
  ) %>%
  ungroup() %>%
  distinct() %>%
  bind_rows(df) %>%
  arrange(acc, start, end)
但是我得到了一个错误的结果,这个:
# dput() output of the result the attempt above produced (12 rows, same
# columns as the input); reported by the asker as incorrect.
structure(list(acc = c("AAF73695.1", "AAF73695.1", "ABG85315.1",
"ABG85315.1", "ABG85315.1", "ABG85315.1", "ABG85315.1", "ABG85315.1",
"NP_224303.2", "NP_224303.2", "WP_010882745.1", "WP_010882745.1"
), domain = c("Pkinase", "empty", "sCache_like", "empty", "HAMP",
"empty", "PAS", "empty", "Pkinase", "empty", "Pkinase", "empty"
), start = c(4, 288, 30, 143, 178, 232, 242, 342, 4, 288, 4,
288), end = c(287, 800, 142, 231, 231, 341, 341, 800, 287, 800,
287, 800), evalue = c(3.8e-41, 0, 0.00064, 0, 1.4e-09, 0, 0.0038,
0, 3.8e-41, 0, 3.8e-41, 0), len = c(800, 800, 800, 800, 800,
800, 800, 800, 800, 800, 800, 800)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -12L))
# A tibble: 12 x 6
acc domain start end evalue len
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 AAF73695.1 Pkinase 4 287 3.80e-41 800
2 AAF73695.1 empty 288 800 0. 800
3 ABG85315.1 sCache_like 30 142 6.40e- 4 800
4 ABG85315.1 empty 143 231 0. 800
5 ABG85315.1 HAMP 178 231 1.40e- 9 800
6 ABG85315.1 empty 232 341 0. 800
7 ABG85315.1 PAS 242 341 3.80e- 3 800
8 ABG85315.1 empty 342 800 0. 800
9 NP_224303.2 Pkinase 4 287 3.80e-41 800
10 NP_224303.2 empty 288 800 0. 800
11 WP_010882745.1 Pkinase 4 287 3.80e-41 800
12 WP_010882745.1 empty 288 800 0. 800
感谢任何帮助。
提前致谢。
我认为这可以满足您的需求。它先创建一个摘要数据框,把所有可能的起点和终点作为列表列保存,然后用 `unnest` 将它们展开,再通过 `left_join` 按这些值把原始数据填回去(匹配不上的行则为 `NA`)。
library(tidyverse)

# Build, per `acc`, the full list of consecutive segments (annotated domains
# plus the gaps between them), then join the original annotations back on.
# Expects `df` to have columns acc, domain, start, end, evalue, len.
# NOTE(review): unnest() needs the start and end lists to have equal length
# within each acc; this appears to assume that no domain starts at position 1,
# ends at `len`, or overlaps another domain -- confirm for your data.
df2 <- df %>%
  group_by(acc) %>%
  summarise(
    # all possible starts: 1, each domain start, position after each domain end
    start2 = list(sort(unique(c(1, start, end + 1)))),
    # all possible ends: position before each domain start, each domain end, len
    end = list(sort(unique(c(start - 1, end, len))))
  ) %>%
  rename(start = start2) %>% # start2 avoided overwriting `start` in summarise
  unnest(c(start, end)) %>% # expand the list columns into rows
  # explicit keys instead of an implicit natural join; gap rows get NA
  left_join(df, by = c("acc", "start", "end")) %>%
  mutate(domain = replace_na(domain, "empty")) %>% # label the gaps
  group_by(acc) %>% # summarise() dropped the grouping
  fill(evalue, len, .direction = "downup") # fill NA values within each acc

df2
# A tibble: 16 x 6
# Groups: acc [4]
acc start end domain evalue len
<chr> <dbl> <dbl> <chr> <dbl> <dbl>
1 AAF73695.1 1 3 empty 3.80e-41 800
2 AAF73695.1 4 287 Pkinase 3.80e-41 800
3 AAF73695.1 288 800 empty 3.80e-41 800
4 ABG85315.1 1 29 empty 6.40e- 4 800
5 ABG85315.1 30 142 sCache_like 6.40e- 4 800
6 ABG85315.1 143 177 empty 6.40e- 4 800
7 ABG85315.1 178 231 HAMP 1.40e- 9 800
8 ABG85315.1 232 241 empty 1.40e- 9 800
9 ABG85315.1 242 341 PAS 3.80e- 3 800
10 ABG85315.1 342 800 empty 3.80e- 3 800
11 NP_224303.2 1 3 empty 3.80e-41 800
12 NP_224303.2 4 287 Pkinase 3.80e-41 800
13 NP_224303.2 288 800 empty 3.80e-41 800
14 WP_010882745.1 1 3 empty 3.80e-41 800
15 WP_010882745.1 4 287 Pkinase 3.80e-41 800
16 WP_010882745.1 288 800 empty 3.80e-41 800