根据关联的数字对字符串中唯一出现的事件求和

Sum unique occurences in a string based on associated number

我使用单序列读取分类,并希望根据分类质量进行过滤。但是,需要更改输出格式才能执行此操作。我有一个分类统计数据(分数),如下所示,代表 ["taxonomy":"kmers assigned to that taxonomy" "taxonomy":"kmers assigned to that taxonomy" etc.],每个分类法可以出现多个次。

classification_stats<-c("3:1 7:4 0:34 3:7 0:27", 
"0:110 561:19 0:37",
"0:3 562:5 0:7 543:55 0:47")

read_ID<-c("read1", "read2", "read3")

df<-data.frame(read_ID, classification_stats)

> df
  read_ID classification_stats
1   read1 3:1 7:4 0:34 3:7 0:27
2   read2 0:110 561:19 0:37
3   read3 0:3 562:5 0:7 543:55 0:47

对于每次读取(每行),我想计算分配给分类法的 kmers 总数(在 classification_stats 中),但由于每个分类法不连续出现多次,这变得更加困难。这意味着例如read1 分类法 3 有 1+7 个 kmers,分类法 7 有 4 个 kmers,分类法 0 有 34 + 27 个 kmers。

我想要的输出看起来像这样,最好排序以便 tax1 是具有最多 kmers 的分类法。

  read_ID     classification_stats        tax1 kmer1 tax2 kmer2 tax3 kmer3
  read1       3:1 7:4 0:34 3:7 0:27       0    61    3    8     7    4
  read2       0:110 561:19 0:37           0    147   561  19    NA   NA
  read3       0:3 562:5 0:7 543:55 0:47   0    57    543  55    562  5

R 或 bash 解决方案都很有趣。

library(tidyverse)

classification_stats <- c(
  "3:1 7:4 0:34 3:7 0:27",
  "0:110 561:19 0:37",
  "0:3 562:5 0:7 543:55 0:47"
)

read_ID <- c("read1", "read2", "read3")

df <- tibble(read_ID, classification_stats)

df %>%
  separate_rows(classification_stats, sep = " ") %>%
  separate(classification_stats, into = c("tax", "kmer")) %>%
  type_convert() %>%
  arrange(-kmer) %>%
  nest(tax) %>%
  mutate(id = row_number()) %>%
  unnest(data) %>%
  pivot_wider(names_from = id, values_from = c(kmer, tax))
#> Warning: All elements of `...` must be named.
#> Did you want `data = tax`?
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   read_ID = col_character(),
#>   tax = col_double(),
#>   kmer = col_double()
#> )
#> # A tibble: 3 × 27
#>   read_ID kmer_1 kmer_2 kmer_3 kmer_4 kmer_5 kmer_6 kmer_7 kmer_8 kmer_9 kmer_10
#>   <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
#> 1 read2      110     NA     NA     37     NA     NA     19     NA     NA      NA
#> 2 read3       NA     55     47     NA     NA     NA     NA     NA      7       5
#> 3 read1       NA     NA     NA     NA     34     27     NA      7     NA      NA
#> # … with 16 more variables: kmer_11 <dbl>, kmer_12 <dbl>, kmer_13 <dbl>,
#> #   tax_1 <dbl>, tax_2 <dbl>, tax_3 <dbl>, tax_4 <dbl>, tax_5 <dbl>,
#> #   tax_6 <dbl>, tax_7 <dbl>, tax_8 <dbl>, tax_9 <dbl>, tax_10 <dbl>,
#> #   tax_11 <dbl>, tax_12 <dbl>, tax_13 <dbl>

reprex package (v2.0.0)

于 2022 年 3 月 10 日创建

这里计算的是每次读取的 kmer 或分类单元出现的频率:

df %>%
  separate_rows(classification_stats, sep = " ") %>%
  separate(classification_stats, into = c("tax", "kmer")) %>%
  type_convert() %>%
  arrange(-kmer) %>%
  nest(-tax) %>%
  mutate(id = row_number()) %>%
  unnest(data) %>%
  pivot_wider(names_from = id, values_from = c(kmer, tax), values_fn = length)
#> Warning: All elements of `...` must be named.
#> Did you want `data = -tax`?
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   read_ID = col_character(),
#>   tax = col_double(),
#>   kmer = col_double()
#> )
#> # A tibble: 3 × 13
#>   read_ID kmer_1 kmer_2 kmer_3 kmer_4 kmer_5 kmer_6 tax_1 tax_2 tax_3 tax_4
#>   <chr>    <int>  <int>  <int>  <int>  <int>  <int> <int> <int> <int> <int>
#> 1 read2        2     NA      1     NA     NA     NA     2    NA     1    NA
#> 2 read3        3      1     NA     NA      1     NA     3     1    NA    NA
#> 3 read1        2     NA     NA      2     NA      1     2    NA    NA     2
#> # … with 2 more variables: tax_5 <int>, tax_6 <int>

这是 R 中的另一个解决方案。请注意,我在 read 3 中做了一些更改以测试我的代码(543:55543:80

输入

df
  read_ID      classification_stats
1   read1     3:1 7:4 0:34 3:7 0:27
2   read2         0:110 561:19 0:37
3   read3 0:3 562:5 0:7 543:80 0:47

代码和输出

  1. classification_stats 列分成单独的行,以白色 space 作为分隔符。
  2. 将结果列一分为二,用冒号“:”分隔。
  3. 然后创建一个unique_tax列,其中包含每个read_ID中的tax个数。这将用作 pivot_wider 中列名的一部分(告诉我们应该生成多少对 kmertax)。
  4. group_by(read_ID, tax) 以便一切都在该级别上运行
  5. summarise kmer 列汇总相同 read_IDtax
  6. 下的所有值
  7. 先按read_ID排列数据,再按kmer排列数据,这样我们就可以用row_number()生成正确的索引对
  8. 一个pivot_wider在这个版本的代码中就足够了
  9. left_join 合并 classification_stats
  10. 将列重新排序到您想要的位置
library(tidyverse)

left_join(
  df %>% 
    separate_rows(!read_ID, sep = " ") %>% 
    separate(classification_stats, into = c("tax", "kmer")) %>%
    group_by(read_ID, tax) %>%  
    summarize(kmer = sum(as.numeric(kmer))) %>% 
    arrange(read_ID, desc(kmer)) %>% 
    mutate(unique_tax = row_number()) %>% 
    pivot_wider(everything(), names_from = "unique_tax", values_from = c(tax, kmer)),
  df,
  by = "read_ID"
  ) %>% 
  select(read_ID, classification_stats, tax_1, kmer_1, tax_2, kmer_2, tax_3, kmer_3)

# A tibble: 3 × 8
# Groups:   read_ID [3]
  read_ID classification_stats   tax_1 kmer_1 tax_2 kmer_2 tax_3 kmer_3
  <chr>   <chr>                  <chr>  <dbl> <chr>  <dbl> <chr>  <dbl>
1 read1   3:1 7:4 0:34 3:7 0:27  0         61 3          8 7          4
2 read2   0:110 561:19 0:37      0        147 561       19 NA        NA
3 read3   0:3 562:5 0:7 543:80 … 543       80 0         57 562        5

解决方案使用data.table

library(data.table)

setDT(df) # make the tibble a data.table

dt <- df[, .(tax_kmer = unlist(str_split(classification_stats, " "))), by = read_ID]
dt[, c("tax", "kmer") := tstrsplit(tax_kmer, ":")]
dt <- dt[, .(kmer = sum(as.numeric(kmer))), by = .(read_ID, tax)]
dt[, order := frankv(kmer, ties.method = "first", order = c(-1L)), by = read_ID]
dt <- dcast(dt, read_ID ~ order, value.var = c("kmer", "tax"), sep = "")
setcolorder(dt, c("read_ID", paste0(c("tax", "kmer"), rep(1:((ncol(dt)-1)/2), each = 2))))

结果

dt
#    read_ID tax1 kmer1 tax2 kmer2 tax3 kmer3
# 1:   read1    0    61    3     8    7     4
# 2:   read2    0   147  561    19 <NA>    NA
# 3:   read3    0    57  543    55  562     5

数据

classification_stats <- c(
  "3:1 7:4 0:34 3:7 0:27",
  "0:110 561:19 0:37",
  "0:3 562:5 0:7 543:55 0:47"
)
read_ID <- c("read1", "read2", "read3")
df <- tibble(read_ID, classification_stats)

分解代码

dt <- df[, .(tax_kmer = unlist(str_split(classification_stats, " "))), by = read_ID]

我们在这里为您的输出创建一个新的 table,并确保每一行都包含您的每个 ID 组的一对“tax:kmer”。

dt[, c("tax", "kmer") := tstrsplit(tax_kmer, ":")]

在这里,我们将您的 tax_kmer 对分成两个不同的列,以保存税值和 kmer 值。

dt <- dt[, .(kmer = sum(as.numeric(kmer))), by = .(read_ID, tax)] 总结每个 tax-ID 组合的 kmer 值。

dt[, order := frankv(kmer, ties.method = "first", order = c(-1L)), by = read_ID]

确定什么税成为 tax1、tax2 等的顺序的最安全方法是使用我存储在列“顺序”中的 kmer 总和(-1 从高到低的排名) “

dt <- dcast(dt, read_ID ~ order, value.var = c("kmer", "tax"), sep = "")

dcast 是“pivot_wider”的 data.table 变体,注意我们使用 order 进行 dcast 并使用来自 kmer 和 tax 的值。

setcolorder(dt, c("read_ID", paste0(c("tax", "kmer"), rep(1:((ncol(dt)-1)/2), each = 2))))

由于我们的 dcast 程序使用 id 对您的 table 列进行排序,然后是所有 tax 列,然后是所有 kmer 列,我们根据您的喜好动态设置顺序。我们查看列的总数(我们事先不知道我们找到了多少税?)并为 ID 减去一个,根据定义我们将它除以 2,因为它是 tax/kmer 的交替对。这将创建此向量 [1] "read_ID" "tax1" "kmer1" "tax2" "kmer2" "tax3" "kmer3",这正是我们要设置的列的顺序。