如何将一个数值按区间(bin)切分,并用得到的各个 bin 扩展数据框?

How to cut a number in different bin and expand the data frame with the new bins?

我想计算一些非常简单的东西,但找不到解决方案。我想把一个数值切分成若干个区间(bin),并且希望把这些 bin 保存下来。

# Width of every bin.
bin.size <- 100

# One row per scaffold: x is the value to be cut, y the scaffold label.
df <- data.frame(
  x = c(300, 400),
  y = c("sca1", "sca2")
)

# Assign each x to one of the bin.size-wide intervals between 0 and 400.
cut(
  df$x,
  breaks = seq(from = 0, to = 400, by = bin.size),
  include.lowest = TRUE
)

给我

[1] (200,300] (300,400]
Levels: [0,100] (100,200] (200,300] (300,400]

但我想要这样的东西:

        bin    y
1   (0,100] sca1
2 (100,200] sca1
3 (200,300] sca1
4   (0,100] sca2
5 (100,200] sca2
6 (200,300] sca2
7 (300,400] sca2

我想这样做,是因为我要统计落入每个宽度为 100 的 bin 中的值的个数。例如:

# SNP positions (snp) together with the scaffold (sca) each one belongs to:
# the first four positions are on sca1, the remaining five on sca2.
df2 <- data.frame(
  snp = c(1, 2, 10, 100, 1, 2, 14, 16, 399),
  sca = rep(c("sca1", "sca2"), times = c(4, 5))
)
df2
  snp  sca
1   1 sca1
2   2 sca1
3  10 sca1
4 100 sca1
5   1 sca2
6   2 sca2
7  14 sca2
8  16 sca2
9 399 sca2

snp 可以是向量 sca1 中的位置。

最终目标是获得这样的东西:

        bin    y num
1   (0,100] sca1   4
2 (100,200] sca1   0
3 (200,300] sca1   0
4   (0,100] sca2   4
5 (100,200] sca2   0
6 (200,300] sca2   0
7 (300,400] sca2   1

我能做的最好的是:

# Label every SNP position with the bin it falls into. With
# include.lowest = TRUE the first interval is closed on the left,
# which is why it is labelled [0,100].
df2$cat <- cut(
  df2$snp,
  breaks = seq(from = 0, to = 400, by = bin.size),
  include.lowest = TRUE
)
df2
  snp  sca       cat
1   1 sca1   [0,100]
2   2 sca1   [0,100]
3  10 sca1   [0,100]
4 100 sca1   [0,100]
5   1 sca2   [0,100]
6   2 sca2   [0,100]
7  14 sca2   [0,100]
8  16 sca2   [0,100]
9 399 sca2 (300,400]

或者这样:

# Cross-tabulate bin (rows) against scaffold (columns).
table(df2[["cat"]], df2[["sca"]])
            sca1 sca2
  [0,100]      4    4
  (100,200]    0    0
  (200,300]    0    0
  (300,400]    0    1

但是最后一次尝试的问题是类别 (300,400]sca1 没有意义,因为它不存在。它应该是 NA 或不出现。如何解决这个问题?

这是使用 tidyverse 中的几个软件包的一种方法:

library(dplyr)
library(tidyr)
library(purrr)

# For every scaffold in df, count the SNPs of df2 falling into each 100-wide
# bin between 0 and that scaffold's upper limit (df$x), so that only bins
# which exist for the scaffold are reported.
df %>%
  # Attach each scaffold's SNPs as a nested one-column data frame.
  left_join(nest(df2, snp = snp), by = c("y" = "sca")) %>%
  mutate(
    # Per-scaffold break points: 0, 100, ..., x.
    cuts = map(x, function(upper) seq(0, upper, by = 100)),
    # Per-scaffold table of counts per bin, as a plain two-column tibble.
    tbls = map2(snp, cuts, function(xx, breaks) {
      z <- table(cut(xx$snp, breaks))
      # as.integer() drops the table class so `count` is an ordinary column.
      tibble(cut = names(z), count = as.integer(z))
    })
  ) %>%
  select(y, tbls) %>%
  # Name the list-column explicitly; unnest() without it is deprecated.
  unnest(tbls)
#      y       cut count
# 1 sca1   (0,100]     4
# 2 sca1 (100,200]     0
# 3 sca1 (200,300]     0
# 4 sca2   (0,100]     4
# 5 sca2 (100,200]     0
# 6 sca2 (200,300]     0
# 7 sca2 (300,400]     1

在基础 R(base R)中:

# Count SNPs per 100-wide bin separately for each scaffold, using breaks that
# stop at the scaffold's own upper limit (df$x). A single cross-tabulation of
# bin by scaffold would also report bins that do not exist for a scaffold
# (e.g. (300,400] for sca1), so the tables are built one scaffold at a time
# and then stacked.
do.call(rbind, lapply(seq_len(nrow(df)), function(i) {
  scaffold <- as.character(df$y[i])
  breaks <- seq(0, df$x[i], by = 100)
  counts <- table(cut(df2$snp[df2$sca == scaffold], breaks))
  data.frame(cat = names(counts), sca = scaffold, Freq = as.vector(counts))
}))

#        cat  sca Freq
# 1   (0,100] sca1    4
# 2 (100,200] sca1    0
# 3 (200,300] sca1    0
# 4   (0,100] sca2    4
# 5 (100,200] sca2    0
# 6 (200,300] sca2    0
# 7 (300,400] sca2    1