如何将一个数字切分到不同的 bin(区间)中,并用这些新的 bin 扩展数据框?
How to cut a number in different bin and expand the data frame with the new bins?
我想计算一个非常简单的东西,但找不到解决办法。我想把一个数值切分(cut)到若干个 bin 中,并且保留从 0 到该数值之间的所有 bin。
# Width of every bin used by the cut() calls in this example.
bin.size <- 100

# One row per group: `x` is the value to be binned, `y` is the group label.
df <- data.frame(
  x = c(300, 400),
  y = c("sca1", "sca2")
)

# Bin each x; include.lowest = TRUE closes the first interval on the left.
cut(df$x, breaks = seq(0, 400, by = bin.size), include.lowest = TRUE)
给我
[1] (200,300] (300,400]
Levels: [0,100] (100,200] (200,300] (300,400]
但我想要这样的东西:
bin y
1 (0,100] sca1
2 (100,200] sca1
3 (200,300] sca1
4 (0,100] sca2
5 (100,200] sca2
6 (200,300] sca2
7 (300,400] sca2
我之所以想这样做,是因为我想统计落入每个宽度为 100 的 bin 中的值的个数。例如:
# Example observations: each `snp` value is paired with the `sca` group it
# belongs to (four values for sca1, five for sca2).
df2 <- data.frame(
  snp = c(1, 2, 10, 100, 1, 2, 14, 16, 399),
  sca = rep(c("sca1", "sca2"), times = c(4, 5))
)
df2
snp sca
1 1 sca1
2 2 sca1
3 10 sca1
4 100 sca1
5 1 sca2
6 2 sca2
7 14 sca2
8 16 sca2
9 399 sca2
snp 可以是向量 sca1 中的位置。
最终目标是获得这样的东西:
bin y num
1 (0,100] sca1 4
2 (100,200] sca1 0
3 (200,300] sca1 0
4 (0,100] sca2 4
5 (100,200] sca2 0
6 (200,300] sca2 0
7 (300,400] sca2 1
我能做的最好的是:
# Label every snp value with the bin it falls into. include.lowest = TRUE
# closes the first interval on the left, so it is printed as [0,100].
df2[["cat"]] <- cut(
  df2[["snp"]],
  breaks = seq(0, 400, by = bin.size),
  include.lowest = TRUE
)
df2
snp sca cat
1 1 sca1 [0,100]
2 2 sca1 [0,100]
3 10 sca1 [0,100]
4 100 sca1 [0,100]
5 1 sca2 [0,100]
6 2 sca2 [0,100]
7 14 sca2 [0,100]
8 16 sca2 [0,100]
9 399 sca2 (300,400]
或者这样:
# Cross-tabulate bin (rows) against sca (columns). Every level of df2$cat gets
# a row for every sca, so bins with no observations show up with a count of 0.
table(df2$cat,df2$sca)
sca1 sca2
[0,100] 4 4
(100,200] 0 0
(200,300] 0 0
(300,400] 0 1
但最后这种做法的问题在于:类别 (300,400] 对 sca1 没有意义,因为 sca1 中不存在这个区间。它应该是 NA,或者根本不出现。该如何解决这个问题?
下面是使用 tidyverse 中几个软件包的一种做法:
library(dplyr)
library(tidyr)
library(purrr)

# Count, per group, how many snp values fall into each bin between 0 and that
# group's own upper limit (df$x), so bins beyond the limit never appear.
df %>%
  # Nest only `sca` and `snp`: any extra column on df2 (such as a previously
  # added `cat`) would otherwise split a group into several rows.
  left_join(
    nest(select(df2, sca, snp), snp = snp),
    by = c("y" = "sca")
  ) %>%
  mutate(
    # Per-group break points: 0, 100, ..., up to the group's limit x.
    cuts = map(x, ~ seq(0, .x, by = 100)),
    # One small table of (bin label, count) per group.
    tbls = map2(snp, cuts, function(xx, breaks) {
      z <- table(cut(xx$snp, breaks))
      # as.integer() drops the "table" class so the column is a plain vector.
      tibble(cut = names(z), count = as.integer(z))
    })
  ) %>%
  select(y, tbls) %>%
  unnest(tbls)
# y cut count
# 1 sca1 (0,100] 4
# 2 sca1 (100,200] 0
# 3 sca1 (200,300] 0
# 4 sca2 (0,100] 4
# 5 sca2 (100,200] 0
# 6 sca2 (200,300] 0
# 7 sca2 (300,400] 1
使用 base R:
# Count snp values per bin and per sca, then drop the bins that lie beyond each
# group's own upper limit (df$x for the matching df$y).
breaks <- seq(0, 400, 100)
counts <- data.frame(table(cat = cut(df2$snp, breaks), sca = df2$sca))
# Upper edge of each row's bin, looked up through the factor level index.
upper <- breaks[-1][as.integer(counts$cat)]
counts[upper <= df$x[match(counts$sca, df$y)], ]
# cat sca Freq
#1 (0,100] sca1 4
#2 (100,200] sca1 0
#3 (200,300] sca1 0
#5 (0,100] sca2 4
#6 (100,200] sca2 0
#7 (200,300] sca2 0
#8 (300,400] sca2 1
我想计算一个非常简单的东西,但找不到解决办法。我想把一个数值切分(cut)到若干个 bin 中,并且保留从 0 到该数值之间的所有 bin。
# Width of every bin used by the cut() calls in this example.
bin.size <- 100

# One row per group: `x` is the value to be binned, `y` is the group label.
df <- data.frame(
  x = c(300, 400),
  y = c("sca1", "sca2")
)

# Bin each x; include.lowest = TRUE closes the first interval on the left.
cut(df$x, breaks = seq(0, 400, by = bin.size), include.lowest = TRUE)
给我
[1] (200,300] (300,400]
Levels: [0,100] (100,200] (200,300] (300,400]
但我想要这样的东西:
bin y
1 (0,100] sca1
2 (100,200] sca1
3 (200,300] sca1
4 (0,100] sca2
5 (100,200] sca2
6 (200,300] sca2
7 (300,400] sca2
我之所以想这样做,是因为我想统计落入每个宽度为 100 的 bin 中的值的个数。例如:
# Example observations: each `snp` value is paired with the `sca` group it
# belongs to (four values for sca1, five for sca2).
df2 <- data.frame(
  snp = c(1, 2, 10, 100, 1, 2, 14, 16, 399),
  sca = rep(c("sca1", "sca2"), times = c(4, 5))
)
df2
snp sca
1 1 sca1
2 2 sca1
3 10 sca1
4 100 sca1
5 1 sca2
6 2 sca2
7 14 sca2
8 16 sca2
9 399 sca2
snp 可以是向量 sca1 中的位置。
最终目标是获得这样的东西:
bin y num
1 (0,100] sca1 4
2 (100,200] sca1 0
3 (200,300] sca1 0
4 (0,100] sca2 4
5 (100,200] sca2 0
6 (200,300] sca2 0
7 (300,400] sca2 1
我能做的最好的是:
# Label every snp value with the bin it falls into. include.lowest = TRUE
# closes the first interval on the left, so it is printed as [0,100].
df2[["cat"]] <- cut(
  df2[["snp"]],
  breaks = seq(0, 400, by = bin.size),
  include.lowest = TRUE
)
df2
snp sca cat
1 1 sca1 [0,100]
2 2 sca1 [0,100]
3 10 sca1 [0,100]
4 100 sca1 [0,100]
5 1 sca2 [0,100]
6 2 sca2 [0,100]
7 14 sca2 [0,100]
8 16 sca2 [0,100]
9 399 sca2 (300,400]
或者这样:
# Cross-tabulate bin (rows) against sca (columns). Every level of df2$cat gets
# a row for every sca, so bins with no observations show up with a count of 0.
table(df2$cat,df2$sca)
sca1 sca2
[0,100] 4 4
(100,200] 0 0
(200,300] 0 0
(300,400] 0 1
但最后这种做法的问题在于:类别 (300,400] 对 sca1 没有意义,因为 sca1 中不存在这个区间。它应该是 NA,或者根本不出现。该如何解决这个问题?
下面是使用 tidyverse 中几个软件包的一种做法:
library(dplyr)
library(tidyr)
library(purrr)

# Count, per group, how many snp values fall into each bin between 0 and that
# group's own upper limit (df$x), so bins beyond the limit never appear.
df %>%
  # Nest only `sca` and `snp`: any extra column on df2 (such as a previously
  # added `cat`) would otherwise split a group into several rows.
  left_join(
    nest(select(df2, sca, snp), snp = snp),
    by = c("y" = "sca")
  ) %>%
  mutate(
    # Per-group break points: 0, 100, ..., up to the group's limit x.
    cuts = map(x, ~ seq(0, .x, by = 100)),
    # One small table of (bin label, count) per group.
    tbls = map2(snp, cuts, function(xx, breaks) {
      z <- table(cut(xx$snp, breaks))
      # as.integer() drops the "table" class so the column is a plain vector.
      tibble(cut = names(z), count = as.integer(z))
    })
  ) %>%
  select(y, tbls) %>%
  unnest(tbls)
# y cut count
# 1 sca1 (0,100] 4
# 2 sca1 (100,200] 0
# 3 sca1 (200,300] 0
# 4 sca2 (0,100] 4
# 5 sca2 (100,200] 0
# 6 sca2 (200,300] 0
# 7 sca2 (300,400] 1
使用 base R:
# Count snp values per bin and per sca, then drop the bins that lie beyond each
# group's own upper limit (df$x for the matching df$y).
breaks <- seq(0, 400, 100)
counts <- data.frame(table(cat = cut(df2$snp, breaks), sca = df2$sca))
# Upper edge of each row's bin, looked up through the factor level index.
upper <- breaks[-1][as.integer(counts$cat)]
counts[upper <= df$x[match(counts$sca, df$y)], ]
# cat sca Freq
#1 (0,100] sca1 4
#2 (100,200] sca1 0
#3 (200,300] sca1 0
#5 (0,100] sca2 4
#6 (100,200] sca2 0
#7 (200,300] sca2 0
#8 (300,400] sca2 1