计算级别内的值
Counting values within levels
我在 R 中使用 cut
生成了一组级别,例如把 0 到 1 之间的小数值分成宽度为 0.1 的区间(bin):
> frac <- cut(c(0, 1), breaks=10)
> levels(frac)
[1] "(-0.001,0.1]" "(0.1,0.2]" "(0.2,0.3]" "(0.3,0.4]" "(0.4,0.5]"
[6] "(0.5,0.6]" "(0.6,0.7]" "(0.7,0.8]" "(0.8,0.9]" "(0.9,1]"
给定一个包含 [0.0, 1.0]
之间连续值的向量 v
,我如何计算 v
中属于 levels(frac)
中每个级别的元素的频率?
我可能会自定义断点的数量和/或用于生成级别的区间,所以我想找一种使用标准 R 命令的方法,以便构建一个两列的数据框:一列是因子的级别,另一列是 v
中落在该级别的元素占元素总数的比例或百分比。
注意:以下不起作用:
> table(frac)
frac
(-0.001,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6]
1 0 0 0 0 0
(0.6,0.7] (0.7,0.8] (0.8,0.9] (0.9,1]
0 0 0 1
如果我直接在 v
上使用 cut
,那么当我对不同的向量运行 cut
时,得到的级别不会相同,因为值的范围(最小值和最大值)在不同的向量之间会有所不同;因此,即使断点数量相同,各级别的区间也不会相同。
我的目标是采用不同的向量并将它们分箱到同一组级别。希望这有助于澄清我的问题。感谢您的帮助。
修改 frac
以实际表示您想要的间隔,然后使用 table
函数:
# Bin a numeric vector into ten fixed-width intervals spanning [0, 1] and count
# how many values land in each interval.
x <- runif(100) # For example.
# Explicit break points keep the levels identical for every vector that is
# binned. include.lowest = TRUE closes the first interval on the left, so a
# value of exactly 0 is counted instead of becoming NA; that first level is
# therefore labelled "[0,0.1]" rather than "(0,0.1]".
frac <- cut(x, breaks = seq(0, 1, 0.1), include.lowest = TRUE)
table(frac)
结果:
frac
(0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8]
14 9 8 10 8 12 7 7
(0.8,0.9] (0.9,1]
16 9
使用 findInterval 而不是 cut:
library(plyr)

# Bin 100 example values into ten fixed-width intervals on [0, 1] and count the
# values per bin; each bin is labelled by its upper edge (0.1, 0.2, ..., 1).
bin_width <- 0.1
breaks <- seq(0, 1, by = bin_width)
v <- data.frame(v = runif(100, 0, 1))
# findInterval() returns the index i of the interval [breaks[i], breaks[i + 1])
# that contains each value. rightmost.closed = TRUE keeps a value of exactly 1
# in the last bin instead of assigning it to an eleventh bin labelled 1.1.
v$x <- findInterval(v$v, breaks, rightmost.closed = TRUE) * bin_width
# One row per bin label that occurs in v$x, with the number of values in it.
ddply(v, .(x), summarize, n = length(x))
# Shared bin edges: 0, 0.1, ..., 1.
frac <- seq(from = 0, to = 1, by = 0.1)
# Human-readable label for every bin: "<lower edge> - <upper edge>".
ranges <- paste(frac[-length(frac)], frac[-1], sep = " - ")
# hist() is used purely as a counter here (plot = FALSE); `v` is the vector of
# values to bin and must already exist.
freq <- hist(v, breaks = frac, include.lowest = TRUE, plot = FALSE)
# Two-column result: bin label and number of values in that bin.
data.frame(range = ranges, frequency = freq$counts)
# Count how many elements of `v` fall into each interval defined by
# consecutive break points in `frac`.
#
# Intervals are open on the left and closed on the right, except the first
# one, which also includes its lower edge so that a value equal to frac[1]
# is counted rather than silently dropped.
#
# v:    numeric vector of values to bin.
# frac: increasing numeric vector of break points.
# Returns an integer vector with one count per interval (length(frac) - 1).
count_in_bins <- function(v, frac) {
  # seq_len() yields an empty sequence when there are fewer than two breaks.
  n_bins <- max(length(frac) - 1L, 0L)
  vapply(seq_len(n_bins), function(i) {
    above_lower <- if (i == 1L) v >= frac[i] else v > frac[i]
    sum(above_lower & v <= frac[i + 1L])
  }, integer(1))
}

frac <- seq(0, 1, 0.1)
set.seed(42)
v <- rnorm(10, 0.5, 0.2)
counts <- count_in_bins(v, frac)
counts
#[1] 0 0 0 1 3 2 1 1 1 1
将极值 c(0, 1)
加入 v
,然后使用相同的 cut
调用:
library(dplyr)

# Dummy data: seven reproducible values in [0, 1], rounded to two decimals.
set.seed(1)
v <- round(runif(7), 2)

# Prepend the extremes 0 and 1 so that cut() derives its ten bins from the
# full [0, 1] range, then drop those two helper elements again.
v_frac <- cut(c(0, 1, v), breaks = 10)[-(1:2)]

# Result: each value, its bin, and the number of values sharing that bin.
binned <- data.frame(v, vFrac = v_frac)
binned %>%
  group_by(vFrac) %>%
  mutate(vFreq = n())
# Source: local data frame [7 x 3]
# Groups: vFrac [6]
#
#       v     vFrac vFreq
#   <dbl>    <fctr> <int>
# 1  0.27 (0.2,0.3]     1
# 2  0.37 (0.3,0.4]     1
# 3  0.57 (0.5,0.6]     1
# 4  0.91   (0.9,1]     2
# 5  0.20 (0.1,0.2]     1
# 6  0.90 (0.8,0.9]     1
# 7  0.94   (0.9,1]     2
我在 R 中使用 cut
生成了一组级别,例如把 0 到 1 之间的小数值分成宽度为 0.1 的区间(bin):
> frac <- cut(c(0, 1), breaks=10)
> levels(frac)
[1] "(-0.001,0.1]" "(0.1,0.2]" "(0.2,0.3]" "(0.3,0.4]" "(0.4,0.5]"
[6] "(0.5,0.6]" "(0.6,0.7]" "(0.7,0.8]" "(0.8,0.9]" "(0.9,1]"
给定一个包含 [0.0, 1.0]
之间连续值的向量 v
,我如何计算 v
中属于 levels(frac)
中每个级别的元素的频率?
我可能会自定义断点的数量和/或用于生成级别的区间,所以我想找一种使用标准 R 命令的方法,以便构建一个两列的数据框:一列是因子的级别,另一列是 v
中落在该级别的元素占元素总数的比例或百分比。
注意:以下不起作用:
> table(frac)
frac
(-0.001,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6]
1 0 0 0 0 0
(0.6,0.7] (0.7,0.8] (0.8,0.9] (0.9,1]
0 0 0 1
如果我直接在 v
上使用 cut
,那么当我对不同的向量运行 cut
时,得到的级别不会相同,因为值的范围(最小值和最大值)在不同的向量之间会有所不同;因此,即使断点数量相同,各级别的区间也不会相同。
我的目标是采用不同的向量并将它们分箱到同一组级别。希望这有助于澄清我的问题。感谢您的帮助。
修改 frac
以实际表示您想要的间隔,然后使用 table
函数:
# Bin a numeric vector into ten fixed-width intervals spanning [0, 1] and count
# how many values land in each interval.
x <- runif(100) # For example.
# Explicit break points keep the levels identical for every vector that is
# binned. include.lowest = TRUE closes the first interval on the left, so a
# value of exactly 0 is counted instead of becoming NA; that first level is
# therefore labelled "[0,0.1]" rather than "(0,0.1]".
frac <- cut(x, breaks = seq(0, 1, 0.1), include.lowest = TRUE)
table(frac)
结果:
frac
(0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8]
14 9 8 10 8 12 7 7
(0.8,0.9] (0.9,1]
16 9
使用 findInterval 而不是 cut:
library(plyr)

# Bin 100 example values into ten fixed-width intervals on [0, 1] and count the
# values per bin; each bin is labelled by its upper edge (0.1, 0.2, ..., 1).
bin_width <- 0.1
breaks <- seq(0, 1, by = bin_width)
v <- data.frame(v = runif(100, 0, 1))
# findInterval() returns the index i of the interval [breaks[i], breaks[i + 1])
# that contains each value. rightmost.closed = TRUE keeps a value of exactly 1
# in the last bin instead of assigning it to an eleventh bin labelled 1.1.
v$x <- findInterval(v$v, breaks, rightmost.closed = TRUE) * bin_width
# One row per bin label that occurs in v$x, with the number of values in it.
ddply(v, .(x), summarize, n = length(x))
# Shared bin edges: 0, 0.1, ..., 1.
frac <- seq(from = 0, to = 1, by = 0.1)
# Human-readable label for every bin: "<lower edge> - <upper edge>".
ranges <- paste(frac[-length(frac)], frac[-1], sep = " - ")
# hist() is used purely as a counter here (plot = FALSE); `v` is the vector of
# values to bin and must already exist.
freq <- hist(v, breaks = frac, include.lowest = TRUE, plot = FALSE)
# Two-column result: bin label and number of values in that bin.
data.frame(range = ranges, frequency = freq$counts)
# Count how many elements of `v` fall into each interval defined by
# consecutive break points in `frac`.
#
# Intervals are open on the left and closed on the right, except the first
# one, which also includes its lower edge so that a value equal to frac[1]
# is counted rather than silently dropped.
#
# v:    numeric vector of values to bin.
# frac: increasing numeric vector of break points.
# Returns an integer vector with one count per interval (length(frac) - 1).
count_in_bins <- function(v, frac) {
  # seq_len() yields an empty sequence when there are fewer than two breaks.
  n_bins <- max(length(frac) - 1L, 0L)
  vapply(seq_len(n_bins), function(i) {
    above_lower <- if (i == 1L) v >= frac[i] else v > frac[i]
    sum(above_lower & v <= frac[i + 1L])
  }, integer(1))
}

frac <- seq(0, 1, 0.1)
set.seed(42)
v <- rnorm(10, 0.5, 0.2)
counts <- count_in_bins(v, frac)
counts
#[1] 0 0 0 1 3 2 1 1 1 1
将极值 c(0, 1)
加入 v
,然后使用相同的 cut
调用:
library(dplyr)

# Dummy data: seven reproducible values in [0, 1], rounded to two decimals.
set.seed(1)
v <- round(runif(7), 2)

# Prepend the extremes 0 and 1 so that cut() derives its ten bins from the
# full [0, 1] range, then drop those two helper elements again.
v_frac <- cut(c(0, 1, v), breaks = 10)[-(1:2)]

# Result: each value, its bin, and the number of values sharing that bin.
binned <- data.frame(v, vFrac = v_frac)
binned %>%
  group_by(vFrac) %>%
  mutate(vFreq = n())
# Source: local data frame [7 x 3]
# Groups: vFrac [6]
#
#       v     vFrac vFreq
#   <dbl>    <fctr> <int>
# 1  0.27 (0.2,0.3]     1
# 2  0.37 (0.3,0.4]     1
# 3  0.57 (0.5,0.6]     1
# 4  0.91   (0.9,1]     2
# 5  0.20 (0.1,0.2]     1
# 6  0.90 (0.8,0.9]     1
# 7  0.94   (0.9,1]     2