分组因素内的下分位数和上分位数
Lower and upper quantiles within grouping factors
我有一个数据框,其中包含两个分组变量 grp1 和 grp2 以及一个连续变量 val。我想在这两个分组变量的每个组合内去掉(trim)最低和最高的十分位数。我这里有一个有点麻烦的解决方案:
# data
# sqldf provides the sqldf() function used further down to run SQL on data frames.
library(sqldf)
# Example data, 100 rows: grp1 holds 20 "A", 30 "B" and 50 "C"; grp2 alternates
# "f"/"m"; val is standard-normal noise (no seed is set, so values differ per run).
x0 <- data.frame(
  grp1 = rep(c("A", "B", "C"), c(20, 30, 50)),
  grp2 = rep(c("f", "m"), 50),
  val = rnorm(100)
)
head(x0)
grp1 grp2 val
1 A f -0.006799051
2 A m -0.554560465
3 A f 3.254289174
4 A m 0.609900847
5 A f -0.554915077
6 A m -0.010012371
在每个分组(grp1 与 grp2 的组合)内添加一个计数器:
library(dplyr)
# Order the rows by group (grp1, grp2) and, inside each group, by ascending val.
x0 <- sqldf("SELECT * FROM x0 ORDER BY grp1,grp2, val")
# Number the rows 1..k within every grp1/grp2 combination. The grouping key is
# the pasted label, which dplyr keeps as an extra column named
# `paste(grp1, grp2)`.
x0 <- x0 %>%
  group_by(paste(grp1, grp2)) %>%
  mutate(counter = row_number())
添加每个分组内的观测数:
# Attach n, the number of rows that share the same grp1/grp2 combination, to
# every row by means of a correlated subquery.
x0 <- sqldf(
  paste(
    "SELECT a.*",
    " , (SELECT COUNT(*)",
    " FROM x0 b",
    " WHERE a.grp1 = b.grp1",
    " AND a.grp2 = b.grp2",
    " ) n",
    " FROM x0 a"
  )
)
计算百分位数:
# Relative position of each row inside its group: row counter divided by group size.
x0[["p"]] <- x0[["counter"]] / x0[["n"]]
删除第一个和最后一个十分位数:
# Keep only the rows whose relative position lies strictly between 0.1 and 0.9
# (both bounds are excluded; rows with a missing p are dropped as well).
x1 <- x0[which(x0$p > 0.1 & x0$p < 0.9), ]
# Preview the first ten retained rows, ordered by group and counter.
sqldf("SELECT * FROM x1 ORDER BY grp1,grp2,counter LIMIT 10")
grp1 grp2 val paste(grp1, grp2) counter n p
1 A f -1.20616577 A f 2 10 0.2
2 A f -0.56528613 A f 3 10 0.3
3 A f -0.38902717 A f 4 10 0.4
4 A f 0.07112568 A f 5 10 0.5
5 A f 0.21885681 A f 6 10 0.6
6 A f 0.29956892 A f 7 10 0.7
7 A f 0.58522900 A f 8 10 0.8
8 A m -1.37907878 A m 2 10 0.2
9 A m -0.53148055 A m 3 10 0.3
10 A m -0.26385376 A m 4 10 0.4
# sanity check: number of rows before trimming
nrow(x0)
[1] 100
# number of rows left after trimming
nrow(x1)
[1] 78
是否有一个函数可以在一行中完成,或者有人知道如何以更优雅的方式完成它?
也许这就是您想要的,或者至少比较接近。首先生成可重现的数据:
# Fix the RNG seed so the example data are reproducible.
set.seed(2015)
# 100 rows: grp1 holds 20 "A", 30 "B" and 50 "C"; grp2 alternates "f"/"m";
# val is drawn from a standard normal distribution.
x0 <- data.frame(
  grp1 = rep(c("A", "B", "C"), times = c(20, 30, 50)),
  grp2 = rep(c("f", "m"), times = 50),
  val = rnorm(100)
)
现在我们可以使用 dplyr 的 ntile 函数来计算十分位数,并删除第一个和最后一个十分位:
library(dplyr)
# Within each grp1/grp2 group, assign every val to one of ten rank buckets
# (column n), then keep only the rows that are in neither bucket 1 nor bucket 10.
x0 %>%
  group_by(grp1, grp2) %>%
  mutate(n = ntile(val, 10)) %>%
  filter(!is.element(n, c(1, 10)))
#Source: local data frame [80 x 4]
#Groups: grp1, grp2
#
# grp1 grp2 val n
#1 A f -1.545448388 2
#2 A m -0.528393243 3
#3 A f -1.086758791 4
#4 A m -0.000111512 4
#5 A f 0.388953783 8
#6 A m 0.532631272 6
#7 A m 0.650598453 7
#8 A f -0.624245435 5
#9 A m 0.891555027 9
#10 A f 0.501271527 9
#.. ... ... ...
或更短(不创建列 n):
# Same trimming without a helper column: the bucket number is computed inside
# filter() and rows falling in bucket 1 or bucket 10 of their group are dropped.
x0 %>%
  group_by(grp1, grp2) %>%
  filter(!is.element(ntile(val, 10), c(1, 10)))
dplyr::ntile 的描述是:
ntile: a rough rank, which breaks the input vector into n buckets.
我有一个数据框,其中包含两个分组变量 grp1 和 grp2 以及一个连续变量 val。我想在这两个分组变量的每个组合内去掉(trim)最低和最高的十分位数。我这里有一个有点麻烦的解决方案:
# data
# sqldf provides the sqldf() function used further down to run SQL on data frames.
library(sqldf)
# Example data, 100 rows: grp1 holds 20 "A", 30 "B" and 50 "C"; grp2 alternates
# "f"/"m"; val is standard-normal noise (no seed is set, so values differ per run).
x0 <- data.frame(
  grp1 = rep(c("A", "B", "C"), c(20, 30, 50)),
  grp2 = rep(c("f", "m"), 50),
  val = rnorm(100)
)
head(x0)
grp1 grp2 val
1 A f -0.006799051
2 A m -0.554560465
3 A f 3.254289174
4 A m 0.609900847
5 A f -0.554915077
6 A m -0.010012371
在每个分组(grp1 与 grp2 的组合)内添加一个计数器:
library(dplyr)
# Order the rows by group (grp1, grp2) and, inside each group, by ascending val.
x0 <- sqldf("SELECT * FROM x0 ORDER BY grp1,grp2, val")
# Number the rows 1..k within every grp1/grp2 combination. The grouping key is
# the pasted label, which dplyr keeps as an extra column named
# `paste(grp1, grp2)`.
x0 <- x0 %>%
  group_by(paste(grp1, grp2)) %>%
  mutate(counter = row_number())
添加每个分组内的观测数:
# Attach n, the number of rows that share the same grp1/grp2 combination, to
# every row by means of a correlated subquery.
x0 <- sqldf(
  paste(
    "SELECT a.*",
    " , (SELECT COUNT(*)",
    " FROM x0 b",
    " WHERE a.grp1 = b.grp1",
    " AND a.grp2 = b.grp2",
    " ) n",
    " FROM x0 a"
  )
)
计算百分位数:
# Relative position of each row inside its group: row counter divided by group size.
x0[["p"]] <- x0[["counter"]] / x0[["n"]]
删除第一个和最后一个十分位数:
# Keep only the rows whose relative position lies strictly between 0.1 and 0.9
# (both bounds are excluded; rows with a missing p are dropped as well).
x1 <- x0[which(x0$p > 0.1 & x0$p < 0.9), ]
# Preview the first ten retained rows, ordered by group and counter.
sqldf("SELECT * FROM x1 ORDER BY grp1,grp2,counter LIMIT 10")
grp1 grp2 val paste(grp1, grp2) counter n p
1 A f -1.20616577 A f 2 10 0.2
2 A f -0.56528613 A f 3 10 0.3
3 A f -0.38902717 A f 4 10 0.4
4 A f 0.07112568 A f 5 10 0.5
5 A f 0.21885681 A f 6 10 0.6
6 A f 0.29956892 A f 7 10 0.7
7 A f 0.58522900 A f 8 10 0.8
8 A m -1.37907878 A m 2 10 0.2
9 A m -0.53148055 A m 3 10 0.3
10 A m -0.26385376 A m 4 10 0.4
# sanity check: number of rows before trimming
nrow(x0)
[1] 100
# number of rows left after trimming
nrow(x1)
[1] 78
是否有一个函数可以在一行中完成,或者有人知道如何以更优雅的方式完成它?
也许这就是您想要的,或者至少比较接近。首先生成可重现的数据:
# Fix the RNG seed so the example data are reproducible.
set.seed(2015)
# 100 rows: grp1 holds 20 "A", 30 "B" and 50 "C"; grp2 alternates "f"/"m";
# val is drawn from a standard normal distribution.
x0 <- data.frame(
  grp1 = rep(c("A", "B", "C"), times = c(20, 30, 50)),
  grp2 = rep(c("f", "m"), times = 50),
  val = rnorm(100)
)
现在我们可以使用 dplyr 的 ntile 函数来计算十分位数,并删除第一个和最后一个十分位:
library(dplyr)
# Within each grp1/grp2 group, assign every val to one of ten rank buckets
# (column n), then keep only the rows that are in neither bucket 1 nor bucket 10.
x0 %>%
  group_by(grp1, grp2) %>%
  mutate(n = ntile(val, 10)) %>%
  filter(!is.element(n, c(1, 10)))
#Source: local data frame [80 x 4]
#Groups: grp1, grp2
#
# grp1 grp2 val n
#1 A f -1.545448388 2
#2 A m -0.528393243 3
#3 A f -1.086758791 4
#4 A m -0.000111512 4
#5 A f 0.388953783 8
#6 A m 0.532631272 6
#7 A m 0.650598453 7
#8 A f -0.624245435 5
#9 A m 0.891555027 9
#10 A f 0.501271527 9
#.. ... ... ...
或更短(不创建列 n):
# Same trimming without a helper column: the bucket number is computed inside
# filter() and rows falling in bucket 1 or bucket 10 of their group are dropped.
x0 %>%
  group_by(grp1, grp2) %>%
  filter(!is.element(ntile(val, 10), c(1, 10)))
dplyr::ntile 的描述是:
ntile: a rough rank, which breaks the input vector into n buckets.