在 tidyverse 中按组计算 n 个分位数
calculating n quantiles by group in tidyverse
我有一个独特的问题,我想为数据框中的每个组添加一列百分位数。这是我的数据的样子:
library(tidyverse)
set.seed(123)
df <- tibble(id = 1:100,
group = rep(letters[1:4], 25),
x = c(sample(1:100, 25, replace = T),
sample(101:200, 25, replace = T),
sample(201:300, 25, replace = T),
sample(301:400, 25, replace = T)))
> df
# A tibble: 100 x 3
id group x
<int> <chr> <int>
1 1 a 78
2 2 b 80
3 3 c 7
4 4 d 100
5 5 a 45
6 6 b 76
7 7 c 25
8 8 d 91
9 9 a 13
10 10 b 84
# ... with 90 more rows
# Function to create a table ten percentiles for a numeric vector
percentiles_table <- function(x) {
res <- round(quantile(x, probs = seq(from=.1, to=1, by=0.1)), 0)
res <- data.frame(percentile = names(res), to = res )
res <- res %>%
mutate(from = lag(to, default = 0)) %>%
select(from,to,percentile)
}
# Table of percentiles
percentiles <- df %>%
group_by(group) %>%
summarise(percentiles_table(x)) %>%
ungroup()
> percentiles
# A tibble: 40 x 4
group from to percentile
<chr> <dbl> <dbl> <chr>
1 a 0 25 10%
2 a 25 71 20%
3 a 71 106 30%
4 a 106 125 40%
5 a 125 198 50%
6 a 198 236 60%
7 a 236 278 70%
8 a 278 325 80%
9 a 325 379 90%
10 a 379 389 100%
我想为 x
的值介于 from
和 to
之间的每个组将 percentile
列添加到 df。
可能有一些方法可以直接计算 percentile
列,而无需在单独的 data.frame
中计算,然后将其附加回 df
。
install.packages("zoo")
library(zoo)
y=as.data.frame(c(0:max(percentiles$to)))
y=merge(y,unique(percentiles[,c(1)]))
y=merge(y,percentiles[,c(1,2,4)], by.x = c("group","c(0:max(percentiles$to))"), by.y = c("group","from"), all.x = TRUE)
y=na.locf(y)
df=merge(df,y, all.x = TRUE, by.x = c("group","x"), by.y = c("group","c(0:max(percentiles$to))"))
使用data.table
:
setDT(df)[
,
percentile := cut(
x,
quantile(x, seq(0, 1, 0.1)),
include.lowest = TRUE,
labels = paste0(seq(10, 100, 10), "%")
),
by = group
]
我得到了这个有效的解决方案。
percentile_ranks <- function(x) {
res <- trunc(rank(x))/length(x) * 100
res <- floor(res/10) }
df <- df %>%
group_by(group) %>%
arrange(x) %>%
mutate(percentile = percentile_ranks(x)) %>%
mutate(percentile_pct = paste0(percentile*10,"%")) %>%
ungroup() %>%
arrange(id) # original data.frame order
一个 one-liner 和我的 santoku
包裹:
library(santoku)
df |>
group_by(group) |>
mutate(
percentile = chop_quantiles(x, 0:100/100,
labels = lbl_endpoint())
)
# A tibble: 100 × 4
# Groups: group [4]
id group x percentile
<int> <chr> <int> <fct>
1 1 a 35 8%
2 2 b 97 20%
3 3 c 39 4%
4 4 d 20 8%
5 5 a 89 16%
...
我有一个独特的问题,我想为数据框中的每个组添加一列百分位数。这是我的数据的样子:
library(tidyverse)
set.seed(123)
df <- tibble(id = 1:100,
group = rep(letters[1:4], 25),
x = c(sample(1:100, 25, replace = T),
sample(101:200, 25, replace = T),
sample(201:300, 25, replace = T),
sample(301:400, 25, replace = T)))
> df
# A tibble: 100 x 3
id group x
<int> <chr> <int>
1 1 a 78
2 2 b 80
3 3 c 7
4 4 d 100
5 5 a 45
6 6 b 76
7 7 c 25
8 8 d 91
9 9 a 13
10 10 b 84
# ... with 90 more rows
# Function to create a table ten percentiles for a numeric vector
percentiles_table <- function(x) {
res <- round(quantile(x, probs = seq(from=.1, to=1, by=0.1)), 0)
res <- data.frame(percentile = names(res), to = res )
res <- res %>%
mutate(from = lag(to, default = 0)) %>%
select(from,to,percentile)
}
# Table of percentiles
percentiles <- df %>%
group_by(group) %>%
summarise(percentiles_table(x)) %>%
ungroup()
> percentiles
# A tibble: 40 x 4
group from to percentile
<chr> <dbl> <dbl> <chr>
1 a 0 25 10%
2 a 25 71 20%
3 a 71 106 30%
4 a 106 125 40%
5 a 125 198 50%
6 a 198 236 60%
7 a 236 278 70%
8 a 278 325 80%
9 a 325 379 90%
10 a 379 389 100%
我想为 x
的值介于 from
和 to
之间的每个组将 percentile
列添加到 df。
可能有一些方法可以直接计算 percentile
列,而无需在单独的 data.frame
中计算,然后将其附加回 df
。
install.packages("zoo")
library(zoo)
y=as.data.frame(c(0:max(percentiles$to)))
y=merge(y,unique(percentiles[,c(1)]))
y=merge(y,percentiles[,c(1,2,4)], by.x = c("group","c(0:max(percentiles$to))"), by.y = c("group","from"), all.x = TRUE)
y=na.locf(y)
df=merge(df,y, all.x = TRUE, by.x = c("group","x"), by.y = c("group","c(0:max(percentiles$to))"))
使用data.table
:
setDT(df)[
,
percentile := cut(
x,
quantile(x, seq(0, 1, 0.1)),
include.lowest = TRUE,
labels = paste0(seq(10, 100, 10), "%")
),
by = group
]
我得到了这个有效的解决方案。
percentile_ranks <- function(x) {
res <- trunc(rank(x))/length(x) * 100
res <- floor(res/10) }
df <- df %>%
group_by(group) %>%
arrange(x) %>%
mutate(percentile = percentile_ranks(x)) %>%
mutate(percentile_pct = paste0(percentile*10,"%")) %>%
ungroup() %>%
arrange(id) # original data.frame order
一个 one-liner 和我的 santoku
包裹:
library(santoku)
df |>
group_by(group) |>
mutate(
percentile = chop_quantiles(x, 0:100/100,
labels = lbl_endpoint())
)
# A tibble: 100 × 4
# Groups: group [4]
id group x percentile
<int> <chr> <int> <fct>
1 1 a 35 8%
2 2 b 97 20%
3 3 c 39 4%
4 4 d 20 8%
5 5 a 89 16%
...