计算列中每个值的经验分位数
Calculate empirical quantile for each value in a column
我想为 data.frame 的多个列中的每个值计算经验分位数。
df <- structure(list(id = 1:20, number_1 = c(32, 220, -176, 113, 4,
317, -55, 41, -97, 43, -8, -129, 125, 30, 44, 88, -42, 7, 73,
-5), number_2 = c(15, -105, 77, 41, -183, 120, -216, -46, 40,
-9, -46, 318, -92, 223, -66, 88, -45, -48, -1, -208), number_3 = c(-16,
223, 34, 46, -62, 18, -158, 68, 124, 130, 54, 89, 89, -93, 46,
-126, 21, -20, 118, 20)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L))
# head(df)
# A tibble: 6 x 4
id number_1 number_2 number_3
<int> <dbl> <dbl> <dbl>
1 1 32 15 -16
2 2 220 -105 223
3 3 -176 77 34
4 4 113 41 46
5 5 4 -183 -62
6 6 317 120 18
我在 SO 上找到了 ,但是函数 ecdf()
似乎不适用于计算 mutate()
.
中的列
我使用这种方法让它工作:
# generate a temporary df
quantiles_temp <- map(df, ~quantile(.x, probs = seq(0, 1, 0.2)))
# Use quantiles_temp list to check each value in df with a case_when
df_quantiles <- df %>%
mutate(across(number_1:number_3,
~case_when(
.x <= quantiles_temp[[cur_column()]]["20%"] ~ "0% - 20%",
.x >= quantiles_temp[[cur_column()]]["20%"] & .x <= quantiles_temp[[cur_column()]]["40%"] ~ "20% - 40%",
.x >= quantiles_temp[[cur_column()]]["40%"] & .x <= quantiles_temp[[cur_column()]]["60%"] ~ "40% - 60%",
.x >= quantiles_temp[[cur_column()]]["60%"] & .x <= quantiles_temp[[cur_column()]]["80%"] ~ "60% - 80%",
.x >= quantiles_temp[[cur_column()]]["80%"] ~ "80% - 100%"))
)
> head(df_quantiles)
# A tibble: 6 x 4
id number_1 number_2 number_3
<int> <chr> <chr> <chr>
1 1 40% - 60% 60% - 80% 20% - 40%
2 2 80% - 100% 0% - 20% 80% - 100%
3 3 0% - 20% 60% - 80% 40% - 60%
4 4 80% - 100% 60% - 80% 40% - 60%
5 5 20% - 40% 0% - 20% 0% - 20%
6 6 80% - 100% 80% - 100% 20% - 40%
有没有更好的方法?
我的方法要求我生成一个临时列表 quantiles_temp
以查找每列中每个值的经验分位数。
当我试图在可读性和性能方面优化我的代码时,我想知道是否有更好的方法来计算每列的分位数?
试试这个:
library(dplyr)
df %>%
mutate(
across(number_1:number_3,
~ cut(., quantile(., seq(0, 1, by = 0.2)),
labels = c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%"),
include.lowest = TRUE))
)
# # A tibble: 20 x 4
# id number_1 number_2 number_3
# <int> <fct> <fct> <fct>
# 1 1 40-60% 60-80% 20-40%
# 2 2 80-100% 0-20% 80-100%
# 3 3 0-20% 60-80% 40-60%
# 4 4 80-100% 60-80% 40-60%
# 5 5 20-40% 0-20% 0-20%
# 6 6 80-100% 80-100% 20-40%
# 7 7 0-20% 0-20% 0-20%
# 8 8 40-60% 20-40% 60-80%
# 9 9 0-20% 60-80% 80-100%
# 10 10 60-80% 40-60% 80-100%
# 11 11 20-40% 20-40% 60-80%
# 12 12 0-20% 80-100% 60-80%
# 13 13 80-100% 20-40% 60-80%
# 14 14 40-60% 80-100% 0-20%
# 15 15 60-80% 20-40% 40-60%
# 16 16 60-80% 80-100% 0-20%
# 17 17 20-40% 40-60% 40-60%
# 18 18 40-60% 20-40% 20-40%
# 19 19 60-80% 40-60% 80-100%
# 20 20 20-40% 0-20% 20-40%
这会产生 factor
s;如果您选择,如果您更喜欢字符串,请将 cut(.)
包装在 as.character
中(尽管我建议因子的序数性质为 sorting/ranking 之类的东西提供了良好的基础)。
我想为 data.frame 的多个列中的每个值计算经验分位数。
df <- structure(list(id = 1:20, number_1 = c(32, 220, -176, 113, 4,
317, -55, 41, -97, 43, -8, -129, 125, 30, 44, 88, -42, 7, 73,
-5), number_2 = c(15, -105, 77, 41, -183, 120, -216, -46, 40,
-9, -46, 318, -92, 223, -66, 88, -45, -48, -1, -208), number_3 = c(-16,
223, 34, 46, -62, 18, -158, 68, 124, 130, 54, 89, 89, -93, 46,
-126, 21, -20, 118, 20)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L))
# head(df)
# A tibble: 6 x 4
id number_1 number_2 number_3
<int> <dbl> <dbl> <dbl>
1 1 32 15 -16
2 2 220 -105 223
3 3 -176 77 34
4 4 113 41 46
5 5 4 -183 -62
6 6 317 120 18
我在 SO 上找到了 ecdf()
似乎不适用于计算 mutate()
.
我使用这种方法让它工作:
# generate a temporary df
quantiles_temp <- map(df, ~quantile(.x, probs = seq(0, 1, 0.2)))
# Use quantiles_temp list to check each value in df with a case_when
df_quantiles <- df %>%
mutate(across(number_1:number_3,
~case_when(
.x <= quantiles_temp[[cur_column()]]["20%"] ~ "0% - 20%",
.x >= quantiles_temp[[cur_column()]]["20%"] & .x <= quantiles_temp[[cur_column()]]["40%"] ~ "20% - 40%",
.x >= quantiles_temp[[cur_column()]]["40%"] & .x <= quantiles_temp[[cur_column()]]["60%"] ~ "40% - 60%",
.x >= quantiles_temp[[cur_column()]]["60%"] & .x <= quantiles_temp[[cur_column()]]["80%"] ~ "60% - 80%",
.x >= quantiles_temp[[cur_column()]]["80%"] ~ "80% - 100%"))
)
> head(df_quantiles)
# A tibble: 6 x 4
id number_1 number_2 number_3
<int> <chr> <chr> <chr>
1 1 40% - 60% 60% - 80% 20% - 40%
2 2 80% - 100% 0% - 20% 80% - 100%
3 3 0% - 20% 60% - 80% 40% - 60%
4 4 80% - 100% 60% - 80% 40% - 60%
5 5 20% - 40% 0% - 20% 0% - 20%
6 6 80% - 100% 80% - 100% 20% - 40%
有没有更好的方法?
我的方法要求我生成一个临时列表 quantiles_temp
以查找每列中每个值的经验分位数。
当我试图在可读性和性能方面优化我的代码时,我想知道是否有更好的方法来计算每列的分位数?
试试这个:
library(dplyr)
df %>%
mutate(
across(number_1:number_3,
~ cut(., quantile(., seq(0, 1, by = 0.2)),
labels = c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%"),
include.lowest = TRUE))
)
# # A tibble: 20 x 4
# id number_1 number_2 number_3
# <int> <fct> <fct> <fct>
# 1 1 40-60% 60-80% 20-40%
# 2 2 80-100% 0-20% 80-100%
# 3 3 0-20% 60-80% 40-60%
# 4 4 80-100% 60-80% 40-60%
# 5 5 20-40% 0-20% 0-20%
# 6 6 80-100% 80-100% 20-40%
# 7 7 0-20% 0-20% 0-20%
# 8 8 40-60% 20-40% 60-80%
# 9 9 0-20% 60-80% 80-100%
# 10 10 60-80% 40-60% 80-100%
# 11 11 20-40% 20-40% 60-80%
# 12 12 0-20% 80-100% 60-80%
# 13 13 80-100% 20-40% 60-80%
# 14 14 40-60% 80-100% 0-20%
# 15 15 60-80% 20-40% 40-60%
# 16 16 60-80% 80-100% 0-20%
# 17 17 20-40% 40-60% 40-60%
# 18 18 40-60% 20-40% 20-40%
# 19 19 60-80% 40-60% 80-100%
# 20 20 20-40% 0-20% 20-40%
这会产生 factor
s;如果您选择,如果您更喜欢字符串,请将 cut(.)
包装在 as.character
中(尽管我建议因子的序数性质为 sorting/ranking 之类的东西提供了良好的基础)。