如何自动修改间隔因子水平以更好地显示
how to auto modify interval factor level for better display
假设您的数据看起来像这样
df <- data.frame(income = rnorm(1000,77345,30569))
您添加一列来指示每个观测值所属的四分位数间隔因子
df$quant <- cut(df$income, quantile(df$income))
因子水平看起来像这样
Levels: (-4.48e+04,5.6e+04] (5.6e+04,7.69e+04] (7.69e+04,9.73e+04] (9.73e+04,1.64e+05]
如何以编程方式(而不是手动)更改区间标签,使它们在频率汇总表中美观地打印出来?
df %>% count(quant)
打印如下:
quant n
1 (-4.48e+04,5.6e+04] 249
2 (5.6e+04,7.69e+04] 250
3 (7.69e+04,9.73e+04] 250
4 (9.73e+04,1.64e+05] 250
我希望它看起来像这样
quant n
1 (-$44,800,$56,000] 249
2 ($56,000,$76,900] 250
3 ($76,900,$97,300] 250
4 ($97,300,$164,000] 250
这仅用于打印目的(在 Rmarkdown 报告中)。我已经毫无问题地进行了所有计算和绘图。
cut2
可以接受一个 formatfun
参数
library(Hmisc)
library(scales)
df$quant2 <- cut2(df$income,digits = 5, cuts = quantile(df$income),
formatfun = function(x) paste0("$", comma(x)), onlycuts = TRUE)
-输出
> head(df)
income quant2 quant
1 60657.97 [$55,485,$76,547) (5.55e+04,7.65e+04]
2 93747.88 [$76,547,$96,620) (7.65e+04,9.66e+04]
3 90172.46 [$76,547,$96,620) (7.65e+04,9.66e+04]
4 59504.10 [$55,485,$76,547) (5.55e+04,7.65e+04]
5 103251.01 [$96,620,$178,251] (9.66e+04,1.78e+05]
6 85477.03 [$76,547,$96,620) (7.65e+04,9.66e+04]
如果我们要修改原来的cut
列
library(dplyr) # mutate() and across() used below come from dplyr
library(tidyr)
library(stringr)
# Rebuild the existing `quant` interval labels as dollar-formatted text:
# strip the brackets, split the two bounds into numeric columns, format each
# bound as currency, then glue them back together as "(lower,upper]".
df <- df %>%
  # Backslashes must be doubled inside an R string literal; a single "\("
  # is rejected by the parser as an unrecognized escape.
  mutate(quant = str_remove_all(quant, "\\(|\\]")) %>%
  separate(quant, into = c("q1", "q2"), sep = ",", convert = TRUE) %>%
  mutate(
    across(q1:q2, ~ scales::dollar(.x)),
    quant = glue::glue("({q1},{q2}]"),
    q1 = NULL,
    q2 = NULL
  )
-输出
> head(df)
income quant
1 60657.97 ($55,500,$76,500]
2 93747.88 ($76,500,$96,600]
3 90172.46 ($76,500,$96,600]
4 59504.10 ($55,500,$76,500]
5 103251.01 ($96,600,$178,000]
6 85477.03 ($76,500,$96,600]
这是另一个解决方案:
# Build one dollar-formatted "(lower,upper]" label per quantile interval and
# hand the labels directly to cut().
q <- quantile(df$income)
# vapply() always returns a character vector, and iterating over
# length(q) - 1 intervals avoids hard-coding the number of bins.
qlbls <- vapply(
  seq_len(length(q) - 1L),
  function(i) {
    paste0("(", scales::dollar(q[i]), ",", scales::dollar(q[i + 1]), "]")
  },
  character(1)
)
# NOTE: cut() defaults to include.lowest = FALSE, so the minimum income is not
# assigned to the first interval and becomes NA.
df$quant <- cut(df$income, q, labels = qlbls)
-输出
> head(df)
income quant
1 43842.61 (,745.22,,569.49]
2 73176.84 (,569.49,,945.41]
3 85658.10 (,945.41,,013.99]
4 84613.72 (,945.41,,013.99]
5 130301.96 (,013.99,9,552]
6 61917.61 (,569.49,,945.41]
这个解决方案非常灵活,你可以把这个因子的标签设置得很美观。
chop_quantiles()
来自我的 santoku 包对此很有用:
library(santoku)
d <- rnorm(10)
# labelled by the quantiles:
chop_quantiles(d, c(0.1, 0.5, 0.9))
[1] [0%, 10%) [10%, 50%) [50%, 90%] [10%, 50%) [10%, 50%) (90%, 100%]
[7] [50%, 90%] [10%, 50%) [50%, 90%] [50%, 90%]
Levels: [0%, 10%) [10%, 50%) [50%, 90%] (90%, 100%]
# by the raw values:
chop_quantiles(d, c(0.1, 0.5, 0.9), labels = lbl_intervals(raw = TRUE))
[1] [-2.515, -1.633) [-1.633, -0.8172) [-0.8172, 0.3274] [-1.633, -0.8172)
[5] [-1.633, -0.8172) (0.3274, 0.4165] [-0.8172, 0.3274] [-1.633, -0.8172)
[9] [-0.8172, 0.3274] [-0.8172, 0.3274]
4 Levels: [-2.515, -1.633) [-1.633, -0.8172) ... (0.3274, 0.4165]
# format string passed to sprintf():
chop_quantiles(d, c(0.1, 0.5, 0.9), labels = lbl_intervals(raw = TRUE, fmt = "%.2f"))
[1] [-2.52, -1.63) [-1.63, -0.82) [-0.82, 0.33] [-1.63, -0.82) [-1.63, -0.82)
[6] (0.33, 0.42] [-0.82, 0.33] [-1.63, -0.82) [-0.82, 0.33] [-0.82, 0.33]
Levels: [-2.52, -1.63) [-1.63, -0.82) [-0.82, 0.33] (0.33, 0.42]
# different kinds of labels:
chop_quantiles(d, c(0.1, 0.5, 0.9), labels = lbl_dash(" - ", raw = TRUE))
[1] -2.515 - -1.633 -1.633 - -0.8172 -0.8172 - 0.3274 -1.633 - -0.8172
[5] -1.633 - -0.8172 0.3274 - 0.4165 -0.8172 - 0.3274 -1.633 - -0.8172
[9] -0.8172 - 0.3274 -0.8172 - 0.3274
4 Levels: -2.515 - -1.633 -1.633 - -0.8172 ... 0.3274 - 0.4165
# make your own:
chop_quantiles(d, c(0.1, 0.5, 0.9), labels = lbl_glue("{l} to {r}", raw = TRUE))
[1] -2.515 to -1.633 -1.633 to -0.8172 -0.8172 to 0.3274 -1.633 to -0.8172
[5] -1.633 to -0.8172 0.3274 to 0.4165 -0.8172 to 0.3274 -1.633 to -0.8172
[9] -0.8172 to 0.3274 -0.8172 to 0.3274
假设您的数据看起来像这样
df <- data.frame(income = rnorm(1000,77345,30569))
您添加一列来指示每个观测值所属的四分位数间隔因子
df$quant <- cut(df$income, quantile(df$income))
因子水平看起来像这样
Levels: (-4.48e+04,5.6e+04] (5.6e+04,7.69e+04] (7.69e+04,9.73e+04] (9.73e+04,1.64e+05]
如何以编程方式(而不是手动)更改区间标签,使它们在频率汇总表中美观地打印出来?
df %>% count(quant)
打印如下:
quant n
1 (-4.48e+04,5.6e+04] 249
2 (5.6e+04,7.69e+04] 250
3 (7.69e+04,9.73e+04] 250
4 (9.73e+04,1.64e+05] 250
我希望它看起来像这样
quant n
1 (-$44,800,$56,000] 249
2 ($56,000,$76,900] 250
3 ($76,900,$97,300] 250
4 ($97,300,$164,000] 250
这仅用于打印目的(在 Rmarkdown 报告中)。我已经毫无问题地进行了所有计算和绘图。
cut2
可以接受一个 formatfun
参数
library(Hmisc)
library(scales)
df$quant2 <- cut2(df$income,digits = 5, cuts = quantile(df$income),
formatfun = function(x) paste0("$", comma(x)), onlycuts = TRUE)
-输出
> head(df)
income quant2 quant
1 60657.97 [$55,485,$76,547) (5.55e+04,7.65e+04]
2 93747.88 [$76,547,$96,620) (7.65e+04,9.66e+04]
3 90172.46 [$76,547,$96,620) (7.65e+04,9.66e+04]
4 59504.10 [$55,485,$76,547) (5.55e+04,7.65e+04]
5 103251.01 [$96,620,$178,251] (9.66e+04,1.78e+05]
6 85477.03 [$76,547,$96,620) (7.65e+04,9.66e+04]
如果我们要修改原来的cut
列
library(tidyr)
library(stringr)
# Rebuild the existing `quant` interval labels as dollar-formatted text:
# strip the brackets, split the two bounds into numeric columns, format each
# bound as currency, then glue them back together as "(lower,upper]".
df <- df %>%
  # Backslashes must be doubled inside an R string literal; a single "\("
  # is rejected by the parser as an unrecognized escape.
  dplyr::mutate(quant = stringr::str_remove_all(quant, "\\(|\\]")) %>%
  tidyr::separate(quant, into = c("q1", "q2"), sep = ",", convert = TRUE) %>%
  dplyr::mutate(
    dplyr::across(q1:q2, ~ scales::dollar(.x)),
    quant = glue::glue("({q1},{q2}]"),
    q1 = NULL,
    q2 = NULL
  )
-输出
> head(df)
income quant
1 60657.97 ($55,500,$76,500]
2 93747.88 ($76,500,$96,600]
3 90172.46 ($76,500,$96,600]
4 59504.10 ($55,500,$76,500]
5 103251.01 ($96,600,$178,000]
6 85477.03 ($76,500,$96,600]
这是另一个解决方案:
# Build one dollar-formatted "(lower,upper]" label per quantile interval and
# hand the labels directly to cut().
q <- quantile(df$income)
# vapply() always returns a character vector, and iterating over
# length(q) - 1 intervals avoids hard-coding the number of bins.
qlbls <- vapply(
  seq_len(length(q) - 1L),
  function(i) {
    paste0("(", scales::dollar(q[i]), ",", scales::dollar(q[i + 1]), "]")
  },
  character(1)
)
# NOTE: cut() defaults to include.lowest = FALSE, so the minimum income is not
# assigned to the first interval and becomes NA.
df$quant <- cut(df$income, q, labels = qlbls)
-输出
> head(df)
income quant
1 43842.61 (,745.22,,569.49]
2 73176.84 (,569.49,,945.41]
3 85658.10 (,945.41,,013.99]
4 84613.72 (,945.41,,013.99]
5 130301.96 (,013.99,9,552]
6 61917.61 (,569.49,,945.41]
这个解决方案非常灵活,你可以把这个因子的标签设置得很美观。
chop_quantiles()
来自我的 santoku 包对此很有用:
library(santoku)
d <- rnorm(10)
# labelled by the quantiles:
chop_quantiles(d, c(0.1, 0.5, 0.9))
[1] [0%, 10%) [10%, 50%) [50%, 90%] [10%, 50%) [10%, 50%) (90%, 100%]
[7] [50%, 90%] [10%, 50%) [50%, 90%] [50%, 90%]
Levels: [0%, 10%) [10%, 50%) [50%, 90%] (90%, 100%]
# by the raw values:
chop_quantiles(d, c(0.1, 0.5, 0.9), labels = lbl_intervals(raw = TRUE))
[1] [-2.515, -1.633) [-1.633, -0.8172) [-0.8172, 0.3274] [-1.633, -0.8172)
[5] [-1.633, -0.8172) (0.3274, 0.4165] [-0.8172, 0.3274] [-1.633, -0.8172)
[9] [-0.8172, 0.3274] [-0.8172, 0.3274]
4 Levels: [-2.515, -1.633) [-1.633, -0.8172) ... (0.3274, 0.4165]
# format string passed to sprintf():
chop_quantiles(d, c(0.1, 0.5, 0.9), labels = lbl_intervals(raw = TRUE, fmt = "%.2f"))
[1] [-2.52, -1.63) [-1.63, -0.82) [-0.82, 0.33] [-1.63, -0.82) [-1.63, -0.82)
[6] (0.33, 0.42] [-0.82, 0.33] [-1.63, -0.82) [-0.82, 0.33] [-0.82, 0.33]
Levels: [-2.52, -1.63) [-1.63, -0.82) [-0.82, 0.33] (0.33, 0.42]
# different kinds of labels:
chop_quantiles(d, c(0.1, 0.5, 0.9), labels = lbl_dash(" - ", raw = TRUE))
[1] -2.515 - -1.633 -1.633 - -0.8172 -0.8172 - 0.3274 -1.633 - -0.8172
[5] -1.633 - -0.8172 0.3274 - 0.4165 -0.8172 - 0.3274 -1.633 - -0.8172
[9] -0.8172 - 0.3274 -0.8172 - 0.3274
4 Levels: -2.515 - -1.633 -1.633 - -0.8172 ... 0.3274 - 0.4165
# make your own:
chop_quantiles(d, c(0.1, 0.5, 0.9), labels = lbl_glue("{l} to {r}", raw = TRUE))
[1] -2.515 to -1.633 -1.633 to -0.8172 -0.8172 to 0.3274 -1.633 to -0.8172
[5] -1.633 to -0.8172 0.3274 to 0.4165 -0.8172 to 0.3274 -1.633 to -0.8172
[9] -0.8172 to 0.3274 -0.8172 to 0.3274