使用 forcats 进行因子排序
Factor ordering with forcats
我有数据要分箱并转换为一个因子。不过,我在理解我的因子变量发生了什么时遇到了一些麻烦。我正在尝试根据连续变量对因子变量进行排序。
我已经仔细阅读了它,但我看到的所有示例仅包含每个因子级别的一个实例,而我的示例包含某些因子级别的多个实例。
示例数据如下:
# Example data: 12 observations, 7 in "Grp1" (Ind A-G) and 5 in "Grp2"
# (Ind A-E), stored as a tibble-classed data frame.
df <- structure(
  list(
    Group = rep(c("Grp1", "Grp2"), times = c(7L, 5L)),
    Ind = c(LETTERS[1:7], LETTERS[1:5]),
    Value = c(
      0.155903329567489, 0.0582906870761889, 0.180600101489814,
      0.26357423622443, 0.0637832368895064, 0.213803701918138,
      0.0640447068344333, 0.333501508730367, 0.160676738803951,
      0.279178514111584, 0.145767023637501, 0.0808762147165962
    )
  ),
  row.names = c(NA, -12L),
  class = c("tbl_df", "tbl", "data.frame")
)
根据这些数据,我创建了一个因子并检查了每个元素的顺序。
library(dplyr)
library(forcats)
library(ggplot2) # provides cut_interval(); dplyr and forcats do not export it

# Bin Value within each Group and store labels(Bin) as Order.
df %>%
  group_by(Group) %>%
  # `.` is the whole tibble piped into mutate(), so nrow(.) is 12 for both
  # groups: each group's Value range is cut into 12 equal-width intervals.
  mutate(Bin = cut_interval(Value, n = nrow(.))) %>%
  # For a vector without names, labels() gives as.character(seq_along(x)):
  # the row position inside the group, not the position of the factor level.
  mutate(Order = labels(Bin)) %>%
  ungroup()
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <chr>
1 Grp1 A 0.156 (0.144,0.161] 1
2 Grp1 B 0.0583 [0.0583,0.0754] 2
3 Grp1 C 0.181 (0.178,0.195] 3
4 Grp1 D 0.264 (0.246,0.264] 4
5 Grp1 E 0.0638 [0.0583,0.0754] 5
6 Grp1 F 0.214 (0.212,0.229] 6
7 Grp1 G 0.0640 [0.0583,0.0754] 7
8 Grp2 A 0.334 (0.312,0.334] 1
9 Grp2 B 0.161 (0.144,0.165] 2
10 Grp2 C 0.279 (0.27,0.291] 3
11 Grp2 D 0.146 (0.144,0.165] 4
12 Grp2 E 0.0809 [0.0809,0.102] 5
然后我尝试在创建因子之后,根据 `Value` 列对因子重新排序,但顺序似乎没有改变。
# Same binning as before, followed by fct_reorder() to reorder the levels of
# Bin by Value. Only the level order is affected; the rows are not moved.
df %>%
group_by(Group) %>%
mutate(Bin = cut_interval(Value, n = nrow(.)),
Bin = fct_reorder(Bin, Value)) %>%
# labels(Bin) yields the row positions within each group, so Order does not
# reflect the new level order.
mutate(Order = labels(Bin)) %>%
ungroup()
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <chr>
1 Grp1 A 0.156 (0.144,0.161] 1
2 Grp1 B 0.0583 [0.0583,0.0754] 2
3 Grp1 C 0.181 (0.178,0.195] 3
4 Grp1 D 0.264 (0.246,0.264] 4
5 Grp1 E 0.0638 [0.0583,0.0754] 5
6 Grp1 F 0.214 (0.212,0.229] 6
7 Grp1 G 0.0640 [0.0583,0.0754] 7
8 Grp2 A 0.334 (0.312,0.334] 1
9 Grp2 B 0.161 (0.144,0.165] 2
10 Grp2 C 0.279 (0.27,0.291] 3
11 Grp2 D 0.146 (0.144,0.165] 4
12 Grp2 E 0.0809 [0.0809,0.102] 5
然后我在创建因子之前先按 `Value` 列对数据排序,得到了正确的顺序。
# Sort the rows by Group and then Value before binning.
df %>%
arrange(Group, Value) %>%
group_by(Group) %>%
mutate(Bin = cut_interval(Value, n = nrow(.))) %>%
# Order is again the row position within the group; because the rows are
# already sorted by Value, it now increases with Value.
mutate(Order = labels(Bin)) %>%
ungroup()
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <chr>
1 Grp1 B 0.0583 [0.0583,0.0754] 1
2 Grp1 E 0.0638 [0.0583,0.0754] 2
3 Grp1 G 0.0640 [0.0583,0.0754] 3
4 Grp1 A 0.156 (0.144,0.161] 4
5 Grp1 C 0.181 (0.178,0.195] 5
6 Grp1 F 0.214 (0.212,0.229] 6
7 Grp1 D 0.264 (0.246,0.264] 7
8 Grp2 E 0.0809 [0.0809,0.102] 1
9 Grp2 D 0.146 (0.144,0.165] 2
10 Grp2 B 0.161 (0.144,0.165] 3
11 Grp2 C 0.279 (0.27,0.291] 4
12 Grp2 A 0.334 (0.312,0.334] 5
那么首先,为什么 `fct_reorder` 没有按照我的意愿去做?其次,为什么“Grp1”中有 7 个值而“Grp2”中有 5 个值?由于每组中存在重复的 `Bin` 值,不应该分别只有 5 个和 4 个吗?
被重新排序的是因子的 `levels`(级别),而不是数据行的顺序。根据 `?fct_reorder`:
.x, .y - The levels of f are reordered so that the values of .fun(.x) (for fct_reorder()) and fun(.x, .y) (for fct_reorder2()) are in ascending order.
在按 `Bin` 进行 `arrange` 之后,先删除未使用的级别(`droplevels`),再转换为 `integer`,即可创建 `Order` 列:
library(dplyr)
library(forcats)
library(ggplot2) # provides cut_interval(); dplyr and forcats do not export it

# Reorder the levels of Bin by Value, sort the rows by level position, then
# number the bins that actually occur in each group.
out <- df %>%
  group_by(Group) %>%
  mutate(Bin = cut_interval(Value, n = nrow(.)),
         Bin = fct_reorder(Bin, Value)) %>%
  # Sort rows by the integer code of their level, i.e. by level order.
  arrange(as.integer(Bin)) %>%
  # droplevels() discards the empty bins, so the remaining codes count the
  # occupied bins 1, 2, ... and rows sharing a bin share an Order.
  mutate(Order = as.integer(droplevels(Bin))) %>%
  ungroup()
out
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <int>
1 Grp1 B 0.0583 [0.0583,0.0754] 1
2 Grp1 E 0.0638 [0.0583,0.0754] 1
3 Grp1 G 0.0640 [0.0583,0.0754] 1
4 Grp1 A 0.156 (0.144,0.161] 2
5 Grp1 C 0.181 (0.178,0.195] 3
6 Grp1 F 0.214 (0.212,0.229] 4
7 Grp1 D 0.264 (0.246,0.264] 5
8 Grp2 E 0.0809 [0.0809,0.102] 1
9 Grp2 B 0.161 (0.144,0.165] 2
10 Grp2 D 0.146 (0.144,0.165] 2
11 Grp2 C 0.279 (0.27,0.291] 3
12 Grp2 A 0.334 (0.312,0.334] 4
或者使用 `match` 和 `unique`:
# Alternative: number the bins by order of first appearance after sorting.
df %>%
group_by(Group) %>%
mutate(Bin = cut_interval(Value, n = nrow(.)),
Bin = fct_reorder(Bin, Value)) %>%
# match(Bin, unique(Bin)) is the index of each row's bin among the distinct
# bins of its group, so rows in the same bin get the same Order.
arrange(as.integer(Bin)) %>% mutate(Order = match(Bin, unique(Bin))) %>%
ungroup
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <int>
1 Grp1 B 0.0583 [0.0583,0.0754] 1
2 Grp1 E 0.0638 [0.0583,0.0754] 1
3 Grp1 G 0.0640 [0.0583,0.0754] 1
4 Grp1 A 0.156 (0.144,0.161] 2
5 Grp1 C 0.181 (0.178,0.195] 3
6 Grp1 F 0.214 (0.212,0.229] 4
7 Grp1 D 0.264 (0.246,0.264] 5
8 Grp2 E 0.0809 [0.0809,0.102] 1
9 Grp2 B 0.161 (0.144,0.165] 2
10 Grp2 D 0.146 (0.144,0.165] 2
11 Grp2 C 0.279 (0.27,0.291] 3
12 Grp2 A 0.334 (0.312,0.334] 4
至于 `fct_reorder` 看起来没有起任何作用,可以检查这一步前后的 `levels`:
> tmp <- df %>%
group_by(Group) %>%
mutate(Bin = cut_interval(Value, n = nrow(.)))
> tmp %>% pull(Bin) %>% levels
[1] "[0.0583,0.0754]" "(0.0754,0.0925]" "(0.0925,0.11]" "(0.11,0.127]" "(0.127,0.144]" "(0.144,0.161]" "(0.161,0.178]" "(0.178,0.195]" "(0.195,0.212]"
[10] "(0.212,0.229]" "(0.229,0.246]" "(0.246,0.264]" "[0.0809,0.102]" "(0.102,0.123]" "(0.123,0.144]" "(0.144,0.165]" "(0.165,0.186]" "(0.186,0.207]"
[19] "(0.207,0.228]" "(0.228,0.249]" "(0.249,0.27]" "(0.27,0.291]" "(0.291,0.312]" "(0.312,0.334]"
> tmp %>% mutate(Bin = fct_reorder(Bin, Value)) %>% pull(Bin) %>% levels
[1] "[0.0583,0.0754]" "(0.144,0.161]" "(0.178,0.195]" "(0.212,0.229]" "(0.246,0.264]" "(0.0754,0.0925]" "(0.0925,0.11]" "(0.11,0.127]" "(0.127,0.144]"
[10] "(0.161,0.178]" "(0.195,0.212]" "(0.229,0.246]" "[0.0809,0.102]" "(0.102,0.123]" "(0.123,0.144]" "(0.144,0.165]" "(0.165,0.186]" "(0.186,0.207]"
[19] "(0.207,0.228]" "(0.228,0.249]" "(0.249,0.27]" "(0.27,0.291]" "(0.291,0.312]" "(0.312,0.334]"
我有数据要分箱并转换为一个因子。不过,我在理解我的因子变量发生了什么时遇到了一些麻烦。我正在尝试根据连续变量对因子变量进行排序。
我已经仔细阅读了它,但我看到的所有示例仅包含每个因子级别的一个实例,而我的示例包含某些因子级别的多个实例。
示例数据如下:
# Example data: 12 observations, 7 in "Grp1" (Ind A-G) and 5 in "Grp2"
# (Ind A-E), stored as a tibble-classed data frame.
df <- structure(
  list(
    Group = rep(c("Grp1", "Grp2"), times = c(7L, 5L)),
    Ind = c(LETTERS[1:7], LETTERS[1:5]),
    Value = c(
      0.155903329567489, 0.0582906870761889, 0.180600101489814,
      0.26357423622443, 0.0637832368895064, 0.213803701918138,
      0.0640447068344333, 0.333501508730367, 0.160676738803951,
      0.279178514111584, 0.145767023637501, 0.0808762147165962
    )
  ),
  row.names = c(NA, -12L),
  class = c("tbl_df", "tbl", "data.frame")
)
根据这些数据,我创建了一个因子并检查了每个元素的顺序。
library(dplyr)
library(forcats)
library(ggplot2) # provides cut_interval(); dplyr and forcats do not export it

# Bin Value within each Group and store labels(Bin) as Order.
df %>%
  group_by(Group) %>%
  # `.` is the whole tibble piped into mutate(), so nrow(.) is 12 for both
  # groups: each group's Value range is cut into 12 equal-width intervals.
  mutate(Bin = cut_interval(Value, n = nrow(.))) %>%
  # For a vector without names, labels() gives as.character(seq_along(x)):
  # the row position inside the group, not the position of the factor level.
  mutate(Order = labels(Bin)) %>%
  ungroup()
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <chr>
1 Grp1 A 0.156 (0.144,0.161] 1
2 Grp1 B 0.0583 [0.0583,0.0754] 2
3 Grp1 C 0.181 (0.178,0.195] 3
4 Grp1 D 0.264 (0.246,0.264] 4
5 Grp1 E 0.0638 [0.0583,0.0754] 5
6 Grp1 F 0.214 (0.212,0.229] 6
7 Grp1 G 0.0640 [0.0583,0.0754] 7
8 Grp2 A 0.334 (0.312,0.334] 1
9 Grp2 B 0.161 (0.144,0.165] 2
10 Grp2 C 0.279 (0.27,0.291] 3
11 Grp2 D 0.146 (0.144,0.165] 4
12 Grp2 E 0.0809 [0.0809,0.102] 5
然后我尝试在创建因子之后,根据 `Value` 列对因子重新排序,但顺序似乎没有改变。
# Same binning as before, followed by fct_reorder() to reorder the levels of
# Bin by Value. Only the level order is affected; the rows are not moved.
df %>%
group_by(Group) %>%
mutate(Bin = cut_interval(Value, n = nrow(.)),
Bin = fct_reorder(Bin, Value)) %>%
# labels(Bin) yields the row positions within each group, so Order does not
# reflect the new level order.
mutate(Order = labels(Bin)) %>%
ungroup()
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <chr>
1 Grp1 A 0.156 (0.144,0.161] 1
2 Grp1 B 0.0583 [0.0583,0.0754] 2
3 Grp1 C 0.181 (0.178,0.195] 3
4 Grp1 D 0.264 (0.246,0.264] 4
5 Grp1 E 0.0638 [0.0583,0.0754] 5
6 Grp1 F 0.214 (0.212,0.229] 6
7 Grp1 G 0.0640 [0.0583,0.0754] 7
8 Grp2 A 0.334 (0.312,0.334] 1
9 Grp2 B 0.161 (0.144,0.165] 2
10 Grp2 C 0.279 (0.27,0.291] 3
11 Grp2 D 0.146 (0.144,0.165] 4
12 Grp2 E 0.0809 [0.0809,0.102] 5
然后我在创建因子之前先按 `Value` 列对数据排序,得到了正确的顺序。
# Sort the rows by Group and then Value before binning.
df %>%
arrange(Group, Value) %>%
group_by(Group) %>%
mutate(Bin = cut_interval(Value, n = nrow(.))) %>%
# Order is again the row position within the group; because the rows are
# already sorted by Value, it now increases with Value.
mutate(Order = labels(Bin)) %>%
ungroup()
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <chr>
1 Grp1 B 0.0583 [0.0583,0.0754] 1
2 Grp1 E 0.0638 [0.0583,0.0754] 2
3 Grp1 G 0.0640 [0.0583,0.0754] 3
4 Grp1 A 0.156 (0.144,0.161] 4
5 Grp1 C 0.181 (0.178,0.195] 5
6 Grp1 F 0.214 (0.212,0.229] 6
7 Grp1 D 0.264 (0.246,0.264] 7
8 Grp2 E 0.0809 [0.0809,0.102] 1
9 Grp2 D 0.146 (0.144,0.165] 2
10 Grp2 B 0.161 (0.144,0.165] 3
11 Grp2 C 0.279 (0.27,0.291] 4
12 Grp2 A 0.334 (0.312,0.334] 5
那么首先,为什么 `fct_reorder` 没有按照我的意愿去做?其次,为什么“Grp1”中有 7 个值而“Grp2”中有 5 个值?由于每组中存在重复的 `Bin` 值,不应该分别只有 5 个和 4 个吗?
被重新排序的是因子的 `levels`(级别),而不是数据行的顺序。根据 `?fct_reorder`:
.x, .y - The levels of f are reordered so that the values of .fun(.x) (for fct_reorder()) and fun(.x, .y) (for fct_reorder2()) are in ascending order.
在按 `Bin` 进行 `arrange` 之后,先删除未使用的级别(`droplevels`),再转换为 `integer`,即可创建 `Order` 列:
library(dplyr)
library(forcats)
library(ggplot2) # provides cut_interval(); dplyr and forcats do not export it

# Reorder the levels of Bin by Value, sort the rows by level position, then
# number the bins that actually occur in each group.
out <- df %>%
  group_by(Group) %>%
  mutate(Bin = cut_interval(Value, n = nrow(.)),
         Bin = fct_reorder(Bin, Value)) %>%
  # Sort rows by the integer code of their level, i.e. by level order.
  arrange(as.integer(Bin)) %>%
  # droplevels() discards the empty bins, so the remaining codes count the
  # occupied bins 1, 2, ... and rows sharing a bin share an Order.
  mutate(Order = as.integer(droplevels(Bin))) %>%
  ungroup()
out
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <int>
1 Grp1 B 0.0583 [0.0583,0.0754] 1
2 Grp1 E 0.0638 [0.0583,0.0754] 1
3 Grp1 G 0.0640 [0.0583,0.0754] 1
4 Grp1 A 0.156 (0.144,0.161] 2
5 Grp1 C 0.181 (0.178,0.195] 3
6 Grp1 F 0.214 (0.212,0.229] 4
7 Grp1 D 0.264 (0.246,0.264] 5
8 Grp2 E 0.0809 [0.0809,0.102] 1
9 Grp2 B 0.161 (0.144,0.165] 2
10 Grp2 D 0.146 (0.144,0.165] 2
11 Grp2 C 0.279 (0.27,0.291] 3
12 Grp2 A 0.334 (0.312,0.334] 4
或者使用 `match` 和 `unique`:
# Alternative: number the bins by order of first appearance after sorting.
df %>%
group_by(Group) %>%
mutate(Bin = cut_interval(Value, n = nrow(.)),
Bin = fct_reorder(Bin, Value)) %>%
# match(Bin, unique(Bin)) is the index of each row's bin among the distinct
# bins of its group, so rows in the same bin get the same Order.
arrange(as.integer(Bin)) %>% mutate(Order = match(Bin, unique(Bin))) %>%
ungroup
# A tibble: 12 x 5
Group Ind Value Bin Order
<chr> <chr> <dbl> <fct> <int>
1 Grp1 B 0.0583 [0.0583,0.0754] 1
2 Grp1 E 0.0638 [0.0583,0.0754] 1
3 Grp1 G 0.0640 [0.0583,0.0754] 1
4 Grp1 A 0.156 (0.144,0.161] 2
5 Grp1 C 0.181 (0.178,0.195] 3
6 Grp1 F 0.214 (0.212,0.229] 4
7 Grp1 D 0.264 (0.246,0.264] 5
8 Grp2 E 0.0809 [0.0809,0.102] 1
9 Grp2 B 0.161 (0.144,0.165] 2
10 Grp2 D 0.146 (0.144,0.165] 2
11 Grp2 C 0.279 (0.27,0.291] 3
12 Grp2 A 0.334 (0.312,0.334] 4
至于 `fct_reorder` 看起来没有起任何作用,可以检查这一步前后的 `levels`:
> tmp <- df %>%
group_by(Group) %>%
mutate(Bin = cut_interval(Value, n = nrow(.)))
> tmp %>% pull(Bin) %>% levels
[1] "[0.0583,0.0754]" "(0.0754,0.0925]" "(0.0925,0.11]" "(0.11,0.127]" "(0.127,0.144]" "(0.144,0.161]" "(0.161,0.178]" "(0.178,0.195]" "(0.195,0.212]"
[10] "(0.212,0.229]" "(0.229,0.246]" "(0.246,0.264]" "[0.0809,0.102]" "(0.102,0.123]" "(0.123,0.144]" "(0.144,0.165]" "(0.165,0.186]" "(0.186,0.207]"
[19] "(0.207,0.228]" "(0.228,0.249]" "(0.249,0.27]" "(0.27,0.291]" "(0.291,0.312]" "(0.312,0.334]"
> tmp %>% mutate(Bin = fct_reorder(Bin, Value)) %>% pull(Bin) %>% levels
[1] "[0.0583,0.0754]" "(0.144,0.161]" "(0.178,0.195]" "(0.212,0.229]" "(0.246,0.264]" "(0.0754,0.0925]" "(0.0925,0.11]" "(0.11,0.127]" "(0.127,0.144]"
[10] "(0.161,0.178]" "(0.195,0.212]" "(0.229,0.246]" "[0.0809,0.102]" "(0.102,0.123]" "(0.123,0.144]" "(0.144,0.165]" "(0.165,0.186]" "(0.186,0.207]"
[19] "(0.207,0.228]" "(0.228,0.249]" "(0.249,0.27]" "(0.27,0.291]" "(0.291,0.312]" "(0.312,0.334]"