使用 forcats 进行因子排序

Factor ordering with forcats

我有数据要分箱并转换为一个因子。不过,我在理解我的因子变量发生了什么时遇到了一些麻烦。我正在尝试根据连续变量对因子变量进行排序。

我已经查阅了相关资料,但看到的所有示例中每个因子级别都只出现一次,而我的数据中某些因子级别会出现多次。

示例数据如下:

# Example data: a 12-row tibble with a Group label (7 rows of "Grp1", 5 rows of
# "Grp2"), an individual id (Ind) and a continuous Value to be binned.
df <- structure(list(Group = c("Grp1", "Grp1", "Grp1", "Grp1", "Grp1", 
"Grp1", "Grp1", "Grp2", "Grp2", "Grp2", "Grp2", "Grp2"), Ind = c("A", 
"B", "C", "D", "E", "F", "G", "A", "B", "C", "D", "E"), Value = c(0.155903329567489, 
0.0582906870761889, 0.180600101489814, 0.26357423622443, 0.0637832368895064, 
0.213803701918138, 0.0640447068344333, 0.333501508730367, 0.160676738803951, 
0.279178514111584, 0.145767023637501, 0.0808762147165962)), row.names = c(NA, 
-12L), class = c("tbl_df", "tbl", "data.frame"))

根据这些数据,我创建了一个因子并检查了每个元素的顺序。

library(dplyr)
library(forcats)
library(ggplot2) # provides cut_interval(); dplyr and forcats do not export it

# Bin Value into equal-width intervals within each Group, then store
# labels(Bin) as Order.
# NOTE: `.` is the whole data frame handed over by the pipe, so nrow(.) is 12
# for both groups rather than the size of the current group.
# NOTE: labels() on an unnamed vector returns the element positions as
# character, so Order is just the row position within the group, not the rank
# of the bin.
df %>%
  group_by(Group) %>%
  mutate(Bin = cut_interval(Value, n = nrow(.))) %>%
  mutate(Order = labels(Bin)) %>%
  ungroup()

# A tibble: 12 x 5
   Group Ind    Value Bin             Order
   <chr> <chr>  <dbl> <fct>           <chr>
 1 Grp1  A     0.156  (0.144,0.161]   1    
 2 Grp1  B     0.0583 [0.0583,0.0754] 2    
 3 Grp1  C     0.181  (0.178,0.195]   3    
 4 Grp1  D     0.264  (0.246,0.264]   4    
 5 Grp1  E     0.0638 [0.0583,0.0754] 5    
 6 Grp1  F     0.214  (0.212,0.229]   6    
 7 Grp1  G     0.0640 [0.0583,0.0754] 7    
 8 Grp2  A     0.334  (0.312,0.334]   1    
 9 Grp2  B     0.161  (0.144,0.165]   2    
10 Grp2  C     0.279  (0.27,0.291]    3    
11 Grp2  D     0.146  (0.144,0.165]   4    
12 Grp2  E     0.0809 [0.0809,0.102]  5

然后我尝试在创建因子之后用 `fct_reorder` 按 `Value` 对因子重新排序,但顺序似乎没有改变。

# Attempt: reorder the levels of Bin by Value right after binning.
# fct_reorder() only changes the order of the factor's levels; it does not
# move any rows. labels(Bin) returns the element positions, so Order comes out
# the same as it does without the fct_reorder() step.
# NOTE: `.` is the whole piped data frame, so nrow(.) is 12 for both groups.
df %>% 
  group_by(Group) %>% 
  mutate(Bin = cut_interval(Value, n = nrow(.)), 
         Bin = fct_reorder(Bin, Value)) %>% 
  mutate(Order = labels(Bin)) %>% 
  ungroup()

# A tibble: 12 x 5
   Group Ind    Value Bin             Order
   <chr> <chr>  <dbl> <fct>           <chr>
 1 Grp1  A     0.156  (0.144,0.161]   1    
 2 Grp1  B     0.0583 [0.0583,0.0754] 2    
 3 Grp1  C     0.181  (0.178,0.195]   3    
 4 Grp1  D     0.264  (0.246,0.264]   4    
 5 Grp1  E     0.0638 [0.0583,0.0754] 5    
 6 Grp1  F     0.214  (0.212,0.229]   6    
 7 Grp1  G     0.0640 [0.0583,0.0754] 7    
 8 Grp2  A     0.334  (0.312,0.334]   1    
 9 Grp2  B     0.161  (0.144,0.165]   2    
10 Grp2  C     0.279  (0.27,0.291]    3    
11 Grp2  D     0.146  (0.144,0.165]   4    
12 Grp2  E     0.0809 [0.0809,0.102]  5 

然后我在创建因子之前先按 `Value` 对数据排序(`arrange`),得到了正确的顺序。

# Attempt: sort the rows by Group and Value before binning.
# Because the rows are already in Value order within each group, the row
# positions returned by labels(Bin) increase with Value. Rows that share a bin
# still get different Order values.
df %>% 
  arrange(Group, Value) %>% 
  group_by(Group) %>% 
  mutate(Bin = cut_interval(Value, n = nrow(.))) %>% 
  mutate(Order = labels(Bin)) %>% 
  ungroup()

# A tibble: 12 x 5
   Group Ind    Value Bin             Order
   <chr> <chr>  <dbl> <fct>           <chr>
 1 Grp1  B     0.0583 [0.0583,0.0754] 1    
 2 Grp1  E     0.0638 [0.0583,0.0754] 2    
 3 Grp1  G     0.0640 [0.0583,0.0754] 3    
 4 Grp1  A     0.156  (0.144,0.161]   4    
 5 Grp1  C     0.181  (0.178,0.195]   5    
 6 Grp1  F     0.214  (0.212,0.229]   6    
 7 Grp1  D     0.264  (0.246,0.264]   7    
 8 Grp2  E     0.0809 [0.0809,0.102]  1    
 9 Grp2  D     0.146  (0.144,0.165]   2    
10 Grp2  B     0.161  (0.144,0.165]   3    
11 Grp2  C     0.279  (0.27,0.291]    4    
12 Grp2  A     0.334  (0.312,0.334]   5

那么首先,为什么 fct_reorder 没有按照我的意愿去做?其次,为什么“Grp1”中有 7 个值而“Grp2”中有 5 个值?由于每组中重复的“Bin”值,不应该分别只有 5 和 4 吗?

`fct_reorder` 改变的是因子 `levels` 的顺序,而不是数据行的顺序。根据 `?fct_reorder`:

.x, .y - The levels of f are reordered so that the values of .fun(.x) (for fct_reorder()) and fun(.x, .y) (for fct_reorder2()) are in ascending order.

按 `Bin` 进行 `arrange` 之后,先删除未使用的级别(`droplevels`),再转换为 `integer`,即可创建 `Order` 列:

library(dplyr)
library(forcats)
library(ggplot2) # provides cut_interval(); dplyr and forcats do not export it

# Within each Group: bin Value, reorder the bin levels by Value, sort the rows
# by the level codes, then drop the unused levels so that the integer codes
# number only the bins that actually occur in the group (1..k).
# Rows that fall into the same bin therefore share the same Order.
out <- df %>%
  group_by(Group) %>%
  mutate(
    Bin = cut_interval(Value, n = nrow(.)),
    Bin = fct_reorder(Bin, Value)
  ) %>%
  arrange(as.integer(Bin)) %>%
  mutate(Order = as.integer(droplevels(Bin))) %>%
  ungroup()
out
# A tibble: 12 x 5
   Group Ind    Value Bin             Order
   <chr> <chr>  <dbl> <fct>           <int>
 1 Grp1  B     0.0583 [0.0583,0.0754]     1
 2 Grp1  E     0.0638 [0.0583,0.0754]     1
 3 Grp1  G     0.0640 [0.0583,0.0754]     1
 4 Grp1  A     0.156  (0.144,0.161]       2
 5 Grp1  C     0.181  (0.178,0.195]       3
 6 Grp1  F     0.214  (0.212,0.229]       4
 7 Grp1  D     0.264  (0.246,0.264]       5
 8 Grp2  E     0.0809 [0.0809,0.102]      1
 9 Grp2  B     0.161  (0.144,0.165]       2
10 Grp2  D     0.146  (0.144,0.165]       2
11 Grp2  C     0.279  (0.27,0.291]        3
12 Grp2  A     0.334  (0.312,0.334]       4

或者使用 `match` 和 `unique`:

 # Alternative: after sorting the rows by the level codes of Bin, number each
 # bin by the position of its first appearance within the group.
 df %>%
   group_by(Group) %>%
   mutate(
     Bin = cut_interval(Value, n = nrow(.)),
     Bin = fct_reorder(Bin, Value)
   ) %>%
   arrange(as.integer(Bin)) %>%
   mutate(Order = match(Bin, unique(Bin))) %>%
   ungroup()
# A tibble: 12 x 5
   Group Ind    Value Bin             Order
   <chr> <chr>  <dbl> <fct>           <int>
 1 Grp1  B     0.0583 [0.0583,0.0754]     1
 2 Grp1  E     0.0638 [0.0583,0.0754]     1
 3 Grp1  G     0.0640 [0.0583,0.0754]     1
 4 Grp1  A     0.156  (0.144,0.161]       2
 5 Grp1  C     0.181  (0.178,0.195]       3
 6 Grp1  F     0.214  (0.212,0.229]       4
 7 Grp1  D     0.264  (0.246,0.264]       5
 8 Grp2  E     0.0809 [0.0809,0.102]      1
 9 Grp2  B     0.161  (0.144,0.165]       2
10 Grp2  D     0.146  (0.144,0.165]       2
11 Grp2  C     0.279  (0.27,0.291]        3
12 Grp2  A     0.334  (0.312,0.334]       4

至于 `fct_reorder` 看起来没有起任何作用的问题,可以检查该步骤前后的 `levels`:
> tmp <-  df %>% 
  group_by(Group) %>% 
  mutate(Bin = cut_interval(Value, n = nrow(.)))
> tmp %>% pull(Bin) %>% levels
 [1] "[0.0583,0.0754]" "(0.0754,0.0925]" "(0.0925,0.11]"   "(0.11,0.127]"    "(0.127,0.144]"   "(0.144,0.161]"   "(0.161,0.178]"   "(0.178,0.195]"   "(0.195,0.212]"  
[10] "(0.212,0.229]"   "(0.229,0.246]"   "(0.246,0.264]"   "[0.0809,0.102]"  "(0.102,0.123]"   "(0.123,0.144]"   "(0.144,0.165]"   "(0.165,0.186]"   "(0.186,0.207]"  
[19] "(0.207,0.228]"   "(0.228,0.249]"   "(0.249,0.27]"    "(0.27,0.291]"    "(0.291,0.312]"   "(0.312,0.334]"  
> tmp %>% mutate(Bin = fct_reorder(Bin, Value))  %>% pull(Bin) %>% levels
 [1] "[0.0583,0.0754]" "(0.144,0.161]"   "(0.178,0.195]"   "(0.212,0.229]"   "(0.246,0.264]"   "(0.0754,0.0925]" "(0.0925,0.11]"   "(0.11,0.127]"    "(0.127,0.144]"  
[10] "(0.161,0.178]"   "(0.195,0.212]"   "(0.229,0.246]"   "[0.0809,0.102]"  "(0.102,0.123]"   "(0.123,0.144]"   "(0.144,0.165]"   "(0.165,0.186]"   "(0.186,0.207]"  
[19] "(0.207,0.228]"   "(0.228,0.249]"   "(0.249,0.27]"    "(0.27,0.291]"    "(0.291,0.312]"   "(0.312,0.334]"