在 R 中筛选数据框

Filter dataframe in R

我想考察各行业中职业随时间的变化。我有工程师、教师、律师等职业的 ID,以及与建筑、教育、采矿、捕鱼等行业相对应的若干 ID。我想从每个职业-行业组合中提取最大和最小的变化。数据示例如下。在这个例子中,我想提取最大的 3 个正向变化和最大的 3 个负向变化。你们能帮帮我吗?

# Example data: occupation id, industry id, and the 2000-2022 change for
# each occupation/industry row (values grouped by occupation below).
x <- data.frame(
  occ_id = c(rep(1010, 7), rep(1234, 4), rep(4321, 5)),
  Ind_id = c(
    52418, 52417, 28339, 27138, 31224, 33103, 1112,
    27138, 31224, 1112, 52418,
    33103, 31224, 1112, 52417, 26301
  ),
  Change_occ_2000_2022 = c(
    1, -5, 8, 9, -11, 15, 16,
    -50, 10, 30, -5,
    20, 10, 50, 30, -50
  )
)

然后我试了这个

# Tabulate the distinct change values, order that table by change value,
# and keep its first three and last three rows. count() collapses the
# data to the counted column plus `n`, so occ_id / Ind_id are not in the
# result.
x %>%
  count(Change_occ_2000_2022) %>%
  arrange(Change_occ_2000_2022) %>%
  slice(c(head(seq_len(n()), 3), tail(seq_len(n()), 3)))

但这样做我无法得知每个变化属于哪个 occ-ind(职业-行业)组合。我希望得到像下面这样保留配对信息的结果:

# Desired result: the three largest and three smallest changes, each kept
# together with its occ_id / Ind_id pair.
x <- data.frame(
  occ_id = c(4321, 4321, 1234, 1234, 4321, 1010),
  Ind_id = c(1112, 52417, 1112, 27138, 26301, 31224),
  Change_occ_2000_2022 = c(50, 30, 30, -50, -50, -11)
)
library(dplyr)

# Sort from the largest to the smallest change, then keep the first three
# and the last three rows; all columns are retained, so every change stays
# paired with its occ_id / Ind_id.
x %>%
  arrange(desc(Change_occ_2000_2022)) %>%
  slice(c(1:3, (n() - 2):n()))

输出

  occ_id Ind_id Change_occ_2000_2022
1   4321   1112                   50
2   1234   1112                   30
3   4321  52417                   30
4   1010  31224                  -11
5   1234  27138                  -50
6   4321  26301                  -50
 library(dplyr)

 # Sort the whole table from largest to smallest change and keep the first
 # three and last three rows. arrange() ignores grouping unless
 # .by_group = TRUE, so grouping by occ_id before the sort would have no
 # effect on this global selection and is left out. as_tibble() keeps the
 # tibble-style printout (column types shown under the header).
 x %>%
   as_tibble() %>%
   arrange(desc(Change_occ_2000_2022)) %>%
   slice(c(head(row_number(), 3), tail(row_number(), 3)))

输出:

occ_id Ind_id Change_occ_2000_2022
   <dbl>  <dbl>                <dbl>
1   4321   1112                   50
2   1234   1112                   30
3   4321  52417                   30
4   1010  31224                  -11
5   1234  27138                  -50
6   4321  26301                  -50

基于 LMc 的解决方案

# Sample data for the data.table approach: occupation id, industry id, and
# the 2000-2022 change for each row.
df <- data.frame(
  occ_id = c(
    1010, 1010, 1010, 1010, 1010, 1010, 1010,
    1234, 1234, 1234, 1234,
    4321, 4321, 4321, 4321, 4321
  ),
  Ind_id = c(
    52418, 52417, 28339, 27138, 31224, 33103, 1112,
    27138, 31224, 1112, 52418,
    33103, 31224, 1112, 52417, 26301
  ),
  Change_occ_2000_2022 = c(
    1, -5, 8, 9, -11, 15, 16,
    -50, 10, 30, -5,
    20, 10, 50, 30, -50
  )
)

library(data.table)

# Convert df to a data.table by reference, sort ascending by change, then
# pick the first three and last three rows of the sorted table (.N is the
# row count of the table being subset).
setDT(df)[order(Change_occ_2000_2022)][c(1:3, (.N - 2):.N)]
#>    occ_id Ind_id Change_occ_2000_2022
#> 1:   1234  27138                  -50
#> 2:   4321  26301                  -50
#> 3:   1010  31224                  -11
#> 4:   1234   1112                   30
#> 5:   4321  52417                   30
#> 6:   4321   1112                   50

由 reprex 包 (v2.0.1) 于 2022-05-19 创建

# Keep every row whose change is among the two smallest or the two largest
# distinct values: dense ranking gives tied values the same rank, so all
# duplicates of a selected value are retained. Result is sorted ascending.
setDT(df)[
  frankv(Change_occ_2000_2022, ties.method = "dense") <= 2 |
    frankv(Change_occ_2000_2022, order = -1L, ties.method = "dense") <= 2
][order(Change_occ_2000_2022)]

如果您需要考虑重复值(并列值),可以使用上面基于 frankv 的写法。

# Sort ascending by change, then stack the three smallest and the three
# largest rows; x is overwritten with the six-row result.
x <- arrange(x, Change_occ_2000_2022)
x <- rbind(head(x, n = 3), tail(x, n = 3))

输出:

> x
   occ_id Ind_id Change_occ_2000_2022
1    1234  27138                  -50
2    4321  26301                  -50
3    1010  31224                  -11
14   1234   1112                   30
15   4321  52417                   30
16   4321   1112                   50