在 R 中筛选数据框
Filter dataframe in R
我想研究各行业内职业随时间的变化。我有工程师、教师、律师等职业的 ID(occ_id),以及对应建筑、教育、采矿、捕鱼等行业的若干 ID(Ind_id)。我想针对每个职业-行业配对,提取变化最大和最小的记录。示例数据如下。在这个例子中,我想提取最大的 3 个正向变化和最大的 3 个负向变化。你们能帮帮我吗?
# Sample data: 16 rows, one per occupation ID (occ_id) / industry ID (Ind_id)
# pair, with the change value for that pair in Change_occ_2000_2022.
x <- data.frame(
  occ_id = rep(c(1010, 1234, 4321), times = c(7, 4, 5)),
  Ind_id = c(
    52418, 52417, 28339, 27138, 31224, 33103, 1112,
    27138, 31224, 1112, 52418,
    33103, 31224, 1112, 52417, 26301
  ),
  Change_occ_2000_2022 = c(
    1, -5, 8, 9, -11, 15, 16,
    -50, 10, 30, -5,
    20, 10, 50, 30, -50
  )
)
然后我试了这个
# Asker's first attempt (needs dplyr attached for %>%, count, arrange, slice).
# count() tallies rows per distinct Change_occ_2000_2022 value, so the result
# holds only that column plus a count; occ_id and Ind_id are not carried along.
# The table is then sorted ascending and the first three and last three rows
# are kept.
x %>%
count(Change_occ_2000_2022) %>%
arrange(Change_occ_2000_2022) %>%
slice(c(head(row_number(), 3), tail(row_number(), 3)))
但是这样做我无法得知每个变化值属于哪一个职业-行业(occ-ind)配对。我希望得到像下面这样带有配对信息的结果:
# Desired result: the three largest and three smallest changes, each kept
# together with its occ_id / Ind_id pair.
# NOTE(review): this reuses the name `x`, replacing any `x` already defined
# in the session.
x <- data.frame(
  occ_id = c(4321, 4321, 1234, 1234, 4321, 1010),
  Ind_id = c(1112, 52417, 1112, 27138, 26301, 31224),
  Change_occ_2000_2022 = c(50, 30, 30, -50, -50, -11)
)
library(dplyr)

# Sort from the largest to the smallest change, then keep the first three and
# the last three rows, so every kept change stays with its occ_id / Ind_id pair.
# Row positions are built from seq_len(n()) and de-duplicated, so a table with
# fewer than six rows returns each row once instead of repeating rows or
# failing on invalid positions.
x %>%
  arrange(desc(Change_occ_2000_2022)) %>%
  slice(unique(c(head(seq_len(n()), 3), tail(seq_len(n()), 3))))
输出
occ_id Ind_id Change_occ_2000_2022
1 4321 1112 50
2 1234 1112 30
3 4321 52417 30
4 1010 31224 -11
5 1234 27138 -50
6 4321 26301 -50
library(dplyr)

# Sorting here is global: arrange() ignores grouping unless .by_group = TRUE,
# so the table is ordered directly. as_tibble() makes the result print as a
# tibble. After sorting from largest to smallest change, the first three and
# last three rows are kept.
x %>%
  as_tibble() %>%
  arrange(desc(Change_occ_2000_2022)) %>%
  slice(c(head(row_number(), 3), tail(row_number(), 3)))
输出:
occ_id Ind_id Change_occ_2000_2022
<dbl> <dbl> <dbl>
1 4321 1112 50
2 1234 1112 30
3 4321 52417 30
4 1010 31224 -11
5 1234 27138 -50
6 4321 26301 -50
基于 LMc 的解决方案
# Sample data: 16 rows, one per occupation ID (occ_id) / industry ID (Ind_id)
# pair, with the change value for that pair in Change_occ_2000_2022.
df <- data.frame(
  occ_id = rep(c(1010, 1234, 4321), times = c(7, 4, 5)),
  Ind_id = c(
    52418, 52417, 28339, 27138, 31224, 33103, 1112,
    27138, 31224, 1112, 52418,
    33103, 31224, 1112, 52417, 26301
  ),
  Change_occ_2000_2022 = c(
    1, -5, 8, 9, -11, 15, 16,
    -50, 10, 30, -5,
    20, 10, 50, 30, -50
  )
)
library(data.table)

# setDT() converts df to a data.table by reference. `i` orders the rows by the
# change (ascending); `j` then keeps the first three and last three rows of
# that ordered table. Positions are built from seq_len(.N) and de-duplicated,
# so a table with fewer than six rows returns each row once instead of
# repeating rows or indexing outside the table.
setDT(df)[
  order(Change_occ_2000_2022),
  .SD[unique(c(head(seq_len(.N), 3L), tail(seq_len(.N), 3L)))]
]
#> occ_id Ind_id Change_occ_2000_2022
#> 1: 1234 27138 -50
#> 2: 4321 26301 -50
#> 3: 1010 31224 -11
#> 4: 1234 1112 30
#> 5: 4321 52417 30
#> 6: 4321 1112 50
由 reprex 包 (v2.0.1) 于 2022-05-19 创建
或
# Convert df to a data.table by reference, then keep every row whose change is
# within the two lowest or the two highest distinct values (dense ranks, so
# tied values share a rank). pmin() takes the better of the ascending and
# descending rank for each row. The result is sorted ascending by the change.
setDT(df)
df[
  pmin(
    frankv(Change_occ_2000_2022, ties.method = "dense"),
    frankv(-Change_occ_2000_2022, ties.method = "dense")
  ) <= 2
][order(Change_occ_2000_2022)]
如果需要考虑重复值(并列的变化量),可以使用上面这种基于排名的写法。
# Sort ascending by the change, then stack the first three rows (smallest
# changes) on top of the last three rows (largest changes).
x <- arrange(x, Change_occ_2000_2022)
x <- rbind(head(x, n = 3), tail(x, n = 3))
输出:
> x
occ_id Ind_id Change_occ_2000_2022
1 1234 27138 -50
2 4321 26301 -50
3 1010 31224 -11
14 1234 1112 30
15 4321 52417 30
16 4321 1112 50
我想研究各行业内职业随时间的变化。我有工程师、教师、律师等职业的 ID(occ_id),以及对应建筑、教育、采矿、捕鱼等行业的若干 ID(Ind_id)。我想针对每个职业-行业配对,提取变化最大和最小的记录。示例数据如下。在这个例子中,我想提取最大的 3 个正向变化和最大的 3 个负向变化。你们能帮帮我吗?
# Sample data: 16 rows, one per occupation ID (occ_id) / industry ID (Ind_id)
# pair, with the change value for that pair in Change_occ_2000_2022.
x <- data.frame(
  occ_id = rep(c(1010, 1234, 4321), times = c(7, 4, 5)),
  Ind_id = c(
    52418, 52417, 28339, 27138, 31224, 33103, 1112,
    27138, 31224, 1112, 52418,
    33103, 31224, 1112, 52417, 26301
  ),
  Change_occ_2000_2022 = c(
    1, -5, 8, 9, -11, 15, 16,
    -50, 10, 30, -5,
    20, 10, 50, 30, -50
  )
)
然后我试了这个
# Asker's first attempt (needs dplyr attached for %>%, count, arrange, slice).
# count() tallies rows per distinct Change_occ_2000_2022 value, so the result
# holds only that column plus a count; occ_id and Ind_id are not carried along.
# The table is then sorted ascending and the first three and last three rows
# are kept.
x %>%
count(Change_occ_2000_2022) %>%
arrange(Change_occ_2000_2022) %>%
slice(c(head(row_number(), 3), tail(row_number(), 3)))
但是这样做我无法得知每个变化值属于哪一个职业-行业(occ-ind)配对。我希望得到像下面这样带有配对信息的结果:
# Desired result: the three largest and three smallest changes, each kept
# together with its occ_id / Ind_id pair.
# NOTE(review): this reuses the name `x`, replacing any `x` already defined
# in the session.
x <- data.frame(
  occ_id = c(4321, 4321, 1234, 1234, 4321, 1010),
  Ind_id = c(1112, 52417, 1112, 27138, 26301, 31224),
  Change_occ_2000_2022 = c(50, 30, 30, -50, -50, -11)
)
library(dplyr)

# Sort from the largest to the smallest change, then keep the first three and
# the last three rows, so every kept change stays with its occ_id / Ind_id pair.
# Row positions are built from seq_len(n()) and de-duplicated, so a table with
# fewer than six rows returns each row once instead of repeating rows or
# failing on invalid positions.
x %>%
  arrange(desc(Change_occ_2000_2022)) %>%
  slice(unique(c(head(seq_len(n()), 3), tail(seq_len(n()), 3))))
输出
occ_id Ind_id Change_occ_2000_2022
1 4321 1112 50
2 1234 1112 30
3 4321 52417 30
4 1010 31224 -11
5 1234 27138 -50
6 4321 26301 -50
library(dplyr)

# Sorting here is global: arrange() ignores grouping unless .by_group = TRUE,
# so the table is ordered directly. as_tibble() makes the result print as a
# tibble. After sorting from largest to smallest change, the first three and
# last three rows are kept.
x %>%
  as_tibble() %>%
  arrange(desc(Change_occ_2000_2022)) %>%
  slice(c(head(row_number(), 3), tail(row_number(), 3)))
输出:
occ_id Ind_id Change_occ_2000_2022
<dbl> <dbl> <dbl>
1 4321 1112 50
2 1234 1112 30
3 4321 52417 30
4 1010 31224 -11
5 1234 27138 -50
6 4321 26301 -50
基于 LMc 的解决方案
# Sample data: 16 rows, one per occupation ID (occ_id) / industry ID (Ind_id)
# pair, with the change value for that pair in Change_occ_2000_2022.
df <- data.frame(
  occ_id = rep(c(1010, 1234, 4321), times = c(7, 4, 5)),
  Ind_id = c(
    52418, 52417, 28339, 27138, 31224, 33103, 1112,
    27138, 31224, 1112, 52418,
    33103, 31224, 1112, 52417, 26301
  ),
  Change_occ_2000_2022 = c(
    1, -5, 8, 9, -11, 15, 16,
    -50, 10, 30, -5,
    20, 10, 50, 30, -50
  )
)
library(data.table)

# setDT() converts df to a data.table by reference. `i` orders the rows by the
# change (ascending); `j` then keeps the first three and last three rows of
# that ordered table. Positions are built from seq_len(.N) and de-duplicated,
# so a table with fewer than six rows returns each row once instead of
# repeating rows or indexing outside the table.
setDT(df)[
  order(Change_occ_2000_2022),
  .SD[unique(c(head(seq_len(.N), 3L), tail(seq_len(.N), 3L)))]
]
#> occ_id Ind_id Change_occ_2000_2022
#> 1: 1234 27138 -50
#> 2: 4321 26301 -50
#> 3: 1010 31224 -11
#> 4: 1234 1112 30
#> 5: 4321 52417 30
#> 6: 4321 1112 50
由 reprex 包 (v2.0.1) 于 2022-05-19 创建
或
# Convert df to a data.table by reference, then keep every row whose change is
# within the two lowest or the two highest distinct values (dense ranks, so
# tied values share a rank). pmin() takes the better of the ascending and
# descending rank for each row. The result is sorted ascending by the change.
setDT(df)
df[
  pmin(
    frankv(Change_occ_2000_2022, ties.method = "dense"),
    frankv(-Change_occ_2000_2022, ties.method = "dense")
  ) <= 2
][order(Change_occ_2000_2022)]
如果需要考虑重复值(并列的变化量),可以使用上面这种基于排名的写法。
# Sort ascending by the change, then stack the first three rows (smallest
# changes) on top of the last three rows (largest changes).
x <- arrange(x, Change_occ_2000_2022)
x <- rbind(head(x, n = 3), tail(x, n = 3))
输出:
> x
occ_id Ind_id Change_occ_2000_2022
1 1234 27138 -50
2 4321 26301 -50
3 1010 31224 -11
14 1234 1112 30
15 4321 52417 30
16 4321 1112 50