具有条件值的数据帧子集
Dataframe subsetting with conditional values
我有一些包含不同树种及其尺寸的数据,我想对其进行子集化,只留下树的尺寸大于第 75 个百分位数的那些行。第 75 个百分位因物种而异,因此我想过滤/子集数据框,以便在每个物种的子集中使用正确的第 75 个百分位。
这是一些虚拟数据
## Generate dummy data: 50 trees for each of four species, with sizes drawn
## uniformly between 15 and 200. No seed is set, so values differ per run.
sample_data <- data.frame(
  SPECIES = rep(c("OAK", "ELM", "BEECH", "ASH"), each = 50),
  SIZE = runif(200, min = 15, max = 200)
)
## Compute the 75th percentile of SIZE for each species.
## The result is converted to a plain data.frame with one row per species.
library(dplyr)
data <- sample_data %>%
  group_by(SPECIES) %>%
  summarise(SIZE = quantile(SIZE, probs = 0.75)) %>%
  as.data.frame()
data
SPECIES SIZE
1 ASH 141.2837
2 BEECH 152.4670
3 ELM 154.6553
4 OAK 121.5114
现在我想对原始数据框 (sample_data) 进行子集化,从而得到一个新的数据框,其中只包含 >141.3 的 ASH 树、>152.5 的 BEECH 树、>154.7 的 ELM 树和 >121.5 的 OAK 树。
dplyr、data.table 或其他解决方案均可。
谢谢
一个选项是使用 left_join 把每个物种的第 75 个百分位数连接到原始数据上,然后过滤,仅保留大于该百分位数的行。
library(dplyr)
# Attach each species' 75th percentile (from `data`) to every tree. The empty
# first suffix keeps the original SIZE column name; the percentile column
# becomes SIZE.p75 and is dropped once the comparison is done.
sample_data %>%
  left_join(data, by = "SPECIES", suffix = c("", ".p75")) %>%
  filter(SIZE > SIZE.p75) %>%
  select(-SIZE.p75)
输出
SPECIES SIZE
1 OAK 183.1905
2 OAK 194.0178
3 OAK 190.2304
4 OAK 177.6368
5 OAK 176.5812
6 OAK 188.6490
7 OAK 180.5927
8 OAK 183.7877
9 OAK 179.2605
10 OAK 190.0034
11 OAK 185.7922
12 OAK 187.4172
13 OAK 174.8787
14 ELM 166.7489
15 ELM 176.6458
16 ELM 142.0045
17 ELM 187.6565
18 ELM 149.7347
19 ELM 191.5863
20 ELM 198.2069
21 ELM 190.9030
22 ELM 160.5611
23 ELM 175.1305
24 ELM 186.8030
25 ELM 151.5687
26 ELM 182.5735
27 BEECH 194.4072
28 BEECH 190.6091
29 BEECH 193.4944
30 BEECH 193.5393
31 BEECH 197.1501
32 BEECH 174.7058
33 BEECH 192.5512
34 BEECH 199.9723
35 BEECH 175.9333
36 BEECH 185.4285
37 BEECH 185.3669
38 BEECH 172.7221
39 BEECH 175.0001
40 ASH 199.2849
41 ASH 174.2263
42 ASH 150.8999
43 ASH 170.8359
44 ASH 148.0282
45 ASH 159.4717
46 ASH 188.6389
47 ASH 197.0871
48 ASH 151.5592
49 ASH 187.1625
50 ASH 196.6947
51 ASH 198.6819
52 ASH 185.4842
这是一种无需预先计算百分位数的方法:按 SPECIES 分组,并一次性按各组的百分位数过滤。
## Generate dummy data (seeded so the example is reproducible)
set.seed(2022)
sample_data <- data.frame(
  SPECIES = rep(c("OAK", "ELM", "BEECH", "ASH"), each = 50),
  SIZE = runif(200, min = 15, max = 200)
)

## Within each species, keep only the trees whose SIZE exceeds that species'
## own 75th percentile. quantile() is evaluated per group inside filter(),
## so no separate summary table is needed. The result stays grouped by SPECIES.
suppressPackageStartupMessages(library(dplyr))
sample_data %>%
  group_by(SPECIES) %>%
  filter(SIZE > quantile(SIZE, probs = 0.75))
#> # A tibble: 52 × 2
#> # Groups: SPECIES [4]
#> SPECIES SIZE
#> <chr> <dbl>
#> 1 OAK 166.
#> 2 OAK 158.
#> 3 OAK 172.
#> 4 OAK 171.
#> 5 OAK 188.
#> 6 OAK 200.
#> 7 OAK 158.
#> 8 OAK 172.
#> 9 OAK 164.
#> 10 OAK 157.
#> # … with 42 more rows
创建于 2022-05-02,由 reprex package (v2.0.1) 生成
使用 data.table:
# library() stops immediately if data.table is not installed; require() would
# only warn and return FALSE, deferring the failure to the next call.
library(data.table)
df <- data.table(
  SPECIES = rep(c("OAK", "ELM", "BEECH", "ASH"), each = 50),
  SIZE = runif(200, min = 15, max = 200)
)
# .I holds row numbers, so this collects, per species, the rows whose SIZE
# exceeds that species' 75th percentile without adding a helper column to df.
keep <- df[, .I[SIZE > quantile(SIZE, probs = 0.75)], by = SPECIES]$V1
# sort() returns the selected rows in their original order within df.
df[sort(keep)]
输出
#> SPECIES SIZE
#> 1: OAK 166.6108
#> 2: OAK 152.5978
#> 3: OAK 165.3719
#> 4: OAK 169.5894
#> 5: OAK 145.0316
#> 6: OAK 144.9761
#> 7: OAK 193.5518
#> 8: OAK 167.0893
#> 9: OAK 171.8388
#> 10: OAK 163.6997
#> 11: OAK 162.8079
#> 12: OAK 154.3202
#> 13: OAK 155.4055
#> 14: ELM 182.4347
#> 15: ELM 158.9178
#> 16: ELM 160.9994
#> 17: ELM 172.3293
#> 18: ELM 160.3153
#> 19: ELM 171.6101
#> 20: ELM 197.8908
#> 21: ELM 191.5175
#> 22: ELM 187.9439
#> 23: ELM 195.3205
#> 24: ELM 186.9787
#> 25: ELM 185.5459
#> 26: ELM 176.5530
#> 27: BEECH 142.0979
#> 28: BEECH 163.7029
#> 29: BEECH 196.2091
#> 30: BEECH 193.7850
#> 31: BEECH 184.2935
#> 32: BEECH 180.5350
#> 33: BEECH 168.1937
#> 34: BEECH 198.0463
#> 35: BEECH 166.3282
#> 36: BEECH 175.8253
#> 37: BEECH 174.0137
#> 38: BEECH 142.5474
#> 39: BEECH 158.7505
#> 40: ASH 192.6718
#> 41: ASH 161.2425
#> 42: ASH 159.5650
#> 43: ASH 173.2908
#> 44: ASH 198.7314
#> 45: ASH 172.9008
#> 46: ASH 197.5794
#> 47: ASH 173.4665
#> 48: ASH 161.8918
#> 49: ASH 198.9472
#> 50: ASH 161.1769
#> 51: ASH 178.5557
#> 52: ASH 162.5603
#> SPECIES SIZE
创建于 2022-05-02,由 reprex package (v2.0.1) 生成
我有一些包含不同树种及其尺寸的数据,我想对其进行子集化,只留下树的尺寸大于第 75 个百分位数的那些行。第 75 个百分位因物种而异,因此我想过滤/子集数据框,以便在每个物种的子集中使用正确的第 75 个百分位。
这是一些虚拟数据
## Generate dummy data: 50 trees for each of four species, with sizes drawn
## uniformly between 15 and 200. No seed is set, so values differ per run.
sample_data <- data.frame(
  SPECIES = rep(c("OAK", "ELM", "BEECH", "ASH"), each = 50),
  SIZE = runif(200, min = 15, max = 200)
)
## Compute the 75th percentile of SIZE for each species.
## The result is converted to a plain data.frame with one row per species.
library(dplyr)
data <- sample_data %>%
  group_by(SPECIES) %>%
  summarise(SIZE = quantile(SIZE, probs = 0.75)) %>%
  as.data.frame()
data
SPECIES SIZE
1 ASH 141.2837
2 BEECH 152.4670
3 ELM 154.6553
4 OAK 121.5114
现在我想对原始数据框 (sample_data) 进行子集化,从而得到一个新的数据框,其中只包含 >141.3 的 ASH 树、>152.5 的 BEECH 树、>154.7 的 ELM 树和 >121.5 的 OAK 树。
dplyr、data.table 或其他解决方案均可。
谢谢
一个选项是使用 left_join 把每个物种的第 75 个百分位数连接到原始数据上,然后过滤,仅保留大于该百分位数的行。
library(dplyr)
# Attach each species' 75th percentile (from `data`) to every tree. The empty
# first suffix keeps the original SIZE column name; the percentile column
# becomes SIZE.p75 and is dropped once the comparison is done.
sample_data %>%
  left_join(data, by = "SPECIES", suffix = c("", ".p75")) %>%
  filter(SIZE > SIZE.p75) %>%
  select(-SIZE.p75)
输出
SPECIES SIZE
1 OAK 183.1905
2 OAK 194.0178
3 OAK 190.2304
4 OAK 177.6368
5 OAK 176.5812
6 OAK 188.6490
7 OAK 180.5927
8 OAK 183.7877
9 OAK 179.2605
10 OAK 190.0034
11 OAK 185.7922
12 OAK 187.4172
13 OAK 174.8787
14 ELM 166.7489
15 ELM 176.6458
16 ELM 142.0045
17 ELM 187.6565
18 ELM 149.7347
19 ELM 191.5863
20 ELM 198.2069
21 ELM 190.9030
22 ELM 160.5611
23 ELM 175.1305
24 ELM 186.8030
25 ELM 151.5687
26 ELM 182.5735
27 BEECH 194.4072
28 BEECH 190.6091
29 BEECH 193.4944
30 BEECH 193.5393
31 BEECH 197.1501
32 BEECH 174.7058
33 BEECH 192.5512
34 BEECH 199.9723
35 BEECH 175.9333
36 BEECH 185.4285
37 BEECH 185.3669
38 BEECH 172.7221
39 BEECH 175.0001
40 ASH 199.2849
41 ASH 174.2263
42 ASH 150.8999
43 ASH 170.8359
44 ASH 148.0282
45 ASH 159.4717
46 ASH 188.6389
47 ASH 197.0871
48 ASH 151.5592
49 ASH 187.1625
50 ASH 196.6947
51 ASH 198.6819
52 ASH 185.4842
这是一种无需预先计算百分位数的方法:按 SPECIES 分组,并一次性按各组的百分位数过滤。
## Generate dummy data (seeded so the example is reproducible)
set.seed(2022)
sample_data <- data.frame(
  SPECIES = rep(c("OAK", "ELM", "BEECH", "ASH"), each = 50),
  SIZE = runif(200, min = 15, max = 200)
)

## Within each species, keep only the trees whose SIZE exceeds that species'
## own 75th percentile. quantile() is evaluated per group inside filter(),
## so no separate summary table is needed. The result stays grouped by SPECIES.
suppressPackageStartupMessages(library(dplyr))
sample_data %>%
  group_by(SPECIES) %>%
  filter(SIZE > quantile(SIZE, probs = 0.75))
#> # A tibble: 52 × 2
#> # Groups: SPECIES [4]
#> SPECIES SIZE
#> <chr> <dbl>
#> 1 OAK 166.
#> 2 OAK 158.
#> 3 OAK 172.
#> 4 OAK 171.
#> 5 OAK 188.
#> 6 OAK 200.
#> 7 OAK 158.
#> 8 OAK 172.
#> 9 OAK 164.
#> 10 OAK 157.
#> # … with 42 more rows
创建于 2022-05-02,由 reprex package (v2.0.1) 生成
使用 data.table:
# library() stops immediately if data.table is not installed; require() would
# only warn and return FALSE, deferring the failure to the next call.
library(data.table)
df <- data.table(
  SPECIES = rep(c("OAK", "ELM", "BEECH", "ASH"), each = 50),
  SIZE = runif(200, min = 15, max = 200)
)
# .I holds row numbers, so this collects, per species, the rows whose SIZE
# exceeds that species' 75th percentile without adding a helper column to df.
keep <- df[, .I[SIZE > quantile(SIZE, probs = 0.75)], by = SPECIES]$V1
# sort() returns the selected rows in their original order within df.
df[sort(keep)]
输出
#> SPECIES SIZE
#> 1: OAK 166.6108
#> 2: OAK 152.5978
#> 3: OAK 165.3719
#> 4: OAK 169.5894
#> 5: OAK 145.0316
#> 6: OAK 144.9761
#> 7: OAK 193.5518
#> 8: OAK 167.0893
#> 9: OAK 171.8388
#> 10: OAK 163.6997
#> 11: OAK 162.8079
#> 12: OAK 154.3202
#> 13: OAK 155.4055
#> 14: ELM 182.4347
#> 15: ELM 158.9178
#> 16: ELM 160.9994
#> 17: ELM 172.3293
#> 18: ELM 160.3153
#> 19: ELM 171.6101
#> 20: ELM 197.8908
#> 21: ELM 191.5175
#> 22: ELM 187.9439
#> 23: ELM 195.3205
#> 24: ELM 186.9787
#> 25: ELM 185.5459
#> 26: ELM 176.5530
#> 27: BEECH 142.0979
#> 28: BEECH 163.7029
#> 29: BEECH 196.2091
#> 30: BEECH 193.7850
#> 31: BEECH 184.2935
#> 32: BEECH 180.5350
#> 33: BEECH 168.1937
#> 34: BEECH 198.0463
#> 35: BEECH 166.3282
#> 36: BEECH 175.8253
#> 37: BEECH 174.0137
#> 38: BEECH 142.5474
#> 39: BEECH 158.7505
#> 40: ASH 192.6718
#> 41: ASH 161.2425
#> 42: ASH 159.5650
#> 43: ASH 173.2908
#> 44: ASH 198.7314
#> 45: ASH 172.9008
#> 46: ASH 197.5794
#> 47: ASH 173.4665
#> 48: ASH 161.8918
#> 49: ASH 198.9472
#> 50: ASH 161.1769
#> 51: ASH 178.5557
#> 52: ASH 162.5603
#> SPECIES SIZE
创建于 2022-05-02,由 reprex package (v2.0.1) 生成