按位置访问列加上条件过滤器
access columns by position plus conditional filter
我有以下数据框:
df = structure(list(age = c("F", "F", "M", "M", "M", "F", "M", "M",
"F", "F", "M", "M", "F", "F", "F", "F", "F", "M", "M", "F", "F"
), gender = c(52.8547945205479, 70.617475870193, 47.6986301369863,
85.4876712328767, 56.0288204261033, 27.0219178082192, 40.8583963494959,
24.6553462722298, 80.4027397260274, 55.6684931506849, 70.6904109589041,
64.5095890410959, 45.5397260273973, 78.5909038861022, 42.4219178082192,
44.0712328767123, 77.7068493150685, 70.5199279905761, 43.7178082191781,
77.7205479452055, 74.972602739726)), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame"))
我想按性别分组,筛选出年龄大于该性别平均年龄的行。
但我想按列号而不是名称来执行此操作。
所以我尝试了:
df %>% group_by_at(1) %>% filter_at(vars(2) > mean(vars(2))
但这没有用。
有什么建议吗?
按某一列分组之后,该分组列会被排除在 across() 的列选择之外,因此 across(1) 选中的是第一个非分组列。试试这个:
df %>% group_by(across(1)) %>%
filter(across(1, ~ mean(.) <= .))
# A tibble: 11 x 2
# Groups: age [2]
age gender
<chr> <dbl>
1 F 70.6
2 M 85.5
3 M 56.0
4 F 80.4
5 M 70.7
6 M 64.5
7 F 78.6
8 F 77.7
9 M 70.5
10 F 77.7
11 F 75.0
针对评论中提到的存在多列的情况,下面在添加了一个虚拟列(dummy)的数据上进行演示。
df = structure(list(age = c("F", "F", "M", "M", "M", "F", "M", "M",
"F", "F", "M", "M", "F", "F", "F", "F", "F", "M", "M", "F", "F"
), gender = c(52.8547945205479, 70.617475870193, 47.6986301369863,
85.4876712328767, 56.0288204261033, 27.0219178082192, 40.8583963494959,
24.6553462722298, 80.4027397260274, 55.6684931506849, 70.6904109589041,
64.5095890410959, 45.5397260273973, 78.5909038861022, 42.4219178082192,
44.0712328767123, 77.7068493150685, 70.5199279905761, 43.7178082191781,
77.7205479452055, 74.972602739726),
dummy = runif(21)), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame"))
df
# A tibble: 21 x 3
age gender dummy
<chr> <dbl> <dbl>
1 F 52.9 0.703
2 F 70.6 0.892
3 M 47.7 0.170
4 M 85.5 0.269
5 M 56.0 0.158
6 F 27.0 0.260
7 M 40.9 0.0818
8 M 24.7 0.586
9 F 80.4 0.563
10 F 55.7 0.995
# ... with 11 more rows
df %>% group_by(across(1)) %>%
filter(across(1, ~ mean(.) <= .))
# A tibble: 11 x 3
# Groups: age [2]
age gender dummy
<chr> <dbl> <dbl>
1 F 70.6 0.892
2 M 85.5 0.269
3 M 56.0 0.158
4 F 80.4 0.563
5 M 70.7 0.120
6 M 64.5 0.980
7 F 78.6 0.720
8 F 77.7 0.342
9 M 70.5 0.944
10 F 77.7 0.308
11 F 75.0 0.710
dplyr 中的 _at/_all 变体已被 across 取代。这是一种方法。
library(dplyr)
df %>%
group_by(across(1)) %>%
filter(cur_data()[[1]] > mean(cur_data()[[1]])) %>%
ungroup
# age gender
# <chr> <dbl>
# 1 F 70.6
# 2 M 85.5
# 3 M 56.0
# 4 F 80.4
# 5 M 70.7
# 6 M 64.5
# 7 F 78.6
# 8 F 77.7
# 9 M 70.5
#10 F 77.7
#11 F 75.0
在 filter 中使用了 [[1]],因为 cur_data() 中不包含分组列。
我有以下数据框:
df = structure(list(age = c("F", "F", "M", "M", "M", "F", "M", "M",
"F", "F", "M", "M", "F", "F", "F", "F", "F", "M", "M", "F", "F"
), gender = c(52.8547945205479, 70.617475870193, 47.6986301369863,
85.4876712328767, 56.0288204261033, 27.0219178082192, 40.8583963494959,
24.6553462722298, 80.4027397260274, 55.6684931506849, 70.6904109589041,
64.5095890410959, 45.5397260273973, 78.5909038861022, 42.4219178082192,
44.0712328767123, 77.7068493150685, 70.5199279905761, 43.7178082191781,
77.7205479452055, 74.972602739726)), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame"))
我想按性别分组,筛选出年龄大于该性别平均年龄的行。但我想按列号而不是名称来执行此操作。
所以我尝试了:
df %>% group_by_at(1) %>% filter_at(vars(2) > mean(vars(2))
但这没有用。
有什么建议吗?
按某一列分组之后,该分组列会被排除在 across() 的列选择之外,因此 across(1) 选中的是第一个非分组列。试试这个:
df %>% group_by(across(1)) %>%
filter(across(1, ~ mean(.) <= .))
# A tibble: 11 x 2
# Groups: age [2]
age gender
<chr> <dbl>
1 F 70.6
2 M 85.5
3 M 56.0
4 F 80.4
5 M 70.7
6 M 64.5
7 F 78.6
8 F 77.7
9 M 70.5
10 F 77.7
11 F 75.0
针对评论中提到的存在多列的情况,下面在添加了一个虚拟列(dummy)的数据上进行演示。
df = structure(list(age = c("F", "F", "M", "M", "M", "F", "M", "M",
"F", "F", "M", "M", "F", "F", "F", "F", "F", "M", "M", "F", "F"
), gender = c(52.8547945205479, 70.617475870193, 47.6986301369863,
85.4876712328767, 56.0288204261033, 27.0219178082192, 40.8583963494959,
24.6553462722298, 80.4027397260274, 55.6684931506849, 70.6904109589041,
64.5095890410959, 45.5397260273973, 78.5909038861022, 42.4219178082192,
44.0712328767123, 77.7068493150685, 70.5199279905761, 43.7178082191781,
77.7205479452055, 74.972602739726),
dummy = runif(21)), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame"))
df
# A tibble: 21 x 3
age gender dummy
<chr> <dbl> <dbl>
1 F 52.9 0.703
2 F 70.6 0.892
3 M 47.7 0.170
4 M 85.5 0.269
5 M 56.0 0.158
6 F 27.0 0.260
7 M 40.9 0.0818
8 M 24.7 0.586
9 F 80.4 0.563
10 F 55.7 0.995
# ... with 11 more rows
df %>% group_by(across(1)) %>%
filter(across(1, ~ mean(.) <= .))
# A tibble: 11 x 3
# Groups: age [2]
age gender dummy
<chr> <dbl> <dbl>
1 F 70.6 0.892
2 M 85.5 0.269
3 M 56.0 0.158
4 F 80.4 0.563
5 M 70.7 0.120
6 M 64.5 0.980
7 F 78.6 0.720
8 F 77.7 0.342
9 M 70.5 0.944
10 F 77.7 0.308
11 F 75.0 0.710
dplyr 中的 _at/_all 变体已被 across 取代。这是一种方法。
library(dplyr)
df %>%
group_by(across(1)) %>%
filter(cur_data()[[1]] > mean(cur_data()[[1]])) %>%
ungroup
# age gender
# <chr> <dbl>
# 1 F 70.6
# 2 M 85.5
# 3 M 56.0
# 4 F 80.4
# 5 M 70.7
# 6 M 64.5
# 7 F 78.6
# 8 F 77.7
# 9 M 70.5
#10 F 77.7
#11 F 75.0
在 filter 中使用了 [[1]],因为 cur_data() 中不包含分组列。