按位置访问列加上条件过滤器

access columns by position plus conditional filter

我有以下数据框:

# Example data for the question.
# Note: the column names are swapped relative to their contents -- `age`
# holds the "F"/"M" codes and `gender` holds the numeric values that the
# question treats as ages.
df <- structure(
  list(
    age = c(
      "F", "F", "M", "M", "M", "F", "M", "M", "F", "F", "M",
      "M", "F", "F", "F", "F", "F", "M", "M", "F", "F"
    ),
    gender = c(
      52.8547945205479, 70.617475870193, 47.6986301369863,
      85.4876712328767, 56.0288204261033, 27.0219178082192,
      40.8583963494959, 24.6553462722298, 80.4027397260274,
      55.6684931506849, 70.6904109589041, 64.5095890410959,
      45.5397260273973, 78.5909038861022, 42.4219178082192,
      44.0712328767123, 77.7068493150685, 70.5199279905761,
      43.7178082191781, 77.7205479452055, 74.972602739726
    )
  ),
  row.names = c(NA, -21L),
  class = c("tbl_df", "tbl", "data.frame")
)

我想按性别分组,筛选出年龄大于该性别平均年龄的行,并且希望通过列号而不是列名来完成。(注意:示例数据的列名与内容是颠倒的——`age` 列存放的是性别,`gender` 列存放的是年龄。)

所以我尝试了:

# Asker's original attempt, reproduced verbatim; it does not run as written
# (the call to filter_at() is never closed).
df %>% group_by_at(1) %>% filter_at(vars(2) > mean(vars(2))

但这不起作用。

有什么建议吗?

按某一列分组后,分组列会被排除在后续的列选择之外,因此 filter 中的位置 1 指的是第一个非分组列(即原数据的第 2 列)。试试这个:

# Group by the first column, then keep the rows whose value in the first
# non-grouping column is strictly greater than that group's mean.
# Grouping columns are excluded from the selection, so position 1 inside
# if_all() refers to the original second column.
# if_all() is used because across() inside filter() is deprecated.
df %>%
  group_by(across(1)) %>%
  filter(if_all(1, ~ .x > mean(.x)))

# A tibble: 11 x 2
# Groups:   age [2]
   age   gender
   <chr>  <dbl>
 1 F       70.6
 2 M       85.5
 3 M       56.0
 4 F       80.4
 5 M       70.7
 6 M       64.5
 7 F       78.6
 8 F       77.7
 9 M       70.5
10 F       77.7
11 F       75.0

针对评论中补充的问题(数据框包含多列时的情况),下面在原数据上增加一个虚拟列 `dummy` 来演示。

# Same example data as before, extended with a random `dummy` column to show
# the behaviour when more than one non-grouping column is present.
# runif() is unseeded, so the `dummy` values differ on every run.
df <- structure(
  list(
    age = c(
      "F", "F", "M", "M", "M", "F", "M", "M", "F", "F", "M",
      "M", "F", "F", "F", "F", "F", "M", "M", "F", "F"
    ),
    gender = c(
      52.8547945205479, 70.617475870193, 47.6986301369863,
      85.4876712328767, 56.0288204261033, 27.0219178082192,
      40.8583963494959, 24.6553462722298, 80.4027397260274,
      55.6684931506849, 70.6904109589041, 64.5095890410959,
      45.5397260273973, 78.5909038861022, 42.4219178082192,
      44.0712328767123, 77.7068493150685, 70.5199279905761,
      43.7178082191781, 77.7205479452055, 74.972602739726
    ),
    dummy = runif(21)
  ),
  row.names = c(NA, -21L),
  class = c("tbl_df", "tbl", "data.frame")
)

df
# A tibble: 21 x 3
   age   gender  dummy
   <chr>  <dbl>  <dbl>
 1 F       52.9 0.703 
 2 F       70.6 0.892 
 3 M       47.7 0.170 
 4 M       85.5 0.269 
 5 M       56.0 0.158 
 6 F       27.0 0.260 
 7 M       40.9 0.0818
 8 M       24.7 0.586 
 9 F       80.4 0.563 
10 F       55.7 0.995 
# ... with 11 more rows

# Same filter as above, applied to the three-column data: group by the first
# column and keep rows whose first non-grouping column is strictly greater
# than its group mean. Position 1 still refers to the original second column
# because grouping columns are excluded from the selection; `dummy` is
# untouched.
# if_all() is used because across() inside filter() is deprecated.
df %>%
  group_by(across(1)) %>%
  filter(if_all(1, ~ .x > mean(.x)))

# A tibble: 11 x 3
# Groups:   age [2]
   age   gender dummy
   <chr>  <dbl> <dbl>
 1 F       70.6 0.892
 2 M       85.5 0.269
 3 M       56.0 0.158
 4 F       80.4 0.563
 5 M       70.7 0.120
 6 M       64.5 0.980
 7 F       78.6 0.720
 8 F       77.7 0.342
 9 M       70.5 0.944
10 F       77.7 0.308
11 F       75.0 0.710

dplyr 中的 _at/_all 变体已被 across 取代。这是一种方法。

library(dplyr)

# Group by the first column, keep the rows whose first non-grouping column
# exceeds its group mean, then drop the grouping.
# NOTE(review): cur_data() is deprecated as of dplyr 1.1.0 in favour of
# pick(); it is kept here so the snippet still runs on older dplyr versions.
df %>%
  group_by(across(1)) %>%
  filter(mean(cur_data()[[1]]) < cur_data()[[1]]) %>%
  ungroup()

#   age   gender
#   <chr>  <dbl>
# 1 F       70.6
# 2 M       85.5
# 3 M       56.0
# 4 F       80.4
# 5 M       70.7
# 6 M       64.5
# 7 F       78.6
# 8 F       77.7
# 9 M       70.5
#10 F       77.7
#11 F       75.0

filter 中使用的是 [[1]](而不是 [[2]]),因为 cur_data() 返回的当前分组数据不包含分组列,原数据的第 2 列在其中排在第 1 位。