按组获取最高值

Getting the top values by group

这是一个示例数据框:

d <- data.frame(
  x   = runif(90),
  grp = gl(3, 30)
) 

我想要 d 的子集,其中包含 grp 的每个值的前 5 个值 x 的行。

使用 base-R,我的方法是这样的:

ordered <- d[order(d$x, decreasing = TRUE), ]    
splits <- split(ordered, ordered$grp)
heads <- lapply(splits, head)
do.call(rbind, heads)
##              x grp
## 1.19 0.8879631   1
## 1.4  0.8844818   1
## 1.12 0.8596197   1
## 1.26 0.8481809   1
## 1.18 0.8461516   1
## 1.29 0.8317092   1
## 2.31 0.9751049   2
## 2.34 0.9269764   2
## 2.57 0.8964114   2
## 2.58 0.8896466   2
## 2.45 0.8888834   2
## 2.35 0.8706823   2
## 3.74 0.9884852   3
## 3.73 0.9837653   3
## 3.83 0.9375398   3
## 3.64 0.9229036   3
## 3.69 0.8021373   3
## 3.86 0.7418946   3

使用 dplyr,我希望它能工作:

d %>%
  arrange_(~ desc(x)) %>%
  group_by_(~ grp) %>%
  head(n = 5)

但它只 returns 总体前 5 行。

head 换成 top_n returns 整个 d

d %>%
  arrange_(~ desc(x)) %>%
  group_by_(~ grp) %>%
  top_n(n = 5)

如何获得正确的子集?

您需要在对 do 的调用中包装 head。在下面的代码中,. 代表当前组(参见 do 帮助页面中对 ... 的描述)。

d %>%
  arrange_(~ desc(x)) %>%
  group_by_(~ grp) %>%
  do(head(., n = 5))

如 akrun 所述,slice 是替代方案。

d %>%
  arrange_(~ desc(x)) %>%
  group_by_(~ grp) %>%
  slice(1:5)

虽然我没有问这个,但为了完整性,一个可能的 data.table 版本是(感谢@Arun 的修复):

setDT(d)[order(-x), head(.SD, 5), by = grp]

我在 base R 中的方法是:

ordered <- d[order(d$x, decreasing = TRUE), ]
ordered[ave(d$x, d$grp, FUN = seq_along) <= 5L,]

并且使用 dplyr,slice 的方法可能是最快的,但您也可以使用 filter,这可能比使用 do(head(., 5)):

更快
d %>% 
  arrange(desc(x)) %>%
  group_by(grp) %>%
  filter(row_number() <= 5L)

dplyr 基准测试

set.seed(123)
d <- data.frame(
  x   = runif(1e6),
  grp = sample(1e4, 1e6, TRUE))

library(microbenchmark)

microbenchmark(
  top_n = {d %>%
             group_by(grp) %>%
             top_n(n = 5, wt = x)},
  dohead = {d %>%
              arrange_(~ desc(x)) %>%
              group_by_(~ grp) %>%
              do(head(., n = 5))},
  slice = {d %>%
             arrange_(~ desc(x)) %>%
             group_by_(~ grp) %>%
             slice(1:5)},
  filter = {d %>% 
              arrange(desc(x)) %>%
              group_by(grp) %>%
              filter(row_number() <= 5L)},
  times = 10,
  unit = "relative"
)

Unit: relative
   expr       min        lq    median        uq       max neval
  top_n  1.042735  1.075366  1.082113  1.085072  1.000846    10
 dohead 18.663825 19.342854 19.511495 19.840377 17.433518    10
  slice  1.000000  1.000000  1.000000  1.000000  1.000000    10
 filter  1.048556  1.044113  1.042184  1.180474  1.053378    10

data.table 也很容易...

library(data.table)
setorder(setDT(d), -x)[, head(.SD, 5), keyby = grp]

或者

setorder(setDT(d), grp, -x)[, head(.SD, 5), by = grp]

或者(对于大数据集应该更快,因为避免为每个组调用 .SD

setorder(setDT(d), grp, -x)[, indx := seq_len(.N), by = grp][indx <= 5]

编辑: 下面是 dplyrdata.table 的比较(如果有人感兴趣的话)

set.seed(123)
d <- data.frame(
  x   = runif(1e6),
  grp = sample(1e4, 1e6, TRUE))

library(dplyr)
library(microbenchmark)
library(data.table)
dd <- copy(d)

microbenchmark(
  top_n = {d %>%
             group_by(grp) %>%
             top_n(n = 5, wt = x)},
  dohead = {d %>%
              arrange_(~ desc(x)) %>%
              group_by_(~ grp) %>%
              do(head(., n = 5))},
  slice = {d %>%
             arrange_(~ desc(x)) %>%
             group_by_(~ grp) %>%
             slice(1:5)},
  filter = {d %>% 
              arrange(desc(x)) %>%
              group_by(grp) %>%
              filter(row_number() <= 5L)},
  data.table1 = setorder(setDT(dd), -x)[, head(.SD, 5L), keyby = grp],
  data.table2 = setorder(setDT(dd), grp, -x)[, head(.SD, 5L), grp],
  data.table3 = setorder(setDT(dd), grp, -x)[, indx := seq_len(.N), grp][indx <= 5L],
  times = 10,
  unit = "relative"
)


#        expr        min         lq      mean     median        uq       max neval
#       top_n  24.246401  24.492972 16.300391  24.441351 11.749050  7.644748    10
#      dohead 122.891381 120.329722 77.763843 115.621635 54.996588 34.114738    10
#       slice  27.365711  26.839443 17.714303  26.433924 12.628934  7.899619    10
#      filter  27.755171  27.225461 17.936295  26.363739 12.935709  7.969806    10
# data.table1  13.753046  16.631143 10.775278  16.330942  8.359951  5.077140    10
# data.table2  12.047111  11.944557  7.862302  11.653385  5.509432  3.642733    10
# data.table3   1.000000   1.000000  1.000000   1.000000  1.000000  1.000000    10

添加稍微快一点的 data.table 解决方案:

set.seed(123L)
d <- data.frame(
    x   = runif(1e8),
    grp = sample(1e4, 1e8, TRUE))
setDT(d)
setorder(d, grp, -x)
dd <- copy(d)

library(microbenchmark)
microbenchmark(
    data.table3 = d[, indx := seq_len(.N), grp][indx <= 5L],
    data.table4 = dd[dd[, .I[seq_len(.N) <= 5L], grp]$V1],
    times = 10L
)

定时输出:

Unit: milliseconds
        expr      min       lq     mean   median        uq      max neval
 data.table3 826.2148 865.6334 950.1380 902.1689 1006.1237 1260.129    10
 data.table4 729.3229 783.7000 859.2084 823.1635  966.8239 1014.397    10

来自dplyr 1.0.0、"slice_min()slice_max() select具有变量最小值或最大值的行,从令人困惑的top_n().

d %>% group_by(grp) %>% slice_max(order_by = x, n = 5)
# # A tibble: 15 x 2
# # Groups:   grp [3]
#     x grp  
# <dbl> <fct>
#  1 0.994 1    
#  2 0.957 1    
#  3 0.955 1    
#  4 0.940 1    
#  5 0.900 1    
#  6 0.963 2    
#  7 0.902 2    
#  8 0.895 2    
#  9 0.858 2    
# 10 0.799 2    
# 11 0.985 3    
# 12 0.893 3    
# 13 0.886 3    
# 14 0.815 3    
# 15 0.812 3

dplyr 1.0.0使用top_n:

来自?top_n,关于wt参数:

The variable to use for ordering [...] defaults to the last variable in the tbl".

数据集中的最后一个变量是“grp”,这不是您希望排名的变量,这就是为什么您 top_n 尝试“returns 整个 d”。因此,如果您希望在数据集中按“x”排名,则需要指定 wt = x.

d %>%
  group_by(grp) %>%
  top_n(n = 5, wt = x)

数据:

set.seed(123)
d <- data.frame(
  x = runif(90),
  grp = gl(3, 30))
如果 ordering 变量在每个组中不是唯一的,

top_n(n = 1) 仍将为每个组 return 多行。为了 select 每组恰好出现一次,向每一行添加一个唯一变量:

set.seed(123)
d <- data.frame(
  x   = runif(90),
  grp = gl(3, 30))

d %>%
  mutate(rn = row_number()) %>% 
  group_by(grp) %>%
  top_n(n = 1, wt = rn)

另一个 data.table 解决方案以突出其简洁的语法:

setDT(d)
d[order(-x), .SD[1:5], grp]