如何根据R中的条件提取最小值和最大值

How to extract minimum and maximum values based on conditions in R

我有一个包含数千行的数据框,需要对属于同一 group 和 class 的每一段数据输出最小值和最大值。具体来说:读取第一个 start 值,将其与 end 列中的前一个值进行比较;如果较小,则跳到下一行,依此类推,直到 start 值大于前一个 end 值为止,然后输出该段的最小 start 值和最大 end 值。我的数据已经按 group-class-start-end 排序。

# Sample data: 20 rows in a single group, split across two classes
# (7 rows of class "2", then 13 rows of class "3"). All four columns,
# including the start/end coordinates, are supplied as quoted strings.
df <- data.frame(
  group = rep("1", 20),
  class = rep(c("2", "3"), times = c(7, 13)),
  start = c(
    "23477018", "23535465", "23567386", "24708741", "24708741", "24708741",
    "48339885", "87274", "87274", "127819", "1832772", "1832772", "1832772",
    "6733569", "7005524", "7005524", "7644572", "8095433", "8095433",
    "8095433"
  ),
  end = c(
    "47341413", "47341413", "47909872", "42247834", "47776347", "47909872",
    "53818713", "3161655", "3479466", "3503792", "3503792", "4916249",
    "5329014", "8089225", "12037894", "13934484", "12037894", "12037894",
    "13626119", "13934484"
  )
)

我想要实现的输出是:

  group     class   start     end     
1   1       2    23477018   47909872
2   1       2    48339885   53818713
3   1       3    87274      5329014
4   1       3    6733569    13934484

任何关于如何实现这一点的想法将不胜感激。

我为此使用了 data.table。
我的做法是先把start和end改成整数,不然顺序会出问题。
先找出满足 start > max(之前所有 end 值) 的行,再用 cumsum 为这些行生成递增的子组编号。
之后只需按子组分别取 start 的最小值和 end 的最大值即可。
整个过程不使用循环,因此速度尽可能快。

library(data.table)
# Sample data: 20 rows in a single group, split across two classes. All
# columns, including the start/end coordinates, are supplied as strings.
df <- data.frame(
  group = rep("1", 20),
  class = rep(c("2", "3"), times = c(7, 13)),
  start = c(
    "23477018", "23535465", "23567386", "24708741", "24708741", "24708741",
    "48339885", "87274", "87274", "127819", "1832772", "1832772", "1832772",
    "6733569", "7005524", "7005524", "7644572", "8095433", "8095433",
    "8095433"
  ),
  end = c(
    "47341413", "47341413", "47909872", "42247834", "47776347", "47909872",
    "53818713", "3161655", "3479466", "3503792", "3503792", "4916249",
    "5329014", "8089225", "12037894", "13934484", "12037894", "12037894",
    "13626119", "13934484"
  )
)

# Turn df into a data.table by reference (no copy).
setDT(df)

# The coordinates arrive as strings; convert them to integers so that
# comparisons, min() and max() are numeric rather than lexicographic.
df[, `:=`(start = as.integer(start), end = as.integer(end))]

# Within each group/class, a row opens a new sub-group when its start is
# greater than the running maximum of every earlier end value. The shifted
# running maximum is padded with 0 for the first row of each group/class.
# cumsum() over that flag yields an increasing sub-group number.
df[
  ,
  subgrp := cumsum(start > shift(cummax(end), n = 1L, fill = 0, type = "lag")),
  keyby = .(group, class)
]

# Collapse every sub-group to its smallest start and largest end.
ans <- df[
  ,
  .(start = min(start), end = max(end)),
  keyby = .(group, class, subgrp)
]

# Drop the helper column by reference; the trailing [] prints the result.
ans[, subgrp := NULL][]

   group class    start      end
1:     1     2 23477018 47909872
2:     1     2 48339885 53818713
3:     1     3    87274  5329014
4:     1     3  6733569 13934484

这是一个 tidyverse 解决方案:

library(tidyverse)
            
# Sample data: 20 rows in a single group, split across two classes. All
# columns, including the start/end coordinates, are supplied as strings.
df <- data.frame(
  group = rep("1", 20),
  class = rep(c("2", "3"), times = c(7, 13)),
  start = c(
    "23477018", "23535465", "23567386", "24708741", "24708741", "24708741",
    "48339885", "87274", "87274", "127819", "1832772", "1832772", "1832772",
    "6733569", "7005524", "7005524", "7644572", "8095433", "8095433",
    "8095433"
  ),
  end = c(
    "47341413", "47341413", "47909872", "42247834", "47776347", "47909872",
    "53818713", "3161655", "3479466", "3503792", "3503792", "4916249",
    "5329014", "8089225", "12037894", "13934484", "12037894", "12037894",
    "13626119", "13934484"
  )
)

# Merge overlapping [start, end] ranges within each group/class and report
# the smallest start and largest end of every merged section.
#
# Input rows are expected to be ordered by group, class, start, end.
# Note: as.integer() returns NA for values above .Machine$integer.max.
merged <- df %>%
  group_by(group, class) %>%
  mutate(
    # The coordinates arrive as strings; convert so comparisons are numeric.
    start = as.integer(start),
    end = as.integer(end),
    # Largest end seen on ANY earlier row of this group/class (NA on the
    # first row). Comparing against only the previous row's end would split
    # a section too early when a short range is nested inside an earlier,
    # longer one.
    prev_max_end = lag(cummax(end)),
    # A row opens a new section when it starts after everything seen so far;
    # the first row (NA comparison) stays in section 0.
    sub_group = cumsum(coalesce(start > prev_max_end, FALSE))
  ) %>%
  group_by(group, class, sub_group) %>%
  summarise(
    start = min(start),
    end = max(end),
    .groups = "drop"
  ) %>%
  select(-sub_group)

# Print the merged ranges.
merged
 # A tibble: 4 x 4
   group class    start      end
   <chr> <chr>    <int>    <int>
 1 1     2     23477018 47909872
 2 1     2     48339885 53818713
 3 1     3        87274  5329014
 4 1     3      6733569 13934484