按预定义的最大组和对数值向量进行分组

Question

我有一个像这样的数字向量 x <- c(1, 23, 7, 10, 9, 2, 4)，我想从左到右对元素进行分组，并限制每组总和不得超过 25。因此，这里第一组是 c(1, 23)，第二组是 c(7, 10)，最后一组是 c(9, 2, 4)。预期输出是一个数据框，第二列包含组：

data.frame(x= c(1, 23,  7,  10,  9,  2,  4), group= c(1, 1, 2, 2, 3, 3, 3))

我已经用 cumsum 尝试了不同的方法，但是一旦达到最后一组 25 的限制总和，我就无法为新组动态重启 cumsum。

Answer 1

我认为cpp函数是最快的方法：

library(Rcpp)
cppFunction(
    "IntegerVector GroupBySum(const NumericVector& x, const double& max_sum = 25)
    {
        double sum = 0;
        int cnt = 0;
        int period = 1;
        IntegerVector res(x.size());
        for (int i = 0; i < x.size(); ++i)
        {
            ++cnt;
            sum += x[i];
            if (sum > max_sum)
            {
                sum = x[i];
                if (cnt > 1)
                    ++period;
                cnt = 1;
            }
            res[i] = period;
        }
        return res;
    }"
)
GroupBySum(c(1, 23,  7,  10,  9,  2,  4), 25)

Answer 2

这是一个使用基数 R 和 cumsum（以及 lapply 进行迭代）的解决方案：

id <- c(seq(1, length(x),1)[!duplicated(cumsum(x) %/% 25)], length(x)+1)
id2 <- 1:length(id)
group <- unlist(lapply(1:(length(id)-1), function(x) rep(id2[x], diff(id)[x])))
data.frame(x=x, group=group)

   x group
1  1     1
2 23     1
3  7     2
4 10     2
5  9     3
6  2     3
7  4     3

编辑：使用递归函数的新方法

这是一种更有效的新方法，它还应涵盖@ЕгорШишунов 考虑的特殊情况，并且应该有效地工作，因为它被编写为递归函数。

 recursiveFunction<- function(x, maxN=25, sumX=0, period=1, period2return=c()){
      sumX <- sumX + x[1]
      if (sumX >= maxN) { sumX=x[1]; period = period + 1}
      period2return <- c(period2return, period)
      if (length(x) == 1) { return(period2return)}
      return(recursiveFunction(x[-1], 25, sumX, period, period2return))
    }
    
    recursiveFunction(x, maxN=25)

请注意，您不应更改最后三个函数参数 (sumX=0, period=1, period2return=c()) 的条目，因为它们仅在函数的递归调用期间很重要。

Answer 3

您可以使用 MESS 包中的 cumsumbinning built-in 函数：

# install.packages("MESS")
MESS::cumsumbinning(x, 25, cutwhenpassed = F)
# [1] 1 1 2 2 3 3 3

或者可以用 purrr::accumulate:

cumsum(x == accumulate(x, ~ifelse(.x + .y <= 25, .x + .y, .y)))
# [1] 1 1 2 2 3 3 3

输出

group <- MESS::cumsumbinning(x, 25, cutwhenpassed = F)
data.frame(x= c(1, 23,  7,  10,  9,  2,  4), 
           group = group)

   x group
1  1     1
2 23     1
3  7     2
4 10     2
5  9     3
6  2     3
7  4     3

快速基准测试：

x<- c(1, 23,  7,  10,  9,  2,  4)
bm <- microbenchmark(
  fThomas(x),
  fThomasRec(x),
  fJKupzig(x), 
  fCumsumbinning(x), 
  fAccumulate(x),
  fReduce(x),
  fRcpp(x),
  times = 100L,
  setup = gc(FALSE)
)
autoplot(bm)

Егор Шишунов 的 Rcpp 是最快的，紧随其后的是 MESS::cumsumbinning 和 ThomasIsCoding 的两个函数。

与 n = 100 相比，差距变大了，但 Rcpp 和 cumsumbinning 仍然是首选，while 循环选项不再有效（我不得不删除 ThomasIsCoding 的函数，因为执行时间太长):

x = runif(100, 1, 50)

Answer 4

在基础 R 中你也可以使用 Reduce:

do.call(rbind, Reduce(\(x,y) if((z<-x[1] + y) > 25) c(y, x[2]+1)
       else c(z, x[2]), x[-1], init = c(x[1], 1), accumulate = TRUE))

     [,1] [,2]
[1,]    1    1
[2,]   24    1
[3,]    7    2
[4,]   17    2
[5,]    9    3
[6,]   11    3
[7,]   15    3

分解：

f <- function(x, y){
  z <- x[1] + y
  if(z > 25) c(y, x[2] + 1)
  else c(z, x[2])
}

do.call(rbind, Reduce(f, x[-1], init = c(x[1], 1), accumulate = TRUE))

如果使用 accumulate

library(tidyverse)
accumulate(x[-1], f, .init = c(x[1], 1)) %>%
invoke(rbind, .)

     [,1] [,2]
[1,]    1    1
[2,]   24    1
[3,]    7    2
[4,]   17    2
[5,]    9    3
[6,]   11    3
[7,]   15    3

Answer 5

如果您愿意，我们可以将此作为编程练习来尝试:)

f1 <- function(x) {
  group <- c()
  while (length(x)) {
    idx <- cumsum(x) <= 25
    x <- x[!idx]
    group <- c(group, rep(max(group, 0) + 1, sum(idx)))
  }
  group
}

或

f2 <- function(x) {
  group <- c()
  g <- 0
  while (length(x)) {
    cnt <- s <- 0
    for (i in seq_along(x)) {
      s <- s + x[i]
      if (s <= 25) {
        cnt <- cnt + 1
      } else {
        break
      }
    }
    g <- g + 1
    group <- c(group, rep(g, cnt))
    x <- x[-(1:cnt)]
  }
  group
}

或

f3 <- function(x) {
  s <- cumsum(x)
  r <- c()
  grp <- 1
  while (length(s)) {
    idx <- (s <= 25)
    r <- c(r, rep(grp, sum(idx)))
    grp <- grp + 1
    s <- s[!idx] - tail(s[idx], 1)
  }
  r
}

这给出了

[1] 1 1 2 2 3 3 3

他们之间的基准测试看起来像

set.seed(1)
set.seed(1)
x <- runif(1e3, 0, 25)
bm <- microbenchmark(
  f1(x),
  f2(x),
  f3(x),
  check = "equivalent"
)
autoplot(bm)

递归版本

另一种选择是使用递归（基于f1()）

f <- function(x, res = c()) {
  if (!length(x)) {
    return(res)
  }
  idx <- cumsum(x) <= 25
  Recall(x[!idx], res = c(res, list(x[idx])))
}

你会看到

> f(x)
[[1]]
[1]  1 23

[[2]]
[1]  7 10

[[3]]
[1] 9 2 4

按预定义的最大组和对数值向量进行分组

Group numeric vector by predefined maximal group sum

grouping

r

vector

cumulative-sum

cumsum

递归版本