将具有不一致组宽度的数据框转换为具有同样大组的数据框

Transform a data frame with inconsistent group widths into one with equally large groups

我的目标是改造这样的东西:

df1 <- data.frame(
  value1 = c(100, 100, 100, 100, 100, 100, 100),
  #  value2=c(a, b, c, d, e, f, g),
  startgroup = c(1, 101, 351, 356, 401, 451, 451),
  endgroup = c(100, 350, 355, 400, 450, 450, 500),
  groupwidth = c(100, 250, 5, 40, 50, 0 , 50)
)

变成这样的东西:

df2 <- data.frame(
  value1 = c(100, 40, 40, 220, 300),
  #  value2=c(a, b*.4, b*.4, b*.2+c+d, d+e+f),
  startgroup = c(1, 101, 201, 301, 401),
  endgroup = c(100, 200, 300, 400, 500),
  groupwidth = c(100, 100, 100, 100, 100)
)

我已经设法用 for 循环做到了,但不知何故每个变量大约需要 5-10 分钟。不知道为什么艰难。我敢肯定,有一种简单的方法可以实现它。

library(tidyverse)
df1 %>%
  filter(groupwidth>0) %>%
  rowwise() %>%
  mutate(gr = list(c(rep(100, groupwidth %/%100), groupwidth %%100)),
         ln = length(gr)) %>%
  unnest(gr) %>%
  group_by(st=cumsum(gr-1) %/%100) %>%
  summarise(val = sum(value1/groupwidth * gr), 
            startgroup = st[1] * 100 + 1, 
            endgroup = startgroup + 99, 
            groupwidth = 100)

# A tibble: 5 x 5
     st   val startgroup endgroup groupwidth
  <dbl> <dbl>      <dbl>    <dbl>      <dbl>
1     0   100          1      100        100
2     1    40        101      200        100
3     2    40        201      300        100
4     3   220        301      400        100
5     4   200        401      500        100

如果有人想知道,我找到了一个有效的解决方案,它可以在不到 5 秒的时间内(对真实数据)起作用。解决方案最终也是一个 for 循环。

df1 <- data.frame(
  value1 = c(100, 100, 100, 100, 100, 100, 100),
  #  value2=c(a, b, c, d, e, f, g),
  startgroup = c(1, 101, 351, 356, 401, 451, 451),
  endgroup = c(100, 350, 355, 400, 450, 450, 500),
  groupwidth = c(100, 250, 5, 40, 50, 0 , 50)) %>% 
  mutate(startgroup = startgroup - 1)


rm <- max(df1$endgroup)%/%100+1

for (i in 1:rm){
  df1 <- df1 %>% 
    mutate(
      a = 0,
      a = ifelse(startgroup <= 100*i-100 & endgroup > 100*i, 
        a + value1/(endgroup-startgroup)*100, 
        a),
      a = ifelse(startgroup <= 100*i-100 & endgroup <= 100*i & endgroup > 100*i-100,  
        a + value1/(endgroup-startgroup)*(endgroup - (i-1)*100),
        a),
      a = ifelse(startgroup > 100*i-100 & endgroup <= 100*i,  
        a + value1,
        a),
      a = ifelse(startgroup > 100*i-100 & startgroup <= 100*i & endgroup > 100*i,  
        a + value1/(endgroup-startgroup)*(i*100 - startgroup),
        a),
      !!paste0(i*100) := a
    )
}
df1 <- df1 %>% 
    pivot_longer(
    cols = contains("00"),
    names_to = "upper_bound",
    values_to = "value",
    values_drop_na = TRUE) %>% 
  group_by(upper_bound) %>% 
  mutate(upper_bound = as.integer(upper_bound)) %>% 
  summarize(
    value = sum(value)
  )