将具有不一致组宽度的数据框转换为具有同样大组的数据框
Transform a data frame with inconsistent group widths into one with equally large groups
我的目标是改造这样的东西:
df1 <- data.frame(
value1 = c(100, 100, 100, 100, 100, 100, 100),
# value2=c(a, b, c, d, e, f, g),
startgroup = c(1, 101, 351, 356, 401, 451, 451),
endgroup = c(100, 350, 355, 400, 450, 450, 500),
groupwidth = c(100, 250, 5, 40, 50, 0 , 50)
)
变成这样的东西:
df2 <- data.frame(
value1 = c(100, 40, 40, 220, 300),
# value2=c(a, b*.4, b*.4, b*.2+c+d, d+e+f),
startgroup = c(1, 101, 201, 301, 401),
endgroup = c(100, 200, 300, 400, 500),
groupwidth = c(100, 100, 100, 100, 100)
)
我已经设法用 for 循环做到了,但不知何故每个变量大约需要 5-10 分钟。不知道为什么艰难。我敢肯定,有一种简单的方法可以实现它。
library(tidyverse)
df1 %>%
filter(groupwidth>0) %>%
rowwise() %>%
mutate(gr = list(c(rep(100, groupwidth %/%100), groupwidth %%100)),
ln = length(gr)) %>%
unnest(gr) %>%
group_by(st=cumsum(gr-1) %/%100) %>%
summarise(val = sum(value1/groupwidth * gr),
startgroup = st[1] * 100 + 1,
endgroup = startgroup + 99,
groupwidth = 100)
# A tibble: 5 x 5
st val startgroup endgroup groupwidth
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0 100 1 100 100
2 1 40 101 200 100
3 2 40 201 300 100
4 3 220 301 400 100
5 4 200 401 500 100
如果有人想知道,我找到了一个有效的解决方案,它可以在不到 5 秒的时间内(对真实数据)起作用。解决方案最终也是一个 for 循环。
df1 <- data.frame(
value1 = c(100, 100, 100, 100, 100, 100, 100),
# value2=c(a, b, c, d, e, f, g),
startgroup = c(1, 101, 351, 356, 401, 451, 451),
endgroup = c(100, 350, 355, 400, 450, 450, 500),
groupwidth = c(100, 250, 5, 40, 50, 0 , 50)) %>%
mutate(startgroup = startgroup - 1)
rm <- max(df1$endgroup)%/%100+1
for (i in 1:rm){
df1 <- df1 %>%
mutate(
a = 0,
a = ifelse(startgroup <= 100*i-100 & endgroup > 100*i,
a + value1/(endgroup-startgroup)*100,
a),
a = ifelse(startgroup <= 100*i-100 & endgroup <= 100*i & endgroup > 100*i-100,
a + value1/(endgroup-startgroup)*(endgroup - (i-1)*100),
a),
a = ifelse(startgroup > 100*i-100 & endgroup <= 100*i,
a + value1,
a),
a = ifelse(startgroup > 100*i-100 & startgroup <= 100*i & endgroup > 100*i,
a + value1/(endgroup-startgroup)*(i*100 - startgroup),
a),
!!paste0(i*100) := a
)
}
df1 <- df1 %>%
pivot_longer(
cols = contains("00"),
names_to = "upper_bound",
values_to = "value",
values_drop_na = TRUE) %>%
group_by(upper_bound) %>%
mutate(upper_bound = as.integer(upper_bound)) %>%
summarize(
value = sum(value)
)
我的目标是改造这样的东西:
df1 <- data.frame(
value1 = c(100, 100, 100, 100, 100, 100, 100),
# value2=c(a, b, c, d, e, f, g),
startgroup = c(1, 101, 351, 356, 401, 451, 451),
endgroup = c(100, 350, 355, 400, 450, 450, 500),
groupwidth = c(100, 250, 5, 40, 50, 0 , 50)
)
变成这样的东西:
df2 <- data.frame(
value1 = c(100, 40, 40, 220, 300),
# value2=c(a, b*.4, b*.4, b*.2+c+d, d+e+f),
startgroup = c(1, 101, 201, 301, 401),
endgroup = c(100, 200, 300, 400, 500),
groupwidth = c(100, 100, 100, 100, 100)
)
我已经设法用 for 循环做到了,但不知何故每个变量大约需要 5-10 分钟。不知道为什么艰难。我敢肯定,有一种简单的方法可以实现它。
library(tidyverse)
df1 %>%
filter(groupwidth>0) %>%
rowwise() %>%
mutate(gr = list(c(rep(100, groupwidth %/%100), groupwidth %%100)),
ln = length(gr)) %>%
unnest(gr) %>%
group_by(st=cumsum(gr-1) %/%100) %>%
summarise(val = sum(value1/groupwidth * gr),
startgroup = st[1] * 100 + 1,
endgroup = startgroup + 99,
groupwidth = 100)
# A tibble: 5 x 5
st val startgroup endgroup groupwidth
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0 100 1 100 100
2 1 40 101 200 100
3 2 40 201 300 100
4 3 220 301 400 100
5 4 200 401 500 100
如果有人想知道,我找到了一个有效的解决方案,它可以在不到 5 秒的时间内(对真实数据)起作用。解决方案最终也是一个 for 循环。
df1 <- data.frame(
value1 = c(100, 100, 100, 100, 100, 100, 100),
# value2=c(a, b, c, d, e, f, g),
startgroup = c(1, 101, 351, 356, 401, 451, 451),
endgroup = c(100, 350, 355, 400, 450, 450, 500),
groupwidth = c(100, 250, 5, 40, 50, 0 , 50)) %>%
mutate(startgroup = startgroup - 1)
rm <- max(df1$endgroup)%/%100+1
for (i in 1:rm){
df1 <- df1 %>%
mutate(
a = 0,
a = ifelse(startgroup <= 100*i-100 & endgroup > 100*i,
a + value1/(endgroup-startgroup)*100,
a),
a = ifelse(startgroup <= 100*i-100 & endgroup <= 100*i & endgroup > 100*i-100,
a + value1/(endgroup-startgroup)*(endgroup - (i-1)*100),
a),
a = ifelse(startgroup > 100*i-100 & endgroup <= 100*i,
a + value1,
a),
a = ifelse(startgroup > 100*i-100 & startgroup <= 100*i & endgroup > 100*i,
a + value1/(endgroup-startgroup)*(i*100 - startgroup),
a),
!!paste0(i*100) := a
)
}
df1 <- df1 %>%
pivot_longer(
cols = contains("00"),
names_to = "upper_bound",
values_to = "value",
values_drop_na = TRUE) %>%
group_by(upper_bound) %>%
mutate(upper_bound = as.integer(upper_bound)) %>%
summarize(
value = sum(value)
)