计算 R 中子组总数的变化
Calculate changes in totals of subgroups in R
我有以下数据框:
# A tibble: 8 x 5
Year Group Unit Profit Sales
<dbl> <chr> <chr> <dbl> <dbl>
1 2021 One A 20 70
2 2021 One B 10 40
3 2021 One C 5 25
4 2021 Two D 15 50
5 2022 One A NaN 50
6 2022 One B NaN 55
7 2022 One E NaN 150
8 2022 Two D NaN 60
而且我希望看到每个组的销售额增长。问题是这些组没有相同的单位。因此,为了估计增长/下降,我只想计算仍然存在的单位的增长。
所以想要的结果是这样的:
# A tibble: 8 x 8
Year Group Unit Profit Sales Sales_group_comparison_next_year Sales_group_compariso~ Group_growth_ra~
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021 One A 20 70 110 NaN NaN
2 2021 One B 10 40 110 NaN NaN
3 2021 One C 5 25 110 NaN NaN
4 2021 Two D 15 50 50 NaN NaN
5 2022 One A NaN 50 NaN 105 0.955
6 2022 One B NaN 55 NaN 105 0.955
7 2022 One E NaN 150 NaN 105 0.955
8 2022 Two D NaN 60 NaN 60 1.2
为了实现这一点,我编写了以下代码。这行得通,但是代码很多。所以我的问题是,有没有更聪明/更优雅的方法来做到这一点?
test2 <- test %>%
group_by(
Unit
) %>%
mutate(
Years_present = list(min(Year):max(Year))
) %>%
ungroup() %>%
mutate(
Present_next_year = map2(Year, Years_present, ~ .x %in% .y & (.x + 1) %in% .y),
Present_last_year = map2(Year, Years_present, ~ .x %in% .y & (.x - 1) %in% .y),
Present_next_year = ifelse(
Present_next_year == TRUE,
Sales,
0
),
Present_last_year = ifelse(
Present_last_year == TRUE,
Sales,
0
)
) %>%
group_by(
Group, Year
) %>%
mutate(
Sales_group_comparison_next_year = sum(Present_next_year),
Sales_group_comparison_last_year = sum(Present_last_year),
) %>%
ungroup() %>%
group_by(Unit) %>%
mutate(
Sales_group_comparison_lagged =
dplyr::lag(
x = Sales_group_comparison_next_year,
n = 1,
default = NaN,
order_by = Year
),
) %>%
ungroup() %>%
rowwise() %>%
mutate(
Group_growth_rate = Sales_group_comparison_last_year / Sales_group_comparison_lagged
) %>%
ungroup() %>%
group_by(Group, Year) %>%
mutate(Group_growth_rate = ifelse(sum(is.na(Group_growth_rate)) == length(Group_growth_rate),
NaN,
max(Group_growth_rate, na.rm = TRUE))
) %>%
ungroup()
谢谢!
这是一种使用一些更简单的 dplyr 数学和 self-join 来提取每个组去年的总数的方法。
library(dplyr)
data_with_2023 %>%
group_by(Group, Unit) %>%
mutate(in_next_yr = lead(Year, default = 0) == Year + 1,
in_last_yr = lag(Year, default = 0) == Year - 1) %>%
group_by(Year, Group) %>%
mutate(total_in_next_yr = sum(Sales*in_next_yr),
total_in_prior_yr = sum(Sales*in_last_yr)) %>%
ungroup() -> data_totals
left_join(data_totals,
data_totals %>%
distinct(Year = Year +1,
Group,
last_yr_total = total_in_next_yr)) %>%
mutate(growth = total_in_prior_yr / last_yr_total)
结果
Joining, by = c("Year", "Group")
# A tibble: 10 × 11
Year Group Unit Profit Sales in_next_yr in_last_yr total_in_next_yr total_in_prior_yr last_yr_total growth
<dbl> <chr> <chr> <chr> <int> <lgl> <lgl> <int> <int> <int> <dbl>
1 2021 One A 20 70 TRUE FALSE 110 0 NA NA
2 2021 One B 10 40 TRUE FALSE 110 0 NA NA
3 2021 One C 5 25 FALSE FALSE 110 0 NA NA
4 2021 Two D 15 50 TRUE FALSE 50 0 NA NA
5 2022 One A NaN 50 FALSE TRUE 205 105 110 0.955
6 2022 One B NaN 55 TRUE TRUE 205 105 110 0.955
7 2022 One E NaN 150 TRUE FALSE 205 105 110 0.955
8 2022 Two D NaN 60 FALSE TRUE 0 60 50 1.2
9 2023 One B NaN 70 FALSE TRUE 0 150 205 0.732
10 2023 One E NaN 80 FALSE TRUE 0 150 205 0.732
源数据
data_with_2023 <- data.frame(
stringsAsFactors = FALSE,
Year = c(2021L, 2021L, 2021L, 2021L, 2022L, 2022L, 2022L, 2022L, 2023L, 2023L),
Group = c("One", "One", "One", "Two", "One", "One", "One", "Two", "One", "One"),
Unit = c("A", "B", "C", "D", "A", "B", "E", "D", "B", "E"),
Profit = c("20", "10", "5", "15", "NaN", "NaN", "NaN", "NaN", "NaN", "NaN"),
Sales = c(70L, 40L, 25L, 50L, 50L, 55L, 150L, 60L, 70L, 80L)
)
我有以下数据框:
# A tibble: 8 x 5
Year Group Unit Profit Sales
<dbl> <chr> <chr> <dbl> <dbl>
1 2021 One A 20 70
2 2021 One B 10 40
3 2021 One C 5 25
4 2021 Two D 15 50
5 2022 One A NaN 50
6 2022 One B NaN 55
7 2022 One E NaN 150
8 2022 Two D NaN 60
而且我希望看到每个组的销售额增长。问题是这些组没有相同的单位。因此,为了估计增长/下降,我只想计算仍然存在的单位的增长。
所以想要的结果是这样的:
# A tibble: 8 x 8
Year Group Unit Profit Sales Sales_group_comparison_next_year Sales_group_compariso~ Group_growth_ra~
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021 One A 20 70 110 NaN NaN
2 2021 One B 10 40 110 NaN NaN
3 2021 One C 5 25 110 NaN NaN
4 2021 Two D 15 50 50 NaN NaN
5 2022 One A NaN 50 NaN 105 0.955
6 2022 One B NaN 55 NaN 105 0.955
7 2022 One E NaN 150 NaN 105 0.955
8 2022 Two D NaN 60 NaN 60 1.2
为了实现这一点,我编写了以下代码。这行得通,但是代码很多。所以我的问题是,有没有更聪明/更优雅的方法来做到这一点?
test2 <- test %>%
group_by(
Unit
) %>%
mutate(
Years_present = list(min(Year):max(Year))
) %>%
ungroup() %>%
mutate(
Present_next_year = map2(Year, Years_present, ~ .x %in% .y & (.x + 1) %in% .y),
Present_last_year = map2(Year, Years_present, ~ .x %in% .y & (.x - 1) %in% .y),
Present_next_year = ifelse(
Present_next_year == TRUE,
Sales,
0
),
Present_last_year = ifelse(
Present_last_year == TRUE,
Sales,
0
)
) %>%
group_by(
Group, Year
) %>%
mutate(
Sales_group_comparison_next_year = sum(Present_next_year),
Sales_group_comparison_last_year = sum(Present_last_year),
) %>%
ungroup() %>%
group_by(Unit) %>%
mutate(
Sales_group_comparison_lagged =
dplyr::lag(
x = Sales_group_comparison_next_year,
n = 1,
default = NaN,
order_by = Year
),
) %>%
ungroup() %>%
rowwise() %>%
mutate(
Group_growth_rate = Sales_group_comparison_last_year / Sales_group_comparison_lagged
) %>%
ungroup() %>%
group_by(Group, Year) %>%
mutate(Group_growth_rate = ifelse(sum(is.na(Group_growth_rate)) == length(Group_growth_rate),
NaN,
max(Group_growth_rate, na.rm = TRUE))
) %>%
ungroup()
谢谢!
这是一种使用一些更简单的 dplyr 数学和 self-join 来提取每个组去年的总数的方法。
library(dplyr)
data_with_2023 %>%
group_by(Group, Unit) %>%
mutate(in_next_yr = lead(Year, default = 0) == Year + 1,
in_last_yr = lag(Year, default = 0) == Year - 1) %>%
group_by(Year, Group) %>%
mutate(total_in_next_yr = sum(Sales*in_next_yr),
total_in_prior_yr = sum(Sales*in_last_yr)) %>%
ungroup() -> data_totals
left_join(data_totals,
data_totals %>%
distinct(Year = Year +1,
Group,
last_yr_total = total_in_next_yr)) %>%
mutate(growth = total_in_prior_yr / last_yr_total)
结果
Joining, by = c("Year", "Group")
# A tibble: 10 × 11
Year Group Unit Profit Sales in_next_yr in_last_yr total_in_next_yr total_in_prior_yr last_yr_total growth
<dbl> <chr> <chr> <chr> <int> <lgl> <lgl> <int> <int> <int> <dbl>
1 2021 One A 20 70 TRUE FALSE 110 0 NA NA
2 2021 One B 10 40 TRUE FALSE 110 0 NA NA
3 2021 One C 5 25 FALSE FALSE 110 0 NA NA
4 2021 Two D 15 50 TRUE FALSE 50 0 NA NA
5 2022 One A NaN 50 FALSE TRUE 205 105 110 0.955
6 2022 One B NaN 55 TRUE TRUE 205 105 110 0.955
7 2022 One E NaN 150 TRUE FALSE 205 105 110 0.955
8 2022 Two D NaN 60 FALSE TRUE 0 60 50 1.2
9 2023 One B NaN 70 FALSE TRUE 0 150 205 0.732
10 2023 One E NaN 80 FALSE TRUE 0 150 205 0.732
源数据
data_with_2023 <- data.frame(
stringsAsFactors = FALSE,
Year = c(2021L, 2021L, 2021L, 2021L, 2022L, 2022L, 2022L, 2022L, 2023L, 2023L),
Group = c("One", "One", "One", "Two", "One", "One", "One", "Two", "One", "One"),
Unit = c("A", "B", "C", "D", "A", "B", "E", "D", "B", "E"),
Profit = c("20", "10", "5", "15", "NaN", "NaN", "NaN", "NaN", "NaN", "NaN"),
Sales = c(70L, 40L, 25L, 50L, 50L, 55L, 150L, 60L, 70L, 80L)
)