在 R 中的 dplyr 中分组后如何保留其他变量?
How can I keep additional variables after grouping in some other variables in dplyr in R?
今天早些时候我发布了这个问题,但现在又遇到了一个新问题。
如果我必须在原始数据框中添加更多变量 (loc, height):
# Example data: 3 rows for var A, 2 for B and 5 for C.
# var B has no February observation; var C has two rows in January and February.
var <- rep(c("A", "B", "C"), times = c(3, 2, 5))
var
date <- as.Date(c(
  "2022/01/01", "2022/02/01", "2022/03/01",
  "2022/01/01", "2022/03/01",
  "2022/01/01", "2022/01/01", "2022/02/01", "2022/02/01", "2022/03/01"
))
loc <- rep(c("london", "berlin", "cairo"), times = c(3, 2, 5))
height <- c(13, 14, 15, 13, 15, 16, 12, 14, 13, 15)
data <- tibble(var, date, loc, height)
data
如何保留 loc 变量的对应值,并计算每个月的 height 总和(在之前已回答问题的基础上)?
理想情况下,它现在必须看起来像这样:
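var | Quarter | Month | Condition | loc | height |
---|---|---|---|---|---|
A | 1 | 1 | TRUE | London | 13 |
A | 1 | 2 | TRUE | London | 14 |
A | 1 | 3 | TRUE | London | 15 |
B | 1 | 1 | TRUE | berlin | 13 |
B | 1 | 2 | FALSE | berlin | 0 |
B | 1 | 3 | TRUE | berlin | 15 |
C | 1 | 1 | TRUE | cairo | 28 |
C | 1 | 2 | TRUE | cairo | 27 |
C | 1 | 3 | TRUE | cairo | 15 |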
有人能帮忙吗?如何使用 dplyr 在 R 中实现这一点?
基于之前的答案:
# Derive the calendar month and quarter from the date (this overwrites `data`).
data <- data %>%
  mutate(month = month(date), quarter = quarter(date))

# Left side: every var x month combination. nesting() keeps each month paired
# with its own quarter, so no impossible month/quarter pairs are generated.
# Right side: total height per var/month/loc, flagged as observed.
expand(data, var, nesting(month, quarter)) %>%
  left_join(
    data %>%
      # Group by month, not date, so several dates in one month are summed.
      group_by(var, month, loc) %>%
      summarize(height = sum(height), .groups = "drop") %>%
      mutate(condition = TRUE),
    by = c("var", "month")
  ) %>%
  # Rows without a match were not observed: flag them and set height to 0.
  mutate(condition = !is.na(condition), height = coalesce(height, 0)) %>%
  group_by(var) %>%
  # "downup" also fills loc when the first month of a var is the missing one.
  fill(loc, .direction = "downup") %>%
  ungroup()
输出:
var month quarter loc height condition
<chr> <dbl> <int> <chr> <dbl> <lgl>
1 A 1 1 london 13 TRUE
2 A 2 1 london 14 TRUE
3 A 3 1 london 15 TRUE
4 B 1 1 berlin 13 TRUE
5 B 2 1 berlin 0 FALSE
6 B 3 1 berlin 15 TRUE
7 C 1 1 cairo 28 TRUE
8 C 2 1 cairo 27 TRUE
9 C 3 1 cairo 15 TRUE
根据之前的解决方案,在 `distinct` 中添加 `.keep_all = TRUE`,然后用 `fill` 以之前的非 NA 值填充 `loc` 列:
library(dplyr)
library(tidyr)
library(lubridate)
# Sum height per var/month, add the missing var x month rows with height 0,
# then keep one row per combination and carry loc into the added rows.
data %>%
  mutate(month = lubridate::month(date)) %>%
  group_by(var, month) %>%
  mutate(height = sum(height)) %>%
  ungroup() %>%
  complete(var, month, fill = list(height = 0)) %>%
  # A bare `quarter` is the lubridate function, not a column; derive the
  # quarter from the month number instead. Rows added by complete() have no
  # date, which is what marks them as not observed.
  mutate(Quarter = lubridate::quarter(month), Condition = !is.na(date)) %>%
  distinct(var, month, Quarter, Condition, .keep_all = TRUE) %>%
  # Fill within each var so a loc never leaks from the previous var.
  group_by(var) %>%
  fill(loc, .direction = "downup") %>%
  ungroup() %>%
  select(-date)
-输出
# A tibble: 9 × 6
var month loc height Quarter Condition
<chr> <dbl> <chr> <dbl> <dbl> <lgl>
1 A 1 london 13 1 TRUE
2 A 2 london 14 1 TRUE
3 A 3 london 15 1 TRUE
4 B 1 berlin 13 1 TRUE
5 B 2 berlin 0 1 FALSE
6 B 3 berlin 15 1 TRUE
7 C 1 cairo 28 1 TRUE
8 C 2 cairo 27 1 TRUE
9 C 3 cairo 15 1 TRUE
这是一个 dplyr 解决方案,其中 `complete(var, Month, fill = list(height = 0))` 这一部分来自 @akrun:
library(dplyr)
# Total height per var/month/loc, completed with the missing var x month rows.
# tidyr and lubridate functions are namespace-qualified because only dplyr is
# attached for this snippet.
data %>%
  group_by(var, Month = lubridate::month(date), loc) %>%
  summarise(height = sum(height), .groups = "drop") %>%
  # Flag observed rows before completing, so a genuine height sum of 0 is not
  # mistaken for a missing month.
  mutate(Condition = TRUE) %>%
  tidyr::complete(var, Month, fill = list(height = 0, Condition = FALSE)) %>%
  # Derive the quarter from the month rather than copying the row above.
  mutate(Quarter = lubridate::quarter(Month), .after = Month) %>%
  # Fill loc within each var only.
  group_by(var) %>%
  tidyr::fill(loc, .direction = "downup") %>%
  ungroup()
var Month Quarter loc height Condition
<chr> <dbl> <int> <chr> <dbl> <lgl>
1 A 1 1 london 13 TRUE
2 A 2 1 london 14 TRUE
3 A 3 1 london 15 TRUE
4 B 1 1 berlin 13 TRUE
5 B 2 1 berlin 0 FALSE
6 B 3 1 berlin 15 TRUE
7 C 1 1 cairo 28 TRUE
8 C 2 1 cairo 27 TRUE
9 C 3 1 cairo 15 TRUE
您可以在 `complete()` 步骤中使用 `nesting()` 来仅获得您想要的组合,然后在对 height 求和之前,按您想保留的所有变量进行分组:
library(dplyr)
library(tidyr)
# Example data: 3 rows for var A, 2 for B and 5 for C.
# var B has no February observation; var C has two rows in January and February.
var <- rep(c("A", "B", "C"), times = c(3, 2, 5))
var
date <- as.Date(c(
  "2022/01/01", "2022/02/01", "2022/03/01",
  "2022/01/01", "2022/03/01",
  "2022/01/01", "2022/01/01", "2022/02/01", "2022/02/01", "2022/03/01"
))
loc <- rep(c("london", "berlin", "cairo"), times = c(3, 2, 5))
height <- c(13, 14, 15, 13, 15, 16, 12, 14, 13, 15)
data <- tibble(var, date, loc, height)
# One row per month x (var, loc) pair with the summed height.
data %>%
  mutate(month = lubridate::month(date)) %>%
  # nesting() keeps each var with its own loc. Months missing for a var are
  # added with height 0 (instead of NA) so the sum below reports 0 for them.
  complete(month, nesting(var, loc), fill = list(height = 0)) %>%
  mutate(
    Quarter = lubridate::quarter(month),
    # Rows added by complete() have no date, i.e. were not observed.
    Condition = !is.na(date)
  ) %>%
  # Group by everything except the value being summed and the raw date.
  group_by(across(-c(height, date))) %>%
  summarise(height = sum(height), .groups = "drop") %>%
  arrange(var)
#> # A tibble: 9 × 6
#> month var loc Quarter Condition height
#> <dbl> <chr> <chr> <int> <lgl> <dbl>
#> 1 1 A london 1 TRUE 13
#> 2 2 A london 1 TRUE 14
#> 3 3 A london 1 TRUE 15
#> 4 1 B berlin 1 TRUE 13
#> 5 2 B berlin 1 FALSE 0
#> 6 3 B berlin 1 TRUE 15
#> 7 1 C cairo 1 TRUE 28
#> 8 2 C cairo 1 TRUE 27
#> 9 3 C cairo 1 TRUE 15
由 reprex package (v2.0.1) 创建于 2022-06-01
今天早些时候我发布了这个问题
如果我必须在原始数据框中添加更多变量 (loc, height):
# Example data: 3 rows for var A, 2 for B and 5 for C.
# var B has no February observation; var C has two rows in January and February.
var <- rep(c("A", "B", "C"), times = c(3, 2, 5))
var
date <- as.Date(c(
  "2022/01/01", "2022/02/01", "2022/03/01",
  "2022/01/01", "2022/03/01",
  "2022/01/01", "2022/01/01", "2022/02/01", "2022/02/01", "2022/03/01"
))
loc <- rep(c("london", "berlin", "cairo"), times = c(3, 2, 5))
height <- c(13, 14, 15, 13, 15, 16, 12, 14, 13, 15)
data <- tibble(var, date, loc, height)
data
如何保留 loc 变量的对应值,并计算每个月的 height 总和(在之前已回答问题的基础上)?
理想情况下,它现在必须看起来像这样:
var | Quarter | Month | Condition | loc | height |
---|---|---|---|---|---|
A | 1 | 1 | TRUE | London | 13 |
A | 1 | 2 | TRUE | London | 14 |
A | 1 | 3 | TRUE | London | 15 |
B | 1 | 1 | TRUE | berlin | 13 |
B | 1 | 2 | FALSE | berlin | 0 |
B | 1 | 3 | TRUE | berlin | 15 |
C | 1 | 1 | TRUE | cairo | 28 |
C | 1 | 2 | TRUE | cairo | 27 |
C | 1 | 3 | TRUE | cairo | 15 |
有人能帮忙吗?如何使用 dplyr 在 R 中实现这一点?
基于之前的答案:
# Derive the calendar month and quarter from the date (this overwrites `data`).
data <- data %>%
  mutate(month = month(date), quarter = quarter(date))

# Left side: every var x month combination. nesting() keeps each month paired
# with its own quarter, so no impossible month/quarter pairs are generated.
# Right side: total height per var/month/loc, flagged as observed.
expand(data, var, nesting(month, quarter)) %>%
  left_join(
    data %>%
      # Group by month, not date, so several dates in one month are summed.
      group_by(var, month, loc) %>%
      summarize(height = sum(height), .groups = "drop") %>%
      mutate(condition = TRUE),
    by = c("var", "month")
  ) %>%
  # Rows without a match were not observed: flag them and set height to 0.
  mutate(condition = !is.na(condition), height = coalesce(height, 0)) %>%
  group_by(var) %>%
  # "downup" also fills loc when the first month of a var is the missing one.
  fill(loc, .direction = "downup") %>%
  ungroup()
输出:
var month quarter loc height condition
<chr> <dbl> <int> <chr> <dbl> <lgl>
1 A 1 1 london 13 TRUE
2 A 2 1 london 14 TRUE
3 A 3 1 london 15 TRUE
4 B 1 1 berlin 13 TRUE
5 B 2 1 berlin 0 FALSE
6 B 3 1 berlin 15 TRUE
7 C 1 1 cairo 28 TRUE
8 C 2 1 cairo 27 TRUE
9 C 3 1 cairo 15 TRUE
根据之前的解决方案,在 `distinct` 中添加 `.keep_all = TRUE`,然后用 `fill` 以之前的非 NA 值填充 `loc` 列:
library(dplyr)
library(tidyr)
library(lubridate)
# Sum height per var/month, add the missing var x month rows with height 0,
# then keep one row per combination and carry loc into the added rows.
data %>%
  mutate(month = lubridate::month(date)) %>%
  group_by(var, month) %>%
  mutate(height = sum(height)) %>%
  ungroup() %>%
  complete(var, month, fill = list(height = 0)) %>%
  # A bare `quarter` is the lubridate function, not a column; derive the
  # quarter from the month number instead. Rows added by complete() have no
  # date, which is what marks them as not observed.
  mutate(Quarter = lubridate::quarter(month), Condition = !is.na(date)) %>%
  distinct(var, month, Quarter, Condition, .keep_all = TRUE) %>%
  # Fill within each var so a loc never leaks from the previous var.
  group_by(var) %>%
  fill(loc, .direction = "downup") %>%
  ungroup() %>%
  select(-date)
-输出
# A tibble: 9 × 6
var month loc height Quarter Condition
<chr> <dbl> <chr> <dbl> <dbl> <lgl>
1 A 1 london 13 1 TRUE
2 A 2 london 14 1 TRUE
3 A 3 london 15 1 TRUE
4 B 1 berlin 13 1 TRUE
5 B 2 berlin 0 1 FALSE
6 B 3 berlin 15 1 TRUE
7 C 1 cairo 28 1 TRUE
8 C 2 cairo 27 1 TRUE
9 C 3 cairo 15 1 TRUE
这是一个 dplyr 解决方案,其中 `complete(var, Month, fill = list(height = 0))` 这一部分来自 @akrun:
library(dplyr)
# Total height per var/month/loc, completed with the missing var x month rows.
# tidyr and lubridate functions are namespace-qualified because only dplyr is
# attached for this snippet.
data %>%
  group_by(var, Month = lubridate::month(date), loc) %>%
  summarise(height = sum(height), .groups = "drop") %>%
  # Flag observed rows before completing, so a genuine height sum of 0 is not
  # mistaken for a missing month.
  mutate(Condition = TRUE) %>%
  tidyr::complete(var, Month, fill = list(height = 0, Condition = FALSE)) %>%
  # Derive the quarter from the month rather than copying the row above.
  mutate(Quarter = lubridate::quarter(Month), .after = Month) %>%
  # Fill loc within each var only.
  group_by(var) %>%
  tidyr::fill(loc, .direction = "downup") %>%
  ungroup()
var Month Quarter loc height Condition
<chr> <dbl> <int> <chr> <dbl> <lgl>
1 A 1 1 london 13 TRUE
2 A 2 1 london 14 TRUE
3 A 3 1 london 15 TRUE
4 B 1 1 berlin 13 TRUE
5 B 2 1 berlin 0 FALSE
6 B 3 1 berlin 15 TRUE
7 C 1 1 cairo 28 TRUE
8 C 2 1 cairo 27 TRUE
9 C 3 1 cairo 15 TRUE
您可以在 `complete()` 步骤中使用 `nesting()` 来仅获得您想要的组合,然后在对 height 求和之前,按您想保留的所有变量进行分组:
library(dplyr)
library(tidyr)
# Example data: 3 rows for var A, 2 for B and 5 for C.
# var B has no February observation; var C has two rows in January and February.
var <- rep(c("A", "B", "C"), times = c(3, 2, 5))
var
date <- as.Date(c(
  "2022/01/01", "2022/02/01", "2022/03/01",
  "2022/01/01", "2022/03/01",
  "2022/01/01", "2022/01/01", "2022/02/01", "2022/02/01", "2022/03/01"
))
loc <- rep(c("london", "berlin", "cairo"), times = c(3, 2, 5))
height <- c(13, 14, 15, 13, 15, 16, 12, 14, 13, 15)
data <- tibble(var, date, loc, height)
# One row per month x (var, loc) pair with the summed height.
data %>%
  mutate(month = lubridate::month(date)) %>%
  # nesting() keeps each var with its own loc. Months missing for a var are
  # added with height 0 (instead of NA) so the sum below reports 0 for them.
  complete(month, nesting(var, loc), fill = list(height = 0)) %>%
  mutate(
    Quarter = lubridate::quarter(month),
    # Rows added by complete() have no date, i.e. were not observed.
    Condition = !is.na(date)
  ) %>%
  # Group by everything except the value being summed and the raw date.
  group_by(across(-c(height, date))) %>%
  summarise(height = sum(height), .groups = "drop") %>%
  arrange(var)
#> # A tibble: 9 × 6
#> month var loc Quarter Condition height
#> <dbl> <chr> <chr> <int> <lgl> <dbl>
#> 1 1 A london 1 TRUE 13
#> 2 2 A london 1 TRUE 14
#> 3 3 A london 1 TRUE 15
#> 4 1 B berlin 1 TRUE 13
#> 5 2 B berlin 1 FALSE 0
#> 6 3 B berlin 1 TRUE 15
#> 7 1 C cairo 1 TRUE 28
#> 8 2 C cairo 1 TRUE 27
#> 9 3 C cairo 1 TRUE 15
由 reprex package (v2.0.1) 创建于 2022-06-01