基于R dplyr中的多个组进行总结
summarise based on multiple groups in R dplyr
我有一个大数据框,看起来像这样
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df
#> # A tibble: 6 × 3
#> id counts fruit
#> <dbl> <dbl> <chr>
#> 1 1 10 apple
#> 2 1 20 banana
#> 3 2 15 cherry
#> 4 2 15 cherry
#> 5 2 10 ananas
#> 6 3 20 pear
由 reprex package (v2.0.1)
于 2022-04-13 创建
对于每个 id,我想保留最大数量的水果,然后我想在另一列中添加每个 id 的唯一水果 sum_counts。
我希望我的数据看起来像这样:
# A tibble: 3 × 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 30
3 3 pear pear 20
这是我目前尝试的方法,不知道为什么惨败
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,15,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id,fruit) %>%
add_count(fruit) %>%
ungroup() %>%
group_by(id) %>%
summarise(central_fruit=fruit[which.max(counts)],
fruits = toString(sort(unique(fruit), decreasing = TRUE)),
sum_counts = sum(unique(counts)))
#> # A tibble: 3 × 4
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana banana, apple 30
#> 2 2 cherry cherry, ananas 15
#> 3 3 pear pear 20
由 reprex package (v2.0.1)
于 2022-04-13 创建
这是一个 dplyr
方法。
library(dplyr)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id) %>%
mutate(fruits = paste0(unique(fruit), collapse = ", "),
sum_counts = sum(unique(counts))) %>%
filter(counts == max(counts)) %>%
distinct() %>%
rename("central_fruit" = "fruit") %>%
select(-counts)
#> # A tibble: 3 × 4
#> # Groups: id [3]
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana apple, banana 30
#> 2 2 cherry cherry, ananas 25
#> 3 3 pear pear 20
由 reprex package (v2.0.1)
于 2022-04-13 创建
这应该有效:
df |>
group_by(id) |>
distinct(fruit, .keep_all = TRUE) |>
mutate(
is_central_fruit = counts == max(counts),
sum_counts = sum(counts),
fruits = paste(fruit, collapse = ", ")
) |>
filter(
is_central_fruit
) |>
select(
-is_central_fruit,
-counts,
central_fruit = fruit
)
# id central_fruit sum_counts fruits
# <dbl> <chr> <dbl> <chr>
# 1 1 banana 30 apple, banana
# 2 2 cherry 25 cherry, ananas
# 3 3 pear 20 pear
如果您想订购 fruits
列,那么我不会将水果存储为字符向量,而是作为因子列表。
另一种 dplyr
方法但保留水果顺序(central_fruit 在 fruits
中排在第一位):
df %>%
distinct() %>%
group_by(id) %>%
mutate(sum_counts = sum(counts)) %>%
arrange(id, desc(counts)) %>%
mutate(fruits = paste(fruit, collapse = ", ")) %>%
slice(1) %>%
select(id, central_fruit = fruit, fruits, sum_counts) %>%
ungroup()
这个returns
# A tibble: 3 x 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 25
3 3 pear pear 20
我有一个大数据框,看起来像这样
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df
#> # A tibble: 6 × 3
#> id counts fruit
#> <dbl> <dbl> <chr>
#> 1 1 10 apple
#> 2 1 20 banana
#> 3 2 15 cherry
#> 4 2 15 cherry
#> 5 2 10 ananas
#> 6 3 20 pear
由 reprex package (v2.0.1)
于 2022-04-13 创建对于每个 id,我想保留最大数量的水果,然后我想在另一列中添加每个 id 的唯一水果 sum_counts。
我希望我的数据看起来像这样:
# A tibble: 3 × 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 30
3 3 pear pear 20
这是我目前尝试的方法,不知道为什么惨败
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,15,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id,fruit) %>%
add_count(fruit) %>%
ungroup() %>%
group_by(id) %>%
summarise(central_fruit=fruit[which.max(counts)],
fruits = toString(sort(unique(fruit), decreasing = TRUE)),
sum_counts = sum(unique(counts)))
#> # A tibble: 3 × 4
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana banana, apple 30
#> 2 2 cherry cherry, ananas 15
#> 3 3 pear pear 20
由 reprex package (v2.0.1)
于 2022-04-13 创建这是一个 dplyr
方法。
library(dplyr)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id) %>%
mutate(fruits = paste0(unique(fruit), collapse = ", "),
sum_counts = sum(unique(counts))) %>%
filter(counts == max(counts)) %>%
distinct() %>%
rename("central_fruit" = "fruit") %>%
select(-counts)
#> # A tibble: 3 × 4
#> # Groups: id [3]
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana apple, banana 30
#> 2 2 cherry cherry, ananas 25
#> 3 3 pear pear 20
由 reprex package (v2.0.1)
于 2022-04-13 创建这应该有效:
df |>
group_by(id) |>
distinct(fruit, .keep_all = TRUE) |>
mutate(
is_central_fruit = counts == max(counts),
sum_counts = sum(counts),
fruits = paste(fruit, collapse = ", ")
) |>
filter(
is_central_fruit
) |>
select(
-is_central_fruit,
-counts,
central_fruit = fruit
)
# id central_fruit sum_counts fruits
# <dbl> <chr> <dbl> <chr>
# 1 1 banana 30 apple, banana
# 2 2 cherry 25 cherry, ananas
# 3 3 pear 20 pear
如果您想订购 fruits
列,那么我不会将水果存储为字符向量,而是作为因子列表。
另一种 dplyr
方法但保留水果顺序(central_fruit 在 fruits
中排在第一位):
df %>%
distinct() %>%
group_by(id) %>%
mutate(sum_counts = sum(counts)) %>%
arrange(id, desc(counts)) %>%
mutate(fruits = paste(fruit, collapse = ", ")) %>%
slice(1) %>%
select(id, central_fruit = fruit, fruits, sum_counts) %>%
ungroup()
这个returns
# A tibble: 3 x 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 25
3 3 pear pear 20