折叠数据框的特定行(rows/cases)
Collapse specific rows/cases of dataframe
我想折叠 data.frame
的某些特定行(最好使用 dplyr
)。折叠时,一部分列应通过函数 sum() 聚合,其余列通过 mean() 聚合。
例如,让我们向 iris
数据集添加一个基于字符的唯一 ID。
# Take the first five iris rows and tag each one with a unique character ID.
iris_df <- iris[1:5, ]
# paste0() replaces paste(..., sep = ""); seq_len() stays correct even if the
# data frame has zero rows (1:0 would yield c(1, 0)).
iris_df$ID <- paste0("ID_", seq_len(nrow(iris_df)))
这是我们的起点:
# Literal (dput-style) form of the starting data frame: five rows with four
# numeric measurement columns, a Species factor and a character ID column.
structure(list(Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5),
Sepal.Width = c(3.5, 3, 3.2, 3.1, 3.6),
Petal.Length = c(1.4, 1.4, 1.3, 1.5, 1.4),
Petal.Width = c(0.2, 0.2, 0.2, 0.2, 0.2),
Species = structure(c(1L, 1L, 1L, 1L, 1L),
.Label = c("setosa", "versicolor", "virginica"), class = "factor"),
ID = c("ID_1", "ID_2", "ID_3", "ID_4","ID_5")),
row.names = c(NA, 5L), class = "data.frame")
现在,我想把 ID 为 ID_1 和 ID_2 的两行折叠为一行。为此,应将萼片(Sepal)各列聚合为均值,将花瓣(Petal)各列聚合为总和。ID 应变为“ID_1+ID_2”(也许可以通过 paste() 聚合?)
最终结果应该是这样的:
# Expected result in literal form: four rows. The first row merges ID_1 and
# ID_2 (Sepal columns averaged, Petal columns summed, IDs joined with "+");
# the remaining rows carry the same values as in the starting data.
structure(list(Sepal.Length = c(5.0, 4.7, 4.6, 5),
Sepal.Width = c(3.25, 3.2, 3.1, 3.6),
Petal.Length = c(2.8, 1.3, 1.5, 1.4),
Petal.Width = c(0.4, 0.2, 0.2, 0.2),
Species = structure(c(1L, 1L, 1L, 1L),
.Label = c("setosa", "versicolor", "virginica"), class = "factor"),
ID = c("ID_1+ID_2", "ID_3", "ID_4","ID_5")),
row.names = c(NA, 4L), class = "data.frame")
可以使用 dplyr
包(使用 group_by()
和 summarize()
)来完成吗?
更新:作为补充说明,所需的方法应考虑到行索引事先未知,即只知道需要折叠 ID_x 和 ID_y(ID_x 可能在第 i 行,ID_y 可能在第 j 行)。
以下是实现所需输出的一种方法:
library(dplyr)
# Collapse the rows whose ID is listed in ids_to_merge into a single row and
# keep every other row unchanged. Rows are matched by ID value rather than by
# position, so this works wherever the target rows sit in df.
ids_to_merge <- c("ID_1", "ID_2")
df %>%
  filter(ID %in% ids_to_merge) %>%
  summarise(
    across(starts_with("Sepal"), mean), # sepal measurements: average
    across(starts_with("Petal"), sum),  # petal measurements: total
    Species = first(Species),           # assumes the merged rows share one Species
    ID = paste(ID, collapse = "+")
  ) %>%
  # The merged row comes first, followed by the untouched rows in their
  # original order.
  bind_rows(filter(df, !ID %in% ids_to_merge))
输出:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID
1 5.0 3.25 2.8 0.4 setosa ID_1+ID_2
2 4.7 3.20 1.3 0.2 setosa ID_3
3 4.6 3.10 1.5 0.2 setosa ID_4
4 5.0 3.60 1.4 0.2 setosa ID_5
我们可以使用 %in%
判断 ID 是否属于待合并的集合,并据此创建分组变量:
library(dplyr)
library(stringr)
# Rows whose ID is in the merge set share group key 0; every other row uses
# its own row number as key and therefore forms a singleton group. Species is
# kept as a second grouping column so it survives the summary.
df1 %>%
  group_by(
    grp = if_else(ID %in% c("ID_1", "ID_2"), 0L, row_number()),
    Species
  ) %>%
  summarise(
    across(starts_with("Sepal"), mean),
    across(starts_with("Petal"), sum),
    ID = str_c(ID, collapse = "+"),
    .groups = "drop"
  ) %>%
  # The helper key is only needed for grouping.
  select(!grp)
-输出
# A tibble: 4 x 6
Species Sepal.Length Sepal.Width Petal.Length Petal.Width ID
<fct> <dbl> <dbl> <dbl> <dbl> <chr>
1 setosa 5 3.25 2.8 0.4 ID_1+ID_2
2 setosa 4.7 3.2 1.3 0.2 ID_3
3 setosa 4.6 3.1 1.5 0.2 ID_4
4 setosa 5 3.6 1.4 0.2 ID_5
如果 Species 只有一个取值,也可以不把它放进分组,而是用 first() 保留该列:
# Same grouping idea: merge-set rows get key 0, all other rows get their row
# number. Species is not a grouping column here; the first value in each
# group is carried over instead.
df1 %>%
  group_by(
    grp = if_else(ID %in% c("ID_1", "ID_2"), 0L, row_number())
  ) %>%
  summarise(
    across(starts_with("Sepal"), mean),
    across(starts_with("Petal"), sum),
    Species = first(Species),
    ID = str_c(ID, collapse = "+"),
    .groups = "drop"
  ) %>%
  # Drop the helper grouping key from the result.
  select(!grp)
# A tibble: 4 x 6
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID
<dbl> <dbl> <dbl> <dbl> <fct> <chr>
1 5 3.25 2.8 0.4 setosa ID_1+ID_2
2 4.7 3.2 1.3 0.2 setosa ID_3
3 4.6 3.1 1.5 0.2 setosa ID_4
4 5 3.6 1.4 0.2 setosa ID_5
另一种选择是用 fct_collapse
把需要合并的 ID 折叠为同一个因子水平,再按该水平分组:
library(forcats)
# fct_collapse() relabels ID_1 and ID_2 as one shared level ("other"); the
# remaining IDs keep their own levels, so grouping by the result merges only
# the two target rows.
df1 %>%
  group_by(
    grp = fct_collapse(ID, other = c("ID_1", "ID_2"))
  ) %>%
  summarise(
    across(starts_with("Sepal"), mean),
    across(starts_with("Petal"), sum),
    Species = first(Species),
    ID = str_c(ID, collapse = "+"),
    .groups = "drop"
  ) %>%
  # The collapsed factor is only a grouping helper.
  select(!grp)
# A tibble: 4 x 6
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID
<dbl> <dbl> <dbl> <dbl> <fct> <chr>
1 5 3.25 2.8 0.4 setosa ID_1+ID_2
2 4.7 3.2 1.3 0.2 setosa ID_3
3 4.6 3.1 1.5 0.2 setosa ID_4
4 5 3.6 1.4 0.2 setosa ID_5
我想折叠 data.frame
的某些特定行(最好使用 dplyr
)。折叠时,一部分列应通过函数 sum() 聚合,其余列通过 mean() 聚合。
例如,让我们向 iris
数据集添加一个基于字符的唯一 ID。
# Take the first five iris rows and tag each one with a unique character ID.
iris_df <- iris[1:5, ]
# paste0() replaces paste(..., sep = ""); seq_len() stays correct even if the
# data frame has zero rows (1:0 would yield c(1, 0)).
iris_df$ID <- paste0("ID_", seq_len(nrow(iris_df)))
这是我们的起点:
# Literal (dput-style) form of the starting data frame: five rows with four
# numeric measurement columns, a Species factor and a character ID column.
structure(list(Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5),
Sepal.Width = c(3.5, 3, 3.2, 3.1, 3.6),
Petal.Length = c(1.4, 1.4, 1.3, 1.5, 1.4),
Petal.Width = c(0.2, 0.2, 0.2, 0.2, 0.2),
Species = structure(c(1L, 1L, 1L, 1L, 1L),
.Label = c("setosa", "versicolor", "virginica"), class = "factor"),
ID = c("ID_1", "ID_2", "ID_3", "ID_4","ID_5")),
row.names = c(NA, 5L), class = "data.frame")
现在,我想把 ID 为 ID_1 和 ID_2 的两行折叠为一行。为此,应将萼片(Sepal)各列聚合为均值,将花瓣(Petal)各列聚合为总和。ID 应变为“ID_1+ID_2”(也许可以通过 paste() 聚合?)
最终结果应该是这样的:
# Expected result in literal form: four rows. The first row merges ID_1 and
# ID_2 (Sepal columns averaged, Petal columns summed, IDs joined with "+");
# the remaining rows carry the same values as in the starting data.
structure(list(Sepal.Length = c(5.0, 4.7, 4.6, 5),
Sepal.Width = c(3.25, 3.2, 3.1, 3.6),
Petal.Length = c(2.8, 1.3, 1.5, 1.4),
Petal.Width = c(0.4, 0.2, 0.2, 0.2),
Species = structure(c(1L, 1L, 1L, 1L),
.Label = c("setosa", "versicolor", "virginica"), class = "factor"),
ID = c("ID_1+ID_2", "ID_3", "ID_4","ID_5")),
row.names = c(NA, 4L), class = "data.frame")
可以使用 dplyr
包(使用 group_by()
和 summarize()
)来完成吗?
更新:作为补充说明,所需的方法应考虑到行索引事先未知,即只知道需要折叠 ID_x 和 ID_y(ID_x 可能在第 i 行,ID_y 可能在第 j 行)。
以下是实现所需输出的一种方法:
library(dplyr)
# Collapse the rows whose ID is listed in ids_to_merge into a single row and
# keep every other row unchanged. Rows are matched by ID value rather than by
# position, so this works wherever the target rows sit in df.
ids_to_merge <- c("ID_1", "ID_2")
df %>%
  filter(ID %in% ids_to_merge) %>%
  summarise(
    across(starts_with("Sepal"), mean), # sepal measurements: average
    across(starts_with("Petal"), sum),  # petal measurements: total
    Species = first(Species),           # assumes the merged rows share one Species
    ID = paste(ID, collapse = "+")
  ) %>%
  # The merged row comes first, followed by the untouched rows in their
  # original order.
  bind_rows(filter(df, !ID %in% ids_to_merge))
输出:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID
1 5.0 3.25 2.8 0.4 setosa ID_1+ID_2
2 4.7 3.20 1.3 0.2 setosa ID_3
3 4.6 3.10 1.5 0.2 setosa ID_4
4 5.0 3.60 1.4 0.2 setosa ID_5
我们可以使用 %in% 判断 ID 是否属于待合并的集合,并据此创建分组变量:
library(dplyr)
library(stringr)
# Rows whose ID is in the merge set share group key 0; every other row uses
# its own row number as key and therefore forms a singleton group. Species is
# kept as a second grouping column so it survives the summary.
df1 %>%
  group_by(
    grp = if_else(ID %in% c("ID_1", "ID_2"), 0L, row_number()),
    Species
  ) %>%
  summarise(
    across(starts_with("Sepal"), mean),
    across(starts_with("Petal"), sum),
    ID = str_c(ID, collapse = "+"),
    .groups = "drop"
  ) %>%
  # The helper key is only needed for grouping.
  select(!grp)
-输出
# A tibble: 4 x 6
Species Sepal.Length Sepal.Width Petal.Length Petal.Width ID
<fct> <dbl> <dbl> <dbl> <dbl> <chr>
1 setosa 5 3.25 2.8 0.4 ID_1+ID_2
2 setosa 4.7 3.2 1.3 0.2 ID_3
3 setosa 4.6 3.1 1.5 0.2 ID_4
4 setosa 5 3.6 1.4 0.2 ID_5
如果 Species 只有一个取值,也可以不把它放进分组,而是用 first() 保留该列:
# Same grouping idea: merge-set rows get key 0, all other rows get their row
# number. Species is not a grouping column here; the first value in each
# group is carried over instead.
df1 %>%
  group_by(
    grp = if_else(ID %in% c("ID_1", "ID_2"), 0L, row_number())
  ) %>%
  summarise(
    across(starts_with("Sepal"), mean),
    across(starts_with("Petal"), sum),
    Species = first(Species),
    ID = str_c(ID, collapse = "+"),
    .groups = "drop"
  ) %>%
  # Drop the helper grouping key from the result.
  select(!grp)
# A tibble: 4 x 6
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID
<dbl> <dbl> <dbl> <dbl> <fct> <chr>
1 5 3.25 2.8 0.4 setosa ID_1+ID_2
2 4.7 3.2 1.3 0.2 setosa ID_3
3 4.6 3.1 1.5 0.2 setosa ID_4
4 5 3.6 1.4 0.2 setosa ID_5
另一种选择是用 fct_collapse 把需要合并的 ID 折叠为同一个因子水平,再按该水平分组:
library(forcats)
# fct_collapse() relabels ID_1 and ID_2 as one shared level ("other"); the
# remaining IDs keep their own levels, so grouping by the result merges only
# the two target rows.
df1 %>%
  group_by(
    grp = fct_collapse(ID, other = c("ID_1", "ID_2"))
  ) %>%
  summarise(
    across(starts_with("Sepal"), mean),
    across(starts_with("Petal"), sum),
    Species = first(Species),
    ID = str_c(ID, collapse = "+"),
    .groups = "drop"
  ) %>%
  # The collapsed factor is only a grouping helper.
  select(!grp)
# A tibble: 4 x 6
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID
<dbl> <dbl> <dbl> <dbl> <fct> <chr>
1 5 3.25 2.8 0.4 setosa ID_1+ID_2
2 4.7 3.2 1.3 0.2 setosa ID_3
3 4.6 3.1 1.5 0.2 setosa ID_4
4 5 3.6 1.4 0.2 setosa ID_5