R:如何跨多列迭代求和?
R : How to iterate sum across multiple columns?
我正在尝试 iterate/loop 计算多个非连续列的总和。我的 objective 是计算跨时间重复测量的多个问卷的子量表分数。
一份包含 x 个项目和 n 个时间点的问卷的数据集:
df <- tibble(
ID = 1:5,
itemA_1 = sample(100, 5, TRUE),
itemB_1 = sample(100, 5, TRUE),
itemC_1 = sample(100, 5, TRUE),
itemD_1 = sample(100, 5, TRUE),
itemx_1 = sample(100, 5, TRUE),
itemA_3 = sample(100, 5, TRUE),
itemB_3 = sample(100, 5, TRUE),
itemC_3 = sample(100, 5, TRUE),
itemD_3 = sample(100, 5, TRUE),
itemx_3 = sample(100, 5, TRUE),
itemA_n = sample(100, 5, TRUE),
itemB_n = sample(100, 5, TRUE),
itemC_n = sample(100, 5, TRUE),
itemD_n = sample(100, 5, TRUE),
itemx_n = sample(100, 5, TRUE),
)
一个特定时间点的总和很好:
df %>% mutate(total_1 = sum(c(itemA_1, itemC_1, itemD_1))
这个循环不起作用:
for (i in c(1, 3, n)) {
df %>% mutate(total_i = sum(c(itemA_i, itemC_i, itemD_i))
}
我做错了什么?
我们可以用 pivot_longer
重新整形为 'long' 格式,并按 sum
进行分组
library(dplyr)
library(tidyr)
df1 <- df %>%
pivot_longer(cols =-ID, names_to = c("item", ".value"), names_sep = "_") %>%
filter(item %in% c("itemA", "itemC", "itemD")) %>%
group_by(ID) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE,
.names = "total_{.col}")) %>%
left_join(df, .)
-输出
> df1
# A tibble: 5 × 19
ID itemA_1 itemB_1 itemC_1 itemD_1 itemx_1 itemA_3 itemB_3 itemC_3 itemD_3 itemx_3 itemA_n itemB_n itemC_n itemD_n itemx_n total_1
<int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 69 27 56 44 54 53 66 28 67 19 65 38 12 45 33 250
2 2 31 65 7 34 84 19 64 70 27 23 98 65 94 71 100 221
3 3 58 34 68 18 69 100 24 47 54 60 47 48 81 61 22 247
4 4 95 16 85 34 9 28 73 57 79 60 57 31 16 24 84 239
5 5 19 66 43 25 35 31 39 17 15 84 10 23 100 6 74 188
# … with 2 more variables: total_3 <int>, total_n <int>
如果我们想使用 for
循环,那么 paste
带有 i
的列名,在赋值 (:=
时评估 (!!
) )
library(stringr)
for (i in c(1, 3, 'n')) {
df <- df %>%
mutate(!! str_c("total_", i) :=
rowSums(across(all_of(str_c(c("itemA_", "itemC_", "itemD_"), i)))))
}
但是请注意,这不是动态的,因为我们必须手动将 1, 2, ..., n
包含在循环中
-检查 for
循环的输出并重塑
> all.equal(df1$total_1, df$total_1)
[1] TRUE
> all.equal(df1$total_3, df$total_3)
[1] TRUE
> all.equal(df1$total_n, df$total_n)
[1] TRUE
这是一个没有旋转的基本 R 选项,我们首先 select 我们想要求和的列,然后得到唯一的后缀名,然后我们可以使用 rowSums
得到总和每个组(即每个唯一的后缀)。然后,我更新列名,然后 merge
使用原始数据框。
df_sum <- df[, grepl( "ID|itemA|itemC|itemD", names(df))]
suffixes <- unique(sub("^[^_]*_", "", colnames(df_sum)))
df2 <- sapply(suffixes, function(x) rowSums(df_sum[,endsWith(colnames(df_sum), x)]))
colnames(df2)[-1] <- paste("total", colnames(df2)[-1], sep = "_")
merge(x = df, y = df2, by = "ID", all.x = TRUE)
输出
ID itemA_1 itemB_1 itemC_1 itemD_1 itemx_1 itemA_3 itemB_3 itemC_3 itemD_3 itemx_3 itemA_n itemB_n itemC_n itemD_n itemx_n total_1 total_3 total_n
1 1 92 84 31 74 77 26 71 92 59 70 47 54 7 6 95 197 177 60
2 2 49 6 40 6 94 61 69 58 49 62 66 13 94 52 23 95 168 212
3 3 67 69 34 56 44 94 69 1 52 96 62 64 34 78 67 157 147 174
4 4 86 33 85 87 30 33 26 15 70 97 34 36 74 58 87 258 118 166
5 5 49 25 23 56 63 4 84 35 92 34 33 62 95 77 50 128 131 205
数据
df <- structure(list(ID = 1:5, itemA_1 = c(92L, 49L, 67L, 86L, 49L),
itemB_1 = c(84L, 6L, 69L, 33L, 25L), itemC_1 = c(31L, 40L,
34L, 85L, 23L), itemD_1 = c(74L, 6L, 56L, 87L, 56L), itemx_1 = c(77L,
94L, 44L, 30L, 63L), itemA_3 = c(26L, 61L, 94L, 33L, 4L),
itemB_3 = c(71L, 69L, 69L, 26L, 84L), itemC_3 = c(92L, 58L,
1L, 15L, 35L), itemD_3 = c(59L, 49L, 52L, 70L, 92L), itemx_3 = c(70L,
62L, 96L, 97L, 34L), itemA_n = c(47L, 66L, 62L, 34L, 33L),
itemB_n = c(54L, 13L, 64L, 36L, 62L), itemC_n = c(7L, 94L,
34L, 74L, 95L), itemD_n = c(6L, 52L, 78L, 58L, 77L), itemx_n = c(95L,
23L, 67L, 87L, 50L)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L))
基准
df1 <- df
df2 <- df
bm <- microbenchmark::microbenchmark(akrun_tidyverse = {df %>%
pivot_longer(cols =-ID, names_to = c("item", ".value"), names_sep = "_") %>%
filter(item %in% c("itemA", "itemC", "itemD")) %>%
group_by(ID) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE,
.names = "total_{.col}")) %>%
left_join(df, .)},
akrun_loop = {for (i in c(1, 3, 'n')) {
df1 <- df1 %>%
mutate(!! str_c("total_", i) :=
rowSums(across(all_of(str_c(c("itemA_", "itemC_", "itemD_"), i)))))
}},
andrew_baseR = {df_sum <- df2[, grepl( "ID|itemA|itemC|itemD", names(df2))];
suffixes <- unique(sub("^[^_]*_", "", colnames(df_sum)));
df3 <- sapply(suffixes, function(x) rowSums(df_sum[,endsWith(colnames(df_sum), x)]));
colnames(df3)[-1] <- paste("total", colnames(df3)[-1], sep = "_");
merge(x = df, y = df3, by = "ID", all.x = TRUE)},
times = 1000)
autoplot(bm)
另一种可能的解决方案,基于purrr::map_dfc
:
library(tidyverse)
map_dfc(c("1","3","n"),~ data.frame(apply(select(df, ends_with(.x)), 1, sum)) %>%
set_names(str_c("Total_",.x))) %>% bind_cols(df, .)
#> ID itemA_1 itemB_1 itemC_1 itemD_1 itemx_1 itemA_3 itemB_3 itemC_3 itemD_3
#> 1 1 89 33 84 15 86 8 9 18 98
#> 2 2 37 2 32 52 37 79 37 31 57
#> 3 3 5 21 54 58 25 74 43 7 14
#> 4 4 19 76 71 84 75 34 72 100 33
#> 5 5 78 52 58 42 5 28 90 21 95
#> itemx_3 itemA_n itemB_n itemC_n itemD_n itemx_n Total_1 Total_3 Total_n
#> 1 78 5 52 4 73 88 307 211 222
#> 2 82 88 78 3 3 69 160 286 241
#> 3 41 60 73 59 32 87 163 179 311
#> 4 97 86 33 81 44 22 325 336 266
#> 5 37 4 9 35 5 38 235 271 91
我正在尝试 iterate/loop 计算多个非连续列的总和。我的 objective 是计算跨时间重复测量的多个问卷的子量表分数。
一份包含 x 个项目和 n 个时间点的问卷的数据集:
df <- tibble(
ID = 1:5,
itemA_1 = sample(100, 5, TRUE),
itemB_1 = sample(100, 5, TRUE),
itemC_1 = sample(100, 5, TRUE),
itemD_1 = sample(100, 5, TRUE),
itemx_1 = sample(100, 5, TRUE),
itemA_3 = sample(100, 5, TRUE),
itemB_3 = sample(100, 5, TRUE),
itemC_3 = sample(100, 5, TRUE),
itemD_3 = sample(100, 5, TRUE),
itemx_3 = sample(100, 5, TRUE),
itemA_n = sample(100, 5, TRUE),
itemB_n = sample(100, 5, TRUE),
itemC_n = sample(100, 5, TRUE),
itemD_n = sample(100, 5, TRUE),
itemx_n = sample(100, 5, TRUE),
)
一个特定时间点的总和很好:
df %>% mutate(total_1 = sum(c(itemA_1, itemC_1, itemD_1))
这个循环不起作用:
for (i in c(1, 3, n)) {
df %>% mutate(total_i = sum(c(itemA_i, itemC_i, itemD_i))
}
我做错了什么?
我们可以用 pivot_longer
重新整形为 'long' 格式,并按 sum
library(dplyr)
library(tidyr)
df1 <- df %>%
pivot_longer(cols =-ID, names_to = c("item", ".value"), names_sep = "_") %>%
filter(item %in% c("itemA", "itemC", "itemD")) %>%
group_by(ID) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE,
.names = "total_{.col}")) %>%
left_join(df, .)
-输出
> df1
# A tibble: 5 × 19
ID itemA_1 itemB_1 itemC_1 itemD_1 itemx_1 itemA_3 itemB_3 itemC_3 itemD_3 itemx_3 itemA_n itemB_n itemC_n itemD_n itemx_n total_1
<int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 69 27 56 44 54 53 66 28 67 19 65 38 12 45 33 250
2 2 31 65 7 34 84 19 64 70 27 23 98 65 94 71 100 221
3 3 58 34 68 18 69 100 24 47 54 60 47 48 81 61 22 247
4 4 95 16 85 34 9 28 73 57 79 60 57 31 16 24 84 239
5 5 19 66 43 25 35 31 39 17 15 84 10 23 100 6 74 188
# … with 2 more variables: total_3 <int>, total_n <int>
如果我们想使用 for
循环,那么 paste
带有 i
的列名,在赋值 (:=
时评估 (!!
) )
library(stringr)
for (i in c(1, 3, 'n')) {
df <- df %>%
mutate(!! str_c("total_", i) :=
rowSums(across(all_of(str_c(c("itemA_", "itemC_", "itemD_"), i)))))
}
但是请注意,这不是动态的,因为我们必须手动将 1, 2, ..., n
包含在循环中
-检查 for
循环的输出并重塑
> all.equal(df1$total_1, df$total_1)
[1] TRUE
> all.equal(df1$total_3, df$total_3)
[1] TRUE
> all.equal(df1$total_n, df$total_n)
[1] TRUE
这是一个没有旋转的基本 R 选项,我们首先 select 我们想要求和的列,然后得到唯一的后缀名,然后我们可以使用 rowSums
得到总和每个组(即每个唯一的后缀)。然后,我更新列名,然后 merge
使用原始数据框。
df_sum <- df[, grepl( "ID|itemA|itemC|itemD", names(df))]
suffixes <- unique(sub("^[^_]*_", "", colnames(df_sum)))
df2 <- sapply(suffixes, function(x) rowSums(df_sum[,endsWith(colnames(df_sum), x)]))
colnames(df2)[-1] <- paste("total", colnames(df2)[-1], sep = "_")
merge(x = df, y = df2, by = "ID", all.x = TRUE)
输出
ID itemA_1 itemB_1 itemC_1 itemD_1 itemx_1 itemA_3 itemB_3 itemC_3 itemD_3 itemx_3 itemA_n itemB_n itemC_n itemD_n itemx_n total_1 total_3 total_n
1 1 92 84 31 74 77 26 71 92 59 70 47 54 7 6 95 197 177 60
2 2 49 6 40 6 94 61 69 58 49 62 66 13 94 52 23 95 168 212
3 3 67 69 34 56 44 94 69 1 52 96 62 64 34 78 67 157 147 174
4 4 86 33 85 87 30 33 26 15 70 97 34 36 74 58 87 258 118 166
5 5 49 25 23 56 63 4 84 35 92 34 33 62 95 77 50 128 131 205
数据
df <- structure(list(ID = 1:5, itemA_1 = c(92L, 49L, 67L, 86L, 49L),
itemB_1 = c(84L, 6L, 69L, 33L, 25L), itemC_1 = c(31L, 40L,
34L, 85L, 23L), itemD_1 = c(74L, 6L, 56L, 87L, 56L), itemx_1 = c(77L,
94L, 44L, 30L, 63L), itemA_3 = c(26L, 61L, 94L, 33L, 4L),
itemB_3 = c(71L, 69L, 69L, 26L, 84L), itemC_3 = c(92L, 58L,
1L, 15L, 35L), itemD_3 = c(59L, 49L, 52L, 70L, 92L), itemx_3 = c(70L,
62L, 96L, 97L, 34L), itemA_n = c(47L, 66L, 62L, 34L, 33L),
itemB_n = c(54L, 13L, 64L, 36L, 62L), itemC_n = c(7L, 94L,
34L, 74L, 95L), itemD_n = c(6L, 52L, 78L, 58L, 77L), itemx_n = c(95L,
23L, 67L, 87L, 50L)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L))
基准
df1 <- df
df2 <- df
bm <- microbenchmark::microbenchmark(akrun_tidyverse = {df %>%
pivot_longer(cols =-ID, names_to = c("item", ".value"), names_sep = "_") %>%
filter(item %in% c("itemA", "itemC", "itemD")) %>%
group_by(ID) %>%
summarise(across(where(is.numeric), sum, na.rm = TRUE,
.names = "total_{.col}")) %>%
left_join(df, .)},
akrun_loop = {for (i in c(1, 3, 'n')) {
df1 <- df1 %>%
mutate(!! str_c("total_", i) :=
rowSums(across(all_of(str_c(c("itemA_", "itemC_", "itemD_"), i)))))
}},
andrew_baseR = {df_sum <- df2[, grepl( "ID|itemA|itemC|itemD", names(df2))];
suffixes <- unique(sub("^[^_]*_", "", colnames(df_sum)));
df3 <- sapply(suffixes, function(x) rowSums(df_sum[,endsWith(colnames(df_sum), x)]));
colnames(df3)[-1] <- paste("total", colnames(df3)[-1], sep = "_");
merge(x = df, y = df3, by = "ID", all.x = TRUE)},
times = 1000)
autoplot(bm)
另一种可能的解决方案,基于purrr::map_dfc
:
library(tidyverse)
map_dfc(c("1","3","n"),~ data.frame(apply(select(df, ends_with(.x)), 1, sum)) %>%
set_names(str_c("Total_",.x))) %>% bind_cols(df, .)
#> ID itemA_1 itemB_1 itemC_1 itemD_1 itemx_1 itemA_3 itemB_3 itemC_3 itemD_3
#> 1 1 89 33 84 15 86 8 9 18 98
#> 2 2 37 2 32 52 37 79 37 31 57
#> 3 3 5 21 54 58 25 74 43 7 14
#> 4 4 19 76 71 84 75 34 72 100 33
#> 5 5 78 52 58 42 5 28 90 21 95
#> itemx_3 itemA_n itemB_n itemC_n itemD_n itemx_n Total_1 Total_3 Total_n
#> 1 78 5 52 4 73 88 307 211 222
#> 2 82 88 78 3 3 69 160 286 241
#> 3 41 60 73 59 32 87 163 179 311
#> 4 97 86 33 81 44 22 325 336 266
#> 5 37 4 9 35 5 38 235 271 91