mutate_at 水平与垂直
mutate_at horizontal vs vertical
我正在尝试使用 mutate_at 仅在某些列上应用函数。
这里是数据:
structure(list(LoB = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3", "4"), class = "factor"),
AY = c(1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
2003, 2004, 2005), R_0 = c(50135, 46530, 38295, 12033, 13332,
35064, 15695, 41227, 88360, 29500, 30158, 47589), R_1 = c(76631,
4908, 30427, 4268, 1994, 48426, 4585, 15578, 8112, 30945,
8141, 11594), R_2 = c(28763, 2634, 374, 0, 216, 0, 555, 0,
7161, 2192, 0, 772), R_3 = c(0, 1409, 470, 0, 203, 0, 0,
0, 0, 1556, 0, 675), R_4 = c(16433, 0, 436, 0, 202, 2115,
0, 0, 0, 1271, 0, 535), R_5 = c(6301, 0, 0, 0, 179, 0, 0,
0, 183, 1052, 0, 0), R_6 = c(0, 0, 0, 0, 147, 0, 0, 0, 0,
982, 0, 0), R_7 = c(0, 0, 0, 0, 135, 0, 0, 0, 0, 907, 2356,
0), R_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 902, 0, 0), R_9 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 833, 0, 0), R_10 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 800, 0, 0), R_11 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 684, 0, 0)), row.names = c(NA, -12L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = "LoB", drop = TRUE, indices = list(
0:11), group_sizes = 12L, biggest_group_size = 12L, labels = structure(list(
LoB = structure(1L, .Label = c("1", "2", "3", "4"), class = "factor")), row.names = c(NA,
-1L), class = "data.frame", vars = "LoB", drop = TRUE))
如下所示:
# A tibble: 12 x 14
# Groups: LoB [1]
LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1994 50135 76631 28763 0 16433 6301 0 0 0 0 0 0
2 1 1995 46530 4908 2634 1409 0 0 0 0 0 0 0 0
3 1 1996 38295 30427 374 470 436 0 0 0 0 0 0 0
4 1 1997 12033 4268 0 0 0 0 0 0 0 0 0 0
5 1 1998 13332 1994 216 203 202 179 147 135 0 0 0 0
6 1 1999 35064 48426 0 0 2115 0 0 0 0 0 0 0
7 1 2000 15695 4585 555 0 0 0 0 0 0 0 0 0
8 1 2001 41227 15578 0 0 0 0 0 0 0 0 0 0
9 1 2002 88360 8112 7161 0 0 183 0 0 0 0 0 0
10 1 2003 29500 30945 2192 1556 1271 1052 982 907 902 833 800 684
11 1 2004 30158 8141 0 0 0 0 0 2356 0 0 0 0
12 1 2005 47589 11594 772 675 535 0 0 0 0 0 0 0
假设我想创建以 R_
开头的列的累计总和。为此,我写道:
# mutate_at() applies cumsum() to each selected R_ column separately, so the
# running total goes down the rows of a column rather than across a row.
df %>% mutate_at(vars(contains("R_")), funs(cumsum))
这给了我以下输出:
# A tibble: 12 x 14
# Groups: LoB [1]
LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1994 50135 76631 28763 0 16433 6301 0 0 0 0 0 0
2 1 1995 96665 81539 31397 1409 16433 6301 0 0 0 0 0 0
3 1 1996 134960 111966 31771 1879 16869 6301 0 0 0 0 0 0
4 1 1997 146993 116234 31771 1879 16869 6301 0 0 0 0 0 0
5 1 1998 160325 118228 31987 2082 17071 6480 147 135 0 0 0 0
6 1 1999 195389 166654 31987 2082 19186 6480 147 135 0 0 0 0
7 1 2000 211084 171239 32542 2082 19186 6480 147 135 0 0 0 0
8 1 2001 252311 186817 32542 2082 19186 6480 147 135 0 0 0 0
9 1 2002 340671 194929 39703 2082 19186 6663 147 135 0 0 0 0
10 1 2003 370171 225874 41895 3638 20457 7715 1129 1042 902 833 800 684
11 1 2004 400329 234015 41895 3638 20457 7715 1129 3398 902 833 800 684
12 1 2005 447918 245609 42667 4313 20992 7715 1129 3398 902 833 800 684
这里的问题是累积总和是垂直(通过变量)而不是水平进行的。我怎样才能在 dplyr 中实现这一点?
我不确定是否有不使用 gather 和 spread 的方法。下面是我的做法。首先,将数据重塑为长格式("long");然后使用 group_by,使 cumsum 只在原始 data.frame 的每一行内计算(如果现有的分组变量不足以唯一标识每一行,可以先用 row_number 给数据添加行号,再按行号 group_by)。之后先 mutate,再用 spread 将数据恢复为宽格式("wide")。最后,按照 @Gregor 的建议添加 select(names(df)),以保留原始列顺序。
# Row-wise cumulative sum: reshape to long, accumulate within each original
# row, then reshape back to wide.
df %>%
  # wide -> long; the R_ columns are stacked in their existing order
  gather(variable, value, contains("R_")) %>%
  # one group per original row (assumes LoB + AY identify a row)
  group_by(LoB, AY) %>%
  # running total across that row's R_ values
  mutate(value = cumsum(value)) %>%
  # drop the one-row groups so later verbs act on the whole table
  ungroup() %>%
  # long -> wide
  spread(variable, value) %>%
  # restore the original column order
  select(names(df))
# LoB AY R_0 R_1 R_2 R_3 ...
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> ...
# 1 1 1994 50135 126766 155529 155529 ...
# 2 1 1995 46530 51438 54072 55481 ...
# 3 1 1996 38295 68722 69096 69566 ...
# 4 1 1997 12033 16301 16301 16301 ...
与 bouncyball 的答案类似,但它保持列顺序并使用通用 ID 以防 (LoB, AY) 不是主键:
# Row-wise cumulative sum keyed on a synthetic row id, so the result does not
# depend on (LoB, AY) uniquely identifying a row.
df %>%
  # df arrives grouped by LoB; ungroup so the id is unique across the table
  ungroup() %>%
  mutate(id = row_number()) %>%
  gather(old_name, value, starts_with("R_")) %>%
  # order R_0, R_1, ..., R_9, R_10, R_11 within each row (shorter names first)
  arrange(id, nchar(old_name), old_name) %>%
  group_by(id) %>%
  mutate(value = cumsum(value)) %>%
  ungroup() %>%
  # id must still be present here: it keeps rows with identical (LoB, AY)
  # apart when reshaping back to wide
  spread(old_name, value) %>%
  # restore the original column order; this also drops the helper id column
  select(names(df))
按行运算通常在矩阵上效果更好。为了避免 gather/spread 的麻烦,我会提取 R_ 列,使用 apply(它会隐式地将数据转换为矩阵),然后将结果赋值回原始数据(代码见下)。
话虽如此,这份数据看起来并不整洁(tidy)。用 gather 将其转换为长格式并一直保持长格式可能会更好。
# Row-wise cumulative sums over the R_ columns, done as a matrix operation.
# apply(., 1, ...) returns one column per input row, so the result is
# transposed before being written back over the R_ columns of dd.
result <- t(apply(select(ungroup(dd), starts_with("R_")), 1, cumsum))
dd[, grepl("^R_", names(dd))] <- result
dd
# # A tibble: 12 x 14
# # Groups: LoB [1]
# LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1994 50135 126766 155529 155529 171962 178263 178263 178263 178263 178263 178263 178263
# 2 1 1995 46530 51438 54072 55481 55481 55481 55481 55481 55481 55481 55481 55481
# 3 1 1996 38295 68722 69096 69566 70002 70002 70002 70002 70002 70002 70002 70002
# 4 1 1997 12033 16301 16301 16301 16301 16301 16301 16301 16301 16301 16301 16301
# 5 1 1998 13332 15326 15542 15745 15947 16126 16273 16408 16408 16408 16408 16408
# 6 1 1999 35064 83490 83490 83490 85605 85605 85605 85605 85605 85605 85605 85605
# 7 1 2000 15695 20280 20835 20835 20835 20835 20835 20835 20835 20835 20835 20835
# 8 1 2001 41227 56805 56805 56805 56805 56805 56805 56805 56805 56805 56805 56805
# 9 1 2002 88360 96472 103633 103633 103633 103816 103816 103816 103816 103816 103816 103816
# 10 1 2003 29500 60445 62637 64193 65464 66516 67498 68405 69307 70140 70940 71624
# 11 1 2004 30158 38299 38299 38299 38299 38299 38299 40655 40655 40655 40655 40655
# 12 1 2005 47589 59183 59955 60630 61165 61165 61165 61165 61165 61165 61165 61165
我正在尝试使用 mutate_at 仅在某些列上应用函数。
这里是数据:
structure(list(LoB = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3", "4"), class = "factor"),
AY = c(1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
2003, 2004, 2005), R_0 = c(50135, 46530, 38295, 12033, 13332,
35064, 15695, 41227, 88360, 29500, 30158, 47589), R_1 = c(76631,
4908, 30427, 4268, 1994, 48426, 4585, 15578, 8112, 30945,
8141, 11594), R_2 = c(28763, 2634, 374, 0, 216, 0, 555, 0,
7161, 2192, 0, 772), R_3 = c(0, 1409, 470, 0, 203, 0, 0,
0, 0, 1556, 0, 675), R_4 = c(16433, 0, 436, 0, 202, 2115,
0, 0, 0, 1271, 0, 535), R_5 = c(6301, 0, 0, 0, 179, 0, 0,
0, 183, 1052, 0, 0), R_6 = c(0, 0, 0, 0, 147, 0, 0, 0, 0,
982, 0, 0), R_7 = c(0, 0, 0, 0, 135, 0, 0, 0, 0, 907, 2356,
0), R_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 902, 0, 0), R_9 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 833, 0, 0), R_10 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 800, 0, 0), R_11 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 684, 0, 0)), row.names = c(NA, -12L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = "LoB", drop = TRUE, indices = list(
0:11), group_sizes = 12L, biggest_group_size = 12L, labels = structure(list(
LoB = structure(1L, .Label = c("1", "2", "3", "4"), class = "factor")), row.names = c(NA,
-1L), class = "data.frame", vars = "LoB", drop = TRUE))
如下所示:
# A tibble: 12 x 14
# Groups: LoB [1]
LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1994 50135 76631 28763 0 16433 6301 0 0 0 0 0 0
2 1 1995 46530 4908 2634 1409 0 0 0 0 0 0 0 0
3 1 1996 38295 30427 374 470 436 0 0 0 0 0 0 0
4 1 1997 12033 4268 0 0 0 0 0 0 0 0 0 0
5 1 1998 13332 1994 216 203 202 179 147 135 0 0 0 0
6 1 1999 35064 48426 0 0 2115 0 0 0 0 0 0 0
7 1 2000 15695 4585 555 0 0 0 0 0 0 0 0 0
8 1 2001 41227 15578 0 0 0 0 0 0 0 0 0 0
9 1 2002 88360 8112 7161 0 0 183 0 0 0 0 0 0
10 1 2003 29500 30945 2192 1556 1271 1052 982 907 902 833 800 684
11 1 2004 30158 8141 0 0 0 0 0 2356 0 0 0 0
12 1 2005 47589 11594 772 675 535 0 0 0 0 0 0 0
假设我想创建以 R_
开头的列的累计总和。为此,我写道:
# mutate_at() applies cumsum() to each selected R_ column separately, so the
# running total goes down the rows of a column rather than across a row.
df %>% mutate_at(vars(contains("R_")), funs(cumsum))
这给了我以下输出:
# A tibble: 12 x 14
# Groups: LoB [1]
LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1994 50135 76631 28763 0 16433 6301 0 0 0 0 0 0
2 1 1995 96665 81539 31397 1409 16433 6301 0 0 0 0 0 0
3 1 1996 134960 111966 31771 1879 16869 6301 0 0 0 0 0 0
4 1 1997 146993 116234 31771 1879 16869 6301 0 0 0 0 0 0
5 1 1998 160325 118228 31987 2082 17071 6480 147 135 0 0 0 0
6 1 1999 195389 166654 31987 2082 19186 6480 147 135 0 0 0 0
7 1 2000 211084 171239 32542 2082 19186 6480 147 135 0 0 0 0
8 1 2001 252311 186817 32542 2082 19186 6480 147 135 0 0 0 0
9 1 2002 340671 194929 39703 2082 19186 6663 147 135 0 0 0 0
10 1 2003 370171 225874 41895 3638 20457 7715 1129 1042 902 833 800 684
11 1 2004 400329 234015 41895 3638 20457 7715 1129 3398 902 833 800 684
12 1 2005 447918 245609 42667 4313 20992 7715 1129 3398 902 833 800 684
这里的问题是累积总和是垂直(通过变量)而不是水平进行的。我怎样才能在 dplyr 中实现这一点?
我不确定是否有不使用 gather 和 spread 的方法。下面是我的做法。首先,将数据重塑为长格式("long");然后使用 group_by,使 cumsum 只在原始 data.frame 的每一行内计算(如果现有的分组变量不足以唯一标识每一行,可以先用 row_number 给数据添加行号,再按行号 group_by)。之后先 mutate,再用 spread 将数据恢复为宽格式("wide")。最后,按照 @Gregor 的建议添加 select(names(df)),以保留原始列顺序。
# Row-wise cumulative sum: reshape to long, accumulate within each original
# row, then reshape back to wide.
df %>%
  # wide -> long; the R_ columns are stacked in their existing order
  gather(variable, value, contains("R_")) %>%
  # one group per original row (assumes LoB + AY identify a row)
  group_by(LoB, AY) %>%
  # running total across that row's R_ values
  mutate(value = cumsum(value)) %>%
  # drop the one-row groups so later verbs act on the whole table
  ungroup() %>%
  # long -> wide
  spread(variable, value) %>%
  # restore the original column order
  select(names(df))
# LoB AY R_0 R_1 R_2 R_3 ...
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> ...
# 1 1 1994 50135 126766 155529 155529 ...
# 2 1 1995 46530 51438 54072 55481 ...
# 3 1 1996 38295 68722 69096 69566 ...
# 4 1 1997 12033 16301 16301 16301 ...
与 bouncyball 的答案类似,但它保持列顺序并使用通用 ID 以防 (LoB, AY) 不是主键:
# Row-wise cumulative sum keyed on a synthetic row id, so the result does not
# depend on (LoB, AY) uniquely identifying a row.
df %>%
  # df arrives grouped by LoB; ungroup so the id is unique across the table
  ungroup() %>%
  mutate(id = row_number()) %>%
  gather(old_name, value, starts_with("R_")) %>%
  # order R_0, R_1, ..., R_9, R_10, R_11 within each row (shorter names first)
  arrange(id, nchar(old_name), old_name) %>%
  group_by(id) %>%
  mutate(value = cumsum(value)) %>%
  ungroup() %>%
  # id must still be present here: it keeps rows with identical (LoB, AY)
  # apart when reshaping back to wide
  spread(old_name, value) %>%
  # restore the original column order; this also drops the helper id column
  select(names(df))
按行运算通常在矩阵上效果更好。为了避免 gather/spread 的麻烦,我会提取 R_ 列,使用 apply(它会隐式地将数据转换为矩阵),然后将结果赋值回原始数据(代码见下)。
话虽如此,这份数据看起来并不整洁(tidy)。用 gather 将其转换为长格式并一直保持长格式可能会更好。
# Row-wise cumulative sums over the R_ columns, done as a matrix operation.
# apply(., 1, ...) returns one column per input row, so the result is
# transposed before being written back over the R_ columns of dd.
result <- t(apply(select(ungroup(dd), starts_with("R_")), 1, cumsum))
dd[, grepl("^R_", names(dd))] <- result
dd
# # A tibble: 12 x 14
# # Groups: LoB [1]
# LoB AY R_0 R_1 R_2 R_3 R_4 R_5 R_6 R_7 R_8 R_9 R_10 R_11
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1994 50135 126766 155529 155529 171962 178263 178263 178263 178263 178263 178263 178263
# 2 1 1995 46530 51438 54072 55481 55481 55481 55481 55481 55481 55481 55481 55481
# 3 1 1996 38295 68722 69096 69566 70002 70002 70002 70002 70002 70002 70002 70002
# 4 1 1997 12033 16301 16301 16301 16301 16301 16301 16301 16301 16301 16301 16301
# 5 1 1998 13332 15326 15542 15745 15947 16126 16273 16408 16408 16408 16408 16408
# 6 1 1999 35064 83490 83490 83490 85605 85605 85605 85605 85605 85605 85605 85605
# 7 1 2000 15695 20280 20835 20835 20835 20835 20835 20835 20835 20835 20835 20835
# 8 1 2001 41227 56805 56805 56805 56805 56805 56805 56805 56805 56805 56805 56805
# 9 1 2002 88360 96472 103633 103633 103633 103816 103816 103816 103816 103816 103816 103816
# 10 1 2003 29500 60445 62637 64193 65464 66516 67498 68405 69307 70140 70940 71624
# 11 1 2004 30158 38299 38299 38299 38299 38299 38299 40655 40655 40655 40655 40655
# 12 1 2005 47589 59183 59955 60630 61165 61165 61165 61165 61165 61165 61165 61165