将多个 dplyr 命令合并为单个 dplyr 命令
Converting Multiple dplyr Commands into a Single dplyr Command
我正在使用 R 编程语言。
我有以下数据集(“my_data”):
# Example data: 50 rows with an id (idd), a two-level label (group_1),
# a numeric value (v1) and a repeating 1..10 counter (group_2).
# The result is assigned to `my_data`, the name used by all code below.
my_data <- structure(list(idd = 1:50, group_1 = c("B", "B", "A", "B", "B",
"A", "A", "A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A",
"B", "B", "A", "B", "A", "B", "A", "B", "B", "A", "B", "B", "B",
"A", "B", "A", "B", "B", "A", "A", "A", "A", "A", "B", "B", "B",
"A", "B", "B", "B", "B", "B"), v1 = c(15.7296737049317, -4.33377704672207,
-0.551850185265, 2.66888122578048, 12.109072642513, 0.0107927293899017,
20.7785032320562, -1.98974382507874, 12.1663703518471, 11.4308702978893,
-0.657500910529805, 5.71376589298221, 3.43820523228653, 19.5939432685761,
25.5605263610222, -0.407964337882465, 19.3057240854025, 9.24554068987809,
-9.6719534905096, 2.44096357354807, 14.6114916050676, 11.4510663104787,
-14.4231132108142, 15.8031868545157, 16.5505199848675, 6.95491162740581,
2.92431767382703, 29.7157201447823, 9.10001319352251, 9.85982748068076,
-1.23456937110154, -3.44130123376206, -5.23155771062088, 5.78031789617826,
23.6092446408098, 27.5379484533487, 25.6836473435279, 22.9675556994775,
7.62403748556388, -2.24150135680706, 6.72187319859928, -14.1245027627225,
6.8620712655661, 26.5987870464572, 11.3095310060752, 20.9588868268958,
14.8934095694391, 2.21089704551347, 27.4355935292935, 9.21612714668934
), group_2 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)), row.names = c(NA, -50L), class = "data.frame")
head(my_data)
idd group_1 v1 group_2
1 1 B 15.72967370 1
2 2 B -4.33377705 2
3 3 A -0.55185019 3
4 4 B 2.66888123 4
5 5 B 12.10907264 5
6 6 A 0.01079273 6
7 7 A 20.77850323 7
8 8 A -1.98974383 8
9 9 B 12.16637035 9
10 10 A 11.43087030 10
11 11 A -0.65750091 1
12 12 B 5.71376589 2
对于这个数据集,我想在“dplyr”中执行以下步骤:
- 对于每组 10 行,分别求 group_1 = "A" 和 group_1 = "B" 的 "v1" 之和
- 对于这些分组中的每一个,创建一个新变量(“v2”):如果 sum(group_1 = A) > sum(group_1 = B),则为 "A";如果 sum(group_1 = A) < sum(group_1 = B),则为 "B";如果 sum(group_1 = A) = sum(group_1 = B),则为 "0"
我知道如何在 R 中手动执行此操作:
# STEP 1: since my_data has 50 rows, break my_data into 5 groups of 10 rows
rows_1 <- my_data[1:10, ]
rows_2 <- my_data[11:20, ]
rows_3 <- my_data[21:30, ]
rows_4 <- my_data[31:40, ]
rows_5 <- my_data[41:50, ]

# STEP 2: find out values of "v2"
library(dplyr)

# Returns the "v2" label for one chunk of rows: "A" when the v1 total of
# group_1 == "A" is larger, "B" when the total of group_1 == "B" is larger,
# and "0" when both totals are equal. Always returns a character scalar.
chunk_label <- function(chunk) {
  totals <- chunk %>%
    group_by(group_1) %>%
    summarize(sum = sum(v1))
  # Look the totals up by group name instead of by row position, so a chunk
  # that contains only one of the two groups is not compared against NA
  # (a missing group contributes a total of 0).
  sum_a <- sum(totals$sum[totals$group_1 == "A"])
  sum_b <- sum(totals$sum[totals$group_1 == "B"])
  if (sum_a > sum_b) {
    "A"
  } else if (sum_a < sum_b) {
    "B"
  } else {
    "0"
  }
}

# STEP 3: append "v2" to each of the 5 chunks (the single label is recycled
# over all rows of the chunk)
rows_1$v2 <- chunk_label(rows_1)
rows_2$v2 <- chunk_label(rows_2)
rows_3$v2 <- chunk_label(rows_3)
rows_4$v2 <- chunk_label(rows_4)
rows_5$v2 <- chunk_label(rows_5)

# STEP 4: create final file:
final_file <- rbind(rows_1, rows_2, rows_3, rows_4, rows_5)
因此,最终文件如下所示:
idd group_1 v1 group_2 v2
1 1 B 15.72967370 1 B
2 2 B -4.33377705 2 B
3 3 A -0.55185019 3 B
4 4 B 2.66888123 4 B
5 5 B 12.10907264 5 B
6 6 A 0.01079273 6 B
7 7 A 20.77850323 7 B
8 8 A -1.98974383 8 B
9 9 B 12.16637035 9 B
10 10 A 11.43087030 10 B
11 11 A -0.65750091 1 A
我的问题:有人可以告诉我如何在单个“dplyr”命令中执行步骤 1 到步骤 4 吗?
谢谢!
- 首先,我将创建一个
group_index
以将每 10 行分组在一起。
- 然后
group_by
相关列并计算总和。
- 移除
group_1
的分组层,因为我们需要比较A
和B
的值。
- 如果 sum 只有 1 个不同的取值,说明各类别的总和相等,则在 v2 列中填入 “0”;否则填入总和最大的 group_1 类别。
- 最后删除
sum
列并按 idd
排序。
这种方法可以解决group_1
中两个以上组的问题。
例如,此处显示前 20 行。
library(tidyverse)
# Adds v2 to every row: the group_1 category with the largest v1 total inside
# each block of 10 consecutive rows, or "0" when all categories tie.
my_data %>%
  # Block id: rows 1-10 -> 0, rows 11-20 -> 1, ... Also valid when the row
  # count is not a multiple of 10 (the last block is simply shorter).
  mutate(group_index = (row_number() - 1) %/% 10) %>%
  group_by(group_index, group_1) %>%
  mutate(sum = sum(v1)) %>%
  # Drop the group_1 layer so the per-category totals can be compared.
  group_by(group_index) %>%
  mutate(
    # Plain if/else on scalar conditions; both branches return character so
    # every block yields the same column type. A block that contains a single
    # category is not treated as a tie.
    v2 = if (length(unique(group_1)) > 1 && length(unique(sum)) == 1) {
      "0"
    } else {
      group_1[which.max(sum)]
    }
  ) %>%
  ungroup() %>%
  select(-c(sum, group_index))
# A tibble: 20 x 5
idd group_1 v1 group_2 v2
<int> <chr> <dbl> <int> <chr>
1 1 B 15.7 1 B
2 2 B -4.33 2 B
3 3 A -0.552 3 B
4 4 B 2.67 4 B
5 5 B 12.1 5 B
6 6 A 0.0108 6 B
7 7 A 20.8 7 B
8 8 A -1.99 8 B
9 9 B 12.2 9 B
10 10 A 11.4 10 B
11 11 A -0.658 1 A
12 12 B 5.71 2 A
13 13 B 3.44 3 A
14 14 B 19.6 4 A
15 15 A 25.6 5 A
16 16 A -0.408 6 A
17 17 A 19.3 7 A
18 18 A 9.25 8 A
19 19 B -9.67 9 A
20 20 B 2.44 10 A
这是替代方法。
library(tidyverse)
# Alternative: compare the "A" and "B" totals of v1 directly within each
# block of 10 consecutive rows and label the block "A", "B" or "0" (tie).
my_data %>%
  # Block id from the row position; works for any number of rows.
  mutate(group_index = (row_number() - 1) %/% 10) %>%
  group_by(group_index) %>%
  mutate(
    v2 = case_when(
      sum(v1[group_1 == "A"]) > sum(v1[group_1 == "B"]) ~ "A",
      sum(v1[group_1 == "A"]) < sum(v1[group_1 == "B"]) ~ "B",
      TRUE ~ "0"
    )
  ) %>%
  # Return an ungrouped result without the helper column, matching the
  # requested layout (idd, group_1, v1, group_2, v2).
  ungroup() %>%
  select(-group_index)
我正在使用 R 编程语言。
我有以下数据集(“my_data”):
# Example data: 50 rows with an id (idd), a two-level label (group_1),
# a numeric value (v1) and a repeating 1..10 counter (group_2).
# The result is assigned to `my_data`, the name used by all code below.
my_data <- structure(list(idd = 1:50, group_1 = c("B", "B", "A", "B", "B",
"A", "A", "A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A",
"B", "B", "A", "B", "A", "B", "A", "B", "B", "A", "B", "B", "B",
"A", "B", "A", "B", "B", "A", "A", "A", "A", "A", "B", "B", "B",
"A", "B", "B", "B", "B", "B"), v1 = c(15.7296737049317, -4.33377704672207,
-0.551850185265, 2.66888122578048, 12.109072642513, 0.0107927293899017,
20.7785032320562, -1.98974382507874, 12.1663703518471, 11.4308702978893,
-0.657500910529805, 5.71376589298221, 3.43820523228653, 19.5939432685761,
25.5605263610222, -0.407964337882465, 19.3057240854025, 9.24554068987809,
-9.6719534905096, 2.44096357354807, 14.6114916050676, 11.4510663104787,
-14.4231132108142, 15.8031868545157, 16.5505199848675, 6.95491162740581,
2.92431767382703, 29.7157201447823, 9.10001319352251, 9.85982748068076,
-1.23456937110154, -3.44130123376206, -5.23155771062088, 5.78031789617826,
23.6092446408098, 27.5379484533487, 25.6836473435279, 22.9675556994775,
7.62403748556388, -2.24150135680706, 6.72187319859928, -14.1245027627225,
6.8620712655661, 26.5987870464572, 11.3095310060752, 20.9588868268958,
14.8934095694391, 2.21089704551347, 27.4355935292935, 9.21612714668934
), group_2 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)), row.names = c(NA, -50L), class = "data.frame")
head(my_data)
idd group_1 v1 group_2
1 1 B 15.72967370 1
2 2 B -4.33377705 2
3 3 A -0.55185019 3
4 4 B 2.66888123 4
5 5 B 12.10907264 5
6 6 A 0.01079273 6
7 7 A 20.77850323 7
8 8 A -1.98974383 8
9 9 B 12.16637035 9
10 10 A 11.43087030 10
11 11 A -0.65750091 1
12 12 B 5.71376589 2
对于这个数据集,我想在“dplyr”中执行以下步骤:
- 对于每组 10 行,分别求 group_1 = "A" 和 group_1 = "B" 的 "v1" 之和
- 对于这些分组中的每一个,创建一个新变量(“v2”):如果 sum(group_1 = A) > sum(group_1 = B),则为 "A";如果 sum(group_1 = A) < sum(group_1 = B),则为 "B";如果 sum(group_1 = A) = sum(group_1 = B),则为 "0"
我知道如何在 R 中手动执行此操作:
# STEP 1: since my_data has 50 rows, break my_data into 5 groups of 10 rows
rows_1 <- my_data[1:10, ]
rows_2 <- my_data[11:20, ]
rows_3 <- my_data[21:30, ]
rows_4 <- my_data[31:40, ]
rows_5 <- my_data[41:50, ]

# STEP 2: find out values of "v2"
library(dplyr)

# Returns the "v2" label for one chunk of rows: "A" when the v1 total of
# group_1 == "A" is larger, "B" when the total of group_1 == "B" is larger,
# and "0" when both totals are equal. Always returns a character scalar.
chunk_label <- function(chunk) {
  totals <- chunk %>%
    group_by(group_1) %>%
    summarize(sum = sum(v1))
  # Look the totals up by group name instead of by row position, so a chunk
  # that contains only one of the two groups is not compared against NA
  # (a missing group contributes a total of 0).
  sum_a <- sum(totals$sum[totals$group_1 == "A"])
  sum_b <- sum(totals$sum[totals$group_1 == "B"])
  if (sum_a > sum_b) {
    "A"
  } else if (sum_a < sum_b) {
    "B"
  } else {
    "0"
  }
}

# STEP 3: append "v2" to each of the 5 chunks (the single label is recycled
# over all rows of the chunk)
rows_1$v2 <- chunk_label(rows_1)
rows_2$v2 <- chunk_label(rows_2)
rows_3$v2 <- chunk_label(rows_3)
rows_4$v2 <- chunk_label(rows_4)
rows_5$v2 <- chunk_label(rows_5)

# STEP 4: create final file:
final_file <- rbind(rows_1, rows_2, rows_3, rows_4, rows_5)
因此,最终文件如下所示:
idd group_1 v1 group_2 v2
1 1 B 15.72967370 1 B
2 2 B -4.33377705 2 B
3 3 A -0.55185019 3 B
4 4 B 2.66888123 4 B
5 5 B 12.10907264 5 B
6 6 A 0.01079273 6 B
7 7 A 20.77850323 7 B
8 8 A -1.98974383 8 B
9 9 B 12.16637035 9 B
10 10 A 11.43087030 10 B
11 11 A -0.65750091 1 A
我的问题:有人可以告诉我如何在单个“dplyr”命令中执行步骤 1 到步骤 4 吗?
谢谢!
- 首先,我将创建一个
group_index
以将每 10 行分组在一起。 - 然后
group_by
相关列并计算总和。 - 移除
group_1
的分组层,因为我们需要比较A
和B
的值。 - 如果 sum 只有 1 个不同的取值,说明各类别的总和相等,则在 v2 列中填入 “0”;否则填入总和最大的 group_1 类别。
- 最后删除
sum
列并按idd
排序。
这种方法可以解决group_1
中两个以上组的问题。
例如,此处显示前 20 行。
library(tidyverse)
# Adds v2 to every row: the group_1 category with the largest v1 total inside
# each block of 10 consecutive rows, or "0" when all categories tie.
my_data %>%
  # Block id: rows 1-10 -> 0, rows 11-20 -> 1, ... Also valid when the row
  # count is not a multiple of 10 (the last block is simply shorter).
  mutate(group_index = (row_number() - 1) %/% 10) %>%
  group_by(group_index, group_1) %>%
  mutate(sum = sum(v1)) %>%
  # Drop the group_1 layer so the per-category totals can be compared.
  group_by(group_index) %>%
  mutate(
    # Plain if/else on scalar conditions; both branches return character so
    # every block yields the same column type. A block that contains a single
    # category is not treated as a tie.
    v2 = if (length(unique(group_1)) > 1 && length(unique(sum)) == 1) {
      "0"
    } else {
      group_1[which.max(sum)]
    }
  ) %>%
  ungroup() %>%
  select(-c(sum, group_index))
# A tibble: 20 x 5
idd group_1 v1 group_2 v2
<int> <chr> <dbl> <int> <chr>
1 1 B 15.7 1 B
2 2 B -4.33 2 B
3 3 A -0.552 3 B
4 4 B 2.67 4 B
5 5 B 12.1 5 B
6 6 A 0.0108 6 B
7 7 A 20.8 7 B
8 8 A -1.99 8 B
9 9 B 12.2 9 B
10 10 A 11.4 10 B
11 11 A -0.658 1 A
12 12 B 5.71 2 A
13 13 B 3.44 3 A
14 14 B 19.6 4 A
15 15 A 25.6 5 A
16 16 A -0.408 6 A
17 17 A 19.3 7 A
18 18 A 9.25 8 A
19 19 B -9.67 9 A
20 20 B 2.44 10 A
这是替代方法。
library(tidyverse)
# Alternative: compare the "A" and "B" totals of v1 directly within each
# block of 10 consecutive rows and label the block "A", "B" or "0" (tie).
my_data %>%
  # Block id from the row position; works for any number of rows.
  mutate(group_index = (row_number() - 1) %/% 10) %>%
  group_by(group_index) %>%
  mutate(
    v2 = case_when(
      sum(v1[group_1 == "A"]) > sum(v1[group_1 == "B"]) ~ "A",
      sum(v1[group_1 == "A"]) < sum(v1[group_1 == "B"]) ~ "B",
      TRUE ~ "0"
    )
  ) %>%
  # Return an ungrouped result without the helper column, matching the
  # requested layout (idd, group_1, v1, group_2, v2).
  ungroup() %>%
  select(-group_index)