将多个 DPLYR 命令转换为单个 DPLYR 命令

Converting Multiple DPLYR Commands into a Single DPLYR Command

我正在使用 R 编程语言。

我有以下数据集(“my_data”):

# Example data set used by every snippet below: 50 rows with an id (idd),
# a two-level label (group_1), a numeric value (v1) and an index that cycles
# through 1..10 (group_2). It is bound to `my_data` so later code can use it.
my_data <- structure(list(idd = 1:50, group_1 = c("B", "B", "A", "B", "B", 
"A", "A", "A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A", 
"B", "B", "A", "B", "A", "B", "A", "B", "B", "A", "B", "B", "B", 
"A", "B", "A", "B", "B", "A", "A", "A", "A", "A", "B", "B", "B", 
"A", "B", "B", "B", "B", "B"), v1 = c(15.7296737049317, -4.33377704672207, 
-0.551850185265, 2.66888122578048, 12.109072642513, 0.0107927293899017, 
20.7785032320562, -1.98974382507874, 12.1663703518471, 11.4308702978893, 
-0.657500910529805, 5.71376589298221, 3.43820523228653, 19.5939432685761, 
25.5605263610222, -0.407964337882465, 19.3057240854025, 9.24554068987809, 
-9.6719534905096, 2.44096357354807, 14.6114916050676, 11.4510663104787, 
-14.4231132108142, 15.8031868545157, 16.5505199848675, 6.95491162740581, 
2.92431767382703, 29.7157201447823, 9.10001319352251, 9.85982748068076, 
-1.23456937110154, -3.44130123376206, -5.23155771062088, 5.78031789617826, 
23.6092446408098, 27.5379484533487, 25.6836473435279, 22.9675556994775, 
7.62403748556388, -2.24150135680706, 6.72187319859928, -14.1245027627225, 
6.8620712655661, 26.5987870464572, 11.3095310060752, 20.9588868268958, 
14.8934095694391, 2.21089704551347, 27.4355935292935, 9.21612714668934
), group_2 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)), row.names = c(NA, -50L), class = "data.frame")

# Show the first 12 rows (head() alone would print only 6).
head(my_data, 12)

   idd group_1          v1 group_2
1    1       B 15.72967370       1
2    2       B -4.33377705       2
3    3       A -0.55185019       3
4    4       B  2.66888123       4
5    5       B 12.10907264       5
6    6       A  0.01079273       6
7    7       A 20.77850323       7
8    8       A -1.98974383       8
9    9       B 12.16637035       9
10  10       A 11.43087030      10
11  11       A -0.65750091       1
12  12       B  5.71376589       2

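对于这个数据集,我想用“dplyr”完成以下步骤:把数据按每 10 行分成一块;在每一块内按 group_1 分别计算 v1 的总和;如果 A 的总和较大,则该块所有行的 v2 为 "A",如果 B 的总和较大则为 "B",两者相等则为 0;最后把各块重新合并成一个文件。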

我知道如何在 R 中手动执行此操作:

# Manual reference implementation of steps 1-4. Each block of 10 rows gets a
# new column v2 naming the group_1 level ("A" or "B") whose v1 values have the
# larger sum inside that block, or "0" when the two sums are equal.

library(dplyr)

# STEP 1: since my_data has 50 rows, break my_data into 5 groups of 10 rows

rows_1 <- my_data[1:10, ]
rows_2 <- my_data[11:20, ]
rows_3 <- my_data[21:30, ]
rows_4 <- my_data[31:40, ]
rows_5 <- my_data[41:50, ]

# STEP 2: find out values of "v2"

# Compare two sums and return the winning label as a character scalar.
# Always returns character ("A", "B", "0" or NA) so the type of v2 does not
# depend on which branch is taken.
pick_v2 <- function(sum_a, sum_b) {
  if (is.na(sum_a) || is.na(sum_b)) {
    NA_character_
  } else if (sum_a > sum_b) {
    "A"
  } else if (sum_a < sum_b) {
    "B"
  } else {
    "0"
  }
}

# Sum v1 per group_1 level for one block of rows and attach the winning
# label as column v2. summarize() returns the levels in sorted order, so row 1
# is compared with row 2 (i.e. "A" vs "B" when both levels are present); a
# missing second row yields NA.
summarise_chunk <- function(rows) {
  sums <- rows %>%
    group_by(group_1) %>%
    summarize(sum = sum(v1)) %>%
    as.data.frame()
  sums$v2 <- pick_v2(sums$sum[1], sums$sum[2])
  sums
}

dplyr_row_1 <- summarise_chunk(rows_1)
dplyr_row_2 <- summarise_chunk(rows_2)
dplyr_row_3 <- summarise_chunk(rows_3)
dplyr_row_4 <- summarise_chunk(rows_4)
dplyr_row_5 <- summarise_chunk(rows_5)

# STEP 3: append "v2" to first 5 files:

# v2 is identical in every row of a summary, so its first value is recycled
# across all rows of the corresponding block.
rows_1$v2 <- dplyr_row_1$v2[1]
rows_2$v2 <- dplyr_row_2$v2[1]
rows_3$v2 <- dplyr_row_3$v2[1]
rows_4$v2 <- dplyr_row_4$v2[1]
rows_5$v2 <- dplyr_row_5$v2[1]

# STEP 4: create final file:

final_file <- rbind(rows_1, rows_2, rows_3, rows_4, rows_5)

因此,最终文件如下所示:

  idd group_1          v1 group_2 v2
1    1       B 15.72967370       1  B
2    2       B -4.33377705       2  B
3    3       A -0.55185019       3  B
4    4       B  2.66888123       4  B
5    5       B 12.10907264       5  B
6    6       A  0.01079273       6  B
7    7       A 20.77850323       7  B
8    8       A -1.98974383       8  B
9    9       B 12.16637035       9  B
10  10       A 11.43087030      10  B
11  11       A -0.65750091       1  A

我的问题:有人可以告诉我如何在单个“dplyr”命令中执行步骤 1 到步骤 4 吗?

谢谢!

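  1. 首先,创建一个 group_index 列,把每 10 行归为一组。
  2. 然后按 group_index 和 group_1 分组(group_by),计算每组 v1 的总和 sum。
  3. 去掉 group_1 这一层分组(只保留 group_index),因为我们需要在同一块内比较 A 和 B 的总和。
  4. 如果某一块内 sum 的唯一值个数等于 1,说明各类别的总和相同,则在 v2 列中填入 0;否则填入 group_1 中总和最大的那个类别。
  5. 最后删除 sum 和 group_index 这两个辅助列;mutate 不会改变行的顺序,因此结果仍按 idd 排列。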

即使 group_1 中有两个以上的类别,这种方法也同样适用。

例如,此处显示前 20 行。

library(tidyverse)

# Label every block of 10 consecutive rows with the group_1 category that has
# the largest sum of v1 inside that block ("0" when all category sums are
# equal). Works for any number of group_1 categories.
df %>%
  # Block index: rows 1-10 -> 1, rows 11-20 -> 2, ... Unlike
  # rep(1:(nrow(df) / 10), each = 10) this also works when the number of rows
  # is not a multiple of 10 (the last block is simply shorter).
  mutate(group_index = (row_number() - 1) %/% 10 + 1) %>%
  group_by(group_index, group_1) %>%
  mutate(sum = sum(v1)) %>%
  # Drop the group_1 layer so the per-category sums can be compared per block.
  group_by(group_index) %>%
  mutate(
    # Scalar if/else returning character in both branches: mixing numeric 0
    # with character labels across groups cannot be combined by mutate().
    v2 = if (length(unique(sum)) == 1) "0" else group_1[which.max(sum)]
  ) %>%
  ungroup() %>%
  select(-c(sum, group_index))

# A tibble: 20 x 5
     idd group_1      v1 group_2 v2   
   <int> <chr>     <dbl>   <int> <chr>
 1     1 B       15.7          1 B    
 2     2 B       -4.33         2 B    
 3     3 A       -0.552        3 B    
 4     4 B        2.67         4 B    
 5     5 B       12.1          5 B    
 6     6 A        0.0108       6 B    
 7     7 A       20.8          7 B    
 8     8 A       -1.99         8 B    
 9     9 B       12.2          9 B    
10    10 A       11.4         10 B    
11    11 A       -0.658        1 A    
12    12 B        5.71         2 A    
13    13 B        3.44         3 A    
14    14 B       19.6          4 A    
15    15 A       25.6          5 A    
16    16 A       -0.408        6 A    
17    17 A       19.3          7 A    
18    18 A        9.25         8 A    
19    19 B       -9.67         9 A    
20    20 B        2.44        10 A    

这是替代方法。

library(tidyverse)

# Alternative for exactly two categories: compare the sum of v1 for "A" with
# the sum for "B" inside every block of 10 consecutive rows.
df %>%
  # Block index: rows 1-10 -> 1, rows 11-20 -> 2, ...; also valid when the
  # number of rows is not a multiple of 10.
  mutate(group_index = (row_number() - 1) %/% 10 + 1) %>%
  group_by(group_index) %>%
  mutate(
    v2 = case_when(
      sum(v1[group_1 == "A"]) > sum(v1[group_1 == "B"]) ~ "A",
      sum(v1[group_1 == "A"]) < sum(v1[group_1 == "B"]) ~ "B",
      TRUE ~ "0"
    )
  ) %>%
  # Return an ungrouped result without the helper column, matching the
  # layout of the desired final file (idd, group_1, v1, group_2, v2).
  ungroup() %>%
  select(-group_index)