R中的条件过滤和数据转换;根据组和值删除行并添加其他列
Conditional filtering and data transformation in R; removing rows and adding additional columns based on group and value
我想转换一个现有数据集:先按组并根据条件过滤掉不再需要的行,然后添加两个额外的列,即一个状态列和一个活动实例列。这是数据框:
# Example input: one row per user per timestamp.
# DateTime is kept as text in day/month/year hour:minute form; Cluster holds
# either a cluster label or the marker "NotActive". Cluster and UserID are
# written as run-length encodings (value + repeat count), grouped by user.
rawdata <- data.frame(
  DateTime = c(
    # AAA (13 rows)
    "20/02/2021 13:00", "20/02/2021 14:00", "20/02/2021 15:00",
    "20/02/2021 16:00", "20/02/2021 17:00", "20/02/2021 18:00",
    "20/02/2021 19:00", "20/02/2021 20:00", "20/02/2021 21:00",
    "20/02/2021 22:00", "20/02/2021 23:00", "21/02/2021 00:00",
    "01/03/2021 00:00",
    # BBB (13 rows)
    "01/03/2021 01:00", "01/03/2021 02:00", "01/03/2021 03:00",
    "01/03/2021 04:00", "01/03/2021 05:00", "01/03/2021 06:00",
    "01/03/2021 07:00", "01/03/2021 08:00", "01/03/2021 09:00",
    "01/03/2021 10:00", "01/03/2021 11:00", "01/03/2021 12:00",
    "01/03/2021 13:00",
    # DDD (10 rows)
    "20/02/2021 13:00", "20/02/2021 14:00", "20/02/2021 15:00",
    "20/02/2021 16:00", "20/02/2021 17:00", "20/02/2021 18:00",
    "20/02/2021 19:00", "20/02/2021 20:00", "20/02/2021 21:00",
    "20/02/2021 22:00"
  ),
  Cluster = rep(
    c(
      # AAA
      "Cluster 3", "NotActive", "Cluster 2", "Cluster 1", "Cluster 3",
      "NotActive",
      # BBB
      "Cluster 5", "Cluster 4", "NotActive", "Cluster 2", "Cluster 3",
      "NotActive",
      # DDD
      "NotActive", "Cluster 1", "Cluster 2", "NotActive"
    ),
    times = c(
      4L, 3L, 1L, 1L, 1L, 3L,
      2L, 1L, 4L, 2L, 1L, 3L,
      6L, 1L, 1L, 2L
    )
  ),
  UserID = rep(c("AAA", "BBB", "DDD"), times = c(13L, 13L, 10L)),
  stringsAsFactors = FALSE
)
print(rawdata)
DateTime Cluster UserID
DateTime Cluster UserID
1 20/02/2021 13:00 Cluster 3 AAA
2 20/02/2021 14:00 Cluster 3 AAA
3 20/02/2021 15:00 Cluster 3 AAA
4 20/02/2021 16:00 Cluster 3 AAA
5 20/02/2021 17:00 NotActive AAA
6 20/02/2021 18:00 NotActive AAA
7 20/02/2021 19:00 NotActive AAA
8 20/02/2021 20:00 Cluster 2 AAA
9 20/02/2021 21:00 Cluster 1 AAA
10 20/02/2021 22:00 Cluster 3 AAA
11 20/02/2021 23:00 NotActive AAA
12 21/02/2021 00:00 NotActive AAA
13 01/03/2021 00:00 NotActive AAA
14 01/03/2021 01:00 Cluster 5 BBB
15 01/03/2021 02:00 Cluster 5 BBB
16 01/03/2021 03:00 Cluster 4 BBB
17 01/03/2021 04:00 NotActive BBB
18 01/03/2021 05:00 NotActive BBB
19 01/03/2021 06:00 NotActive BBB
20 01/03/2021 07:00 NotActive BBB
21 01/03/2021 08:00 Cluster 2 BBB
22 01/03/2021 09:00 Cluster 2 BBB
23 01/03/2021 10:00 Cluster 3 BBB
24 01/03/2021 11:00 NotActive BBB
25 01/03/2021 12:00 NotActive BBB
26 01/03/2021 13:00 NotActive BBB
27 20/02/2021 13:00 NotActive DDD
28 20/02/2021 14:00 NotActive DDD
29 20/02/2021 15:00 NotActive DDD
30 20/02/2021 16:00 NotActive DDD
31 20/02/2021 17:00 NotActive DDD
32 20/02/2021 18:00 NotActive DDD
33 20/02/2021 19:00 Cluster 1 DDD
34 20/02/2021 20:00 Cluster 2 DDD
35 20/02/2021 21:00 NotActive DDD
36 20/02/2021 22:00 NotActive DDD
为了进一步说明,这里是所需的输出:-
# Expected result: the rows that should survive the filtering, plus the
# Status label and the Instance counter for each activity cycle.
# DateTime values are seconds since the epoch, tagged as POSIXct in UTC.
desiredoutput <- structure(
  list(
    DateTime = structure(
      c(
        # AAA
        1613826000, 1613829600, 1613833200, 1613836800, 1613840400,
        1613851200, 1613854800, 1613858400, 1613862000,
        # BBB
        1614560400, 1614564000, 1614567600, 1614571200,
        1614585600, 1614589200, 1614592800, 1614596400,
        # DDD
        1613847600, 1613851200, 1613854800
      ),
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    ),
    Cluster = c(
      # AAA
      "Cluster 3", "Cluster 3", "Cluster 3", "Cluster 3", "NotActive",
      "Cluster 2", "Cluster 1", "Cluster 3", "NotActive",
      # BBB
      "Cluster 5", "Cluster 5", "Cluster 4", "NotActive",
      "Cluster 2", "Cluster 2", "Cluster 3", "NotActive",
      # DDD
      "Cluster 1", "Cluster 2", "NotActive"
    ),
    UserID = rep(c("AAA", "BBB", "DDD"), times = c(9L, 8L, 3L)),
    # Five cycles of 5, 4, 4, 4 and 3 rows respectively.
    Status = c(
      "Start", rep("Ongoing", 3L), "Complete",
      "Start", rep("Ongoing", 2L), "Complete",
      "Start", rep("Ongoing", 2L), "Complete",
      "Start", rep("Ongoing", 2L), "Complete",
      "Start", "Ongoing", "Complete"
    ),
    # sequence() yields integers; the original column is double.
    Instance = as.numeric(sequence(c(5L, 4L, 4L, 4L, 3L)))
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
print(desiredoutput)
DateTime Cluster UserID Status Instance
<dttm> <chr> <chr> <chr> <dbl>
1 2021-02-20 13:00:00 Cluster 3 AAA Start 1
2 2021-02-20 14:00:00 Cluster 3 AAA Ongoing 2
3 2021-02-20 15:00:00 Cluster 3 AAA Ongoing 3
4 2021-02-20 16:00:00 Cluster 3 AAA Ongoing 4
5 2021-02-20 17:00:00 NotActive AAA Complete 5
6 2021-02-20 20:00:00 Cluster 2 AAA Start 1
7 2021-02-20 21:00:00 Cluster 1 AAA Ongoing 2
8 2021-02-20 22:00:00 Cluster 3 AAA Ongoing 3
9 2021-02-20 23:00:00 NotActive AAA Complete 4
10 2021-03-01 01:00:00 Cluster 5 BBB Start 1
11 2021-03-01 02:00:00 Cluster 5 BBB Ongoing 2
12 2021-03-01 03:00:00 Cluster 4 BBB Ongoing 3
13 2021-03-01 04:00:00 NotActive BBB Complete 4
14 2021-03-01 08:00:00 Cluster 2 BBB Start 1
15 2021-03-01 09:00:00 Cluster 2 BBB Ongoing 2
16 2021-03-01 10:00:00 Cluster 3 BBB Ongoing 3
17 2021-03-01 11:00:00 NotActive BBB Complete 4
18 2021-02-20 19:00:00 Cluster 1 DDD Start 1
19 2021-02-20 20:00:00 Cluster 2 DDD Ongoing 2
20 2021-02-20 21:00:00 NotActive DDD Complete 3
我想要的是:在 `Cluster` 列中(按 `UserID` 分组),如果出现连续的 `NotActive` 值,则只保留第一个 `NotActive` 行并丢弃其余的 `NotActive` 行,直到出现其他值为止。同样,我想创建一个 `Status` 列:每个周期的第一行对应 `Start`,第一个 `NotActive` 行对应 `Complete`,两者之间的所有行对应 `Ongoing`。最后,`Instance` 列只是在每个周期内从 1(`Start`)到 N(`Complete`)依次编号。
任何帮助将不胜感激:)
这是一个 `dplyr` 方法:
library(dplyr)

# Collapse each user's log into activity cycles and label every kept row.
# Reads the data frame `rawdata` (only Cluster and UserID are inspected) and
# returns it with repeated 'NotActive' rows removed plus two new columns,
# Status and Instance. Nothing here sorts the rows, so the input is assumed
# to already be in time order within each UserID -- confirm for real data.
rawdata %>%
  # Work on each user independently.
  group_by(UserID) %>%
  # Keep every active row, and a 'NotActive' row only when the previous row
  # was active, i.e. only the first row of each 'NotActive' run.
  filter(
    Cluster != "NotActive" | lag(Cluster != "NotActive", default = TRUE)
  ) %>%
  # A user whose log opens with 'NotActive' has no cycle to close: drop it.
  filter(!(row_number() == 1 & Cluster == "NotActive")) %>%
  # Number the cycles per user: the counter advances on the row that follows
  # a 'NotActive'. Done in an explicit grouped mutate() so the result does not
  # depend on how group_by() evaluates computed columns.
  mutate(grp = cumsum(lag(Cluster == "NotActive", default = TRUE))) %>%
  group_by(grp, .add = TRUE) %>%
  mutate(
    # First row of a cycle is 'Start', the closing 'NotActive' row is
    # 'Complete', everything else is 'Ongoing'. Keying 'Complete' on Cluster
    # (rather than on position) means a trailing cycle that never reached
    # 'NotActive' is not reported as complete, and a one-row cycle no longer
    # fails on rep("Ongoing", -1). After the filters above, a 'NotActive' row
    # is never the first row of its cycle, so the two conditions cannot clash.
    Status = case_when(
      row_number() == 1 ~ "Start",
      Cluster == "NotActive" ~ "Complete",
      TRUE ~ "Ongoing"
    ),
    # Position of the row within its cycle, starting at 1.
    Instance = row_number()
  ) %>%
  ungroup() %>%
  select(-grp)
这将返回以下数据框。
# DateTime Cluster UserID Status Instance
# <chr> <chr> <chr> <chr> <int>
# 1 20/02/2021 13:00 Cluster 3 AAA Start 1
# 2 20/02/2021 14:00 Cluster 3 AAA Ongoing 2
# 3 20/02/2021 15:00 Cluster 3 AAA Ongoing 3
# 4 20/02/2021 16:00 Cluster 3 AAA Ongoing 4
# 5 20/02/2021 17:00 NotActive AAA Complete 5
# 6 20/02/2021 20:00 Cluster 2 AAA Start 1
# 7 20/02/2021 21:00 Cluster 1 AAA Ongoing 2
# 8 20/02/2021 22:00 Cluster 3 AAA Ongoing 3
# 9 20/02/2021 23:00 NotActive AAA Complete 4
#10 01/03/2021 01:00 Cluster 5 BBB Start 1
#11 01/03/2021 02:00 Cluster 5 BBB Ongoing 2
#12 01/03/2021 03:00 Cluster 4 BBB Ongoing 3
#13 01/03/2021 04:00 NotActive BBB Complete 4
#14 01/03/2021 08:00 Cluster 2 BBB Start 1
#15 01/03/2021 09:00 Cluster 2 BBB Ongoing 2
#16 01/03/2021 10:00 Cluster 3 BBB Ongoing 3
#17 01/03/2021 11:00 NotActive BBB Complete 4
#18 20/02/2021 19:00 Cluster 1 DDD Start 1
#19 20/02/2021 20:00 Cluster 2 DDD Ongoing 2
#20 20/02/2021 21:00 NotActive DDD Complete 3
我想转换一个现有数据集:先按组并根据条件过滤掉不再需要的行,然后添加两个额外的列,即一个状态列和一个活动实例列。这是数据框:
# Example input: one row per user per timestamp.
# DateTime is kept as text in day/month/year hour:minute form; Cluster holds
# either a cluster label or the marker "NotActive". Cluster and UserID are
# written as run-length encodings (value + repeat count), grouped by user.
rawdata <- data.frame(
  DateTime = c(
    # AAA (13 rows)
    "20/02/2021 13:00", "20/02/2021 14:00", "20/02/2021 15:00",
    "20/02/2021 16:00", "20/02/2021 17:00", "20/02/2021 18:00",
    "20/02/2021 19:00", "20/02/2021 20:00", "20/02/2021 21:00",
    "20/02/2021 22:00", "20/02/2021 23:00", "21/02/2021 00:00",
    "01/03/2021 00:00",
    # BBB (13 rows)
    "01/03/2021 01:00", "01/03/2021 02:00", "01/03/2021 03:00",
    "01/03/2021 04:00", "01/03/2021 05:00", "01/03/2021 06:00",
    "01/03/2021 07:00", "01/03/2021 08:00", "01/03/2021 09:00",
    "01/03/2021 10:00", "01/03/2021 11:00", "01/03/2021 12:00",
    "01/03/2021 13:00",
    # DDD (10 rows)
    "20/02/2021 13:00", "20/02/2021 14:00", "20/02/2021 15:00",
    "20/02/2021 16:00", "20/02/2021 17:00", "20/02/2021 18:00",
    "20/02/2021 19:00", "20/02/2021 20:00", "20/02/2021 21:00",
    "20/02/2021 22:00"
  ),
  Cluster = rep(
    c(
      # AAA
      "Cluster 3", "NotActive", "Cluster 2", "Cluster 1", "Cluster 3",
      "NotActive",
      # BBB
      "Cluster 5", "Cluster 4", "NotActive", "Cluster 2", "Cluster 3",
      "NotActive",
      # DDD
      "NotActive", "Cluster 1", "Cluster 2", "NotActive"
    ),
    times = c(
      4L, 3L, 1L, 1L, 1L, 3L,
      2L, 1L, 4L, 2L, 1L, 3L,
      6L, 1L, 1L, 2L
    )
  ),
  UserID = rep(c("AAA", "BBB", "DDD"), times = c(13L, 13L, 10L)),
  stringsAsFactors = FALSE
)
print(rawdata)
DateTime Cluster UserID
DateTime Cluster UserID
1 20/02/2021 13:00 Cluster 3 AAA
2 20/02/2021 14:00 Cluster 3 AAA
3 20/02/2021 15:00 Cluster 3 AAA
4 20/02/2021 16:00 Cluster 3 AAA
5 20/02/2021 17:00 NotActive AAA
6 20/02/2021 18:00 NotActive AAA
7 20/02/2021 19:00 NotActive AAA
8 20/02/2021 20:00 Cluster 2 AAA
9 20/02/2021 21:00 Cluster 1 AAA
10 20/02/2021 22:00 Cluster 3 AAA
11 20/02/2021 23:00 NotActive AAA
12 21/02/2021 00:00 NotActive AAA
13 01/03/2021 00:00 NotActive AAA
14 01/03/2021 01:00 Cluster 5 BBB
15 01/03/2021 02:00 Cluster 5 BBB
16 01/03/2021 03:00 Cluster 4 BBB
17 01/03/2021 04:00 NotActive BBB
18 01/03/2021 05:00 NotActive BBB
19 01/03/2021 06:00 NotActive BBB
20 01/03/2021 07:00 NotActive BBB
21 01/03/2021 08:00 Cluster 2 BBB
22 01/03/2021 09:00 Cluster 2 BBB
23 01/03/2021 10:00 Cluster 3 BBB
24 01/03/2021 11:00 NotActive BBB
25 01/03/2021 12:00 NotActive BBB
26 01/03/2021 13:00 NotActive BBB
27 20/02/2021 13:00 NotActive DDD
28 20/02/2021 14:00 NotActive DDD
29 20/02/2021 15:00 NotActive DDD
30 20/02/2021 16:00 NotActive DDD
31 20/02/2021 17:00 NotActive DDD
32 20/02/2021 18:00 NotActive DDD
33 20/02/2021 19:00 Cluster 1 DDD
34 20/02/2021 20:00 Cluster 2 DDD
35 20/02/2021 21:00 NotActive DDD
36 20/02/2021 22:00 NotActive DDD
为了进一步说明,这里是所需的输出:-
# Expected result: the rows that should survive the filtering, plus the
# Status label and the Instance counter for each activity cycle.
# DateTime values are seconds since the epoch, tagged as POSIXct in UTC.
desiredoutput <- structure(
  list(
    DateTime = structure(
      c(
        # AAA
        1613826000, 1613829600, 1613833200, 1613836800, 1613840400,
        1613851200, 1613854800, 1613858400, 1613862000,
        # BBB
        1614560400, 1614564000, 1614567600, 1614571200,
        1614585600, 1614589200, 1614592800, 1614596400,
        # DDD
        1613847600, 1613851200, 1613854800
      ),
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    ),
    Cluster = c(
      # AAA
      "Cluster 3", "Cluster 3", "Cluster 3", "Cluster 3", "NotActive",
      "Cluster 2", "Cluster 1", "Cluster 3", "NotActive",
      # BBB
      "Cluster 5", "Cluster 5", "Cluster 4", "NotActive",
      "Cluster 2", "Cluster 2", "Cluster 3", "NotActive",
      # DDD
      "Cluster 1", "Cluster 2", "NotActive"
    ),
    UserID = rep(c("AAA", "BBB", "DDD"), times = c(9L, 8L, 3L)),
    # Five cycles of 5, 4, 4, 4 and 3 rows respectively.
    Status = c(
      "Start", rep("Ongoing", 3L), "Complete",
      "Start", rep("Ongoing", 2L), "Complete",
      "Start", rep("Ongoing", 2L), "Complete",
      "Start", rep("Ongoing", 2L), "Complete",
      "Start", "Ongoing", "Complete"
    ),
    # sequence() yields integers; the original column is double.
    Instance = as.numeric(sequence(c(5L, 4L, 4L, 4L, 3L)))
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
print(desiredoutput)
DateTime Cluster UserID Status Instance
<dttm> <chr> <chr> <chr> <dbl>
1 2021-02-20 13:00:00 Cluster 3 AAA Start 1
2 2021-02-20 14:00:00 Cluster 3 AAA Ongoing 2
3 2021-02-20 15:00:00 Cluster 3 AAA Ongoing 3
4 2021-02-20 16:00:00 Cluster 3 AAA Ongoing 4
5 2021-02-20 17:00:00 NotActive AAA Complete 5
6 2021-02-20 20:00:00 Cluster 2 AAA Start 1
7 2021-02-20 21:00:00 Cluster 1 AAA Ongoing 2
8 2021-02-20 22:00:00 Cluster 3 AAA Ongoing 3
9 2021-02-20 23:00:00 NotActive AAA Complete 4
10 2021-03-01 01:00:00 Cluster 5 BBB Start 1
11 2021-03-01 02:00:00 Cluster 5 BBB Ongoing 2
12 2021-03-01 03:00:00 Cluster 4 BBB Ongoing 3
13 2021-03-01 04:00:00 NotActive BBB Complete 4
14 2021-03-01 08:00:00 Cluster 2 BBB Start 1
15 2021-03-01 09:00:00 Cluster 2 BBB Ongoing 2
16 2021-03-01 10:00:00 Cluster 3 BBB Ongoing 3
17 2021-03-01 11:00:00 NotActive BBB Complete 4
18 2021-02-20 19:00:00 Cluster 1 DDD Start 1
19 2021-02-20 20:00:00 Cluster 2 DDD Ongoing 2
20 2021-02-20 21:00:00 NotActive DDD Complete 3
我想要的是:在 `Cluster` 列中(按 `UserID` 分组),如果出现连续的 `NotActive` 值,则只保留第一个 `NotActive` 行并丢弃其余的 `NotActive` 行,直到出现其他值为止。同样,我想创建一个 `Status` 列:每个周期的第一行对应 `Start`,第一个 `NotActive` 行对应 `Complete`,两者之间的所有行对应 `Ongoing`。最后,`Instance` 列只是在每个周期内从 1(`Start`)到 N(`Complete`)依次编号。
任何帮助将不胜感激:)
这是一个 `dplyr` 方法:
library(dplyr)

# Collapse each user's log into activity cycles and label every kept row.
# Reads the data frame `rawdata` (only Cluster and UserID are inspected) and
# returns it with repeated 'NotActive' rows removed plus two new columns,
# Status and Instance. Nothing here sorts the rows, so the input is assumed
# to already be in time order within each UserID -- confirm for real data.
rawdata %>%
  # Work on each user independently.
  group_by(UserID) %>%
  # Keep every active row, and a 'NotActive' row only when the previous row
  # was active, i.e. only the first row of each 'NotActive' run.
  filter(
    Cluster != "NotActive" | lag(Cluster != "NotActive", default = TRUE)
  ) %>%
  # A user whose log opens with 'NotActive' has no cycle to close: drop it.
  filter(!(row_number() == 1 & Cluster == "NotActive")) %>%
  # Number the cycles per user: the counter advances on the row that follows
  # a 'NotActive'. Done in an explicit grouped mutate() so the result does not
  # depend on how group_by() evaluates computed columns.
  mutate(grp = cumsum(lag(Cluster == "NotActive", default = TRUE))) %>%
  group_by(grp, .add = TRUE) %>%
  mutate(
    # First row of a cycle is 'Start', the closing 'NotActive' row is
    # 'Complete', everything else is 'Ongoing'. Keying 'Complete' on Cluster
    # (rather than on position) means a trailing cycle that never reached
    # 'NotActive' is not reported as complete, and a one-row cycle no longer
    # fails on rep("Ongoing", -1). After the filters above, a 'NotActive' row
    # is never the first row of its cycle, so the two conditions cannot clash.
    Status = case_when(
      row_number() == 1 ~ "Start",
      Cluster == "NotActive" ~ "Complete",
      TRUE ~ "Ongoing"
    ),
    # Position of the row within its cycle, starting at 1.
    Instance = row_number()
  ) %>%
  ungroup() %>%
  select(-grp)
这将返回以下数据框。
# DateTime Cluster UserID Status Instance
# <chr> <chr> <chr> <chr> <int>
# 1 20/02/2021 13:00 Cluster 3 AAA Start 1
# 2 20/02/2021 14:00 Cluster 3 AAA Ongoing 2
# 3 20/02/2021 15:00 Cluster 3 AAA Ongoing 3
# 4 20/02/2021 16:00 Cluster 3 AAA Ongoing 4
# 5 20/02/2021 17:00 NotActive AAA Complete 5
# 6 20/02/2021 20:00 Cluster 2 AAA Start 1
# 7 20/02/2021 21:00 Cluster 1 AAA Ongoing 2
# 8 20/02/2021 22:00 Cluster 3 AAA Ongoing 3
# 9 20/02/2021 23:00 NotActive AAA Complete 4
#10 01/03/2021 01:00 Cluster 5 BBB Start 1
#11 01/03/2021 02:00 Cluster 5 BBB Ongoing 2
#12 01/03/2021 03:00 Cluster 4 BBB Ongoing 3
#13 01/03/2021 04:00 NotActive BBB Complete 4
#14 01/03/2021 08:00 Cluster 2 BBB Start 1
#15 01/03/2021 09:00 Cluster 2 BBB Ongoing 2
#16 01/03/2021 10:00 Cluster 3 BBB Ongoing 3
#17 01/03/2021 11:00 NotActive BBB Complete 4
#18 20/02/2021 19:00 Cluster 1 DDD Start 1
#19 20/02/2021 20:00 Cluster 2 DDD Ongoing 2
#20 20/02/2021 21:00 NotActive DDD Complete 3