在 R 中按分组变量有条件地更改值;条件基于两个数据框中时间戳之间的时间差(difftime)
Conditionally changing values by grouping variable in R; condition based on difftime between timestamps in two dataframes
我正在尝试以有条件的方式更正数据集中的一些错误条目。我需要按组执行此操作,条件基于 2 个不同数据集的 2 个时间戳之间的差异。
以下是我正在处理的数据类型的一些示例:-
# df1: 18 rows of UserID / Value / Time. Time is POSIXct (tzone "UTC") stored
# as epoch seconds; in this sample every value falls exactly on the hour.
df1<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "CCC", "CCC", "CCC", "DDD", "DDD",
"DDD", "DDD", "DDD", "DDD"), Value = c("Group1", "Group1", "Group2",
"Group3", "Group3", "Group1", "Group2", "Group4", "Group5", "Group5",
"Group5", "Group5", "Group1", "Group2", "Group2", "Group2", "Group2",
"Group2"), Time = structure(c(1577840400, 1577844000, 1577847600,
1577966400, 1577970000, 1577973600, 1577977200, 1577977200, 1577980800,
1577984400, 1577988000, 1577991600, 1578193200, 1578196800, 1578200400,
1578204000, 1578207600, 1578211200), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -18L), class = "data.frame")
# df2: 11 rows of UserID / StartTime / EndTime, both POSIXct (tzone "UTC").
# Within each user the rows are consecutive intervals: each StartTime equals
# the previous row's EndTime.
df2<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "DDD", "DDD"), StartTime = structure(c(1577839980,
1577840460, 1577843820, 1577966580, 1577970180, 1577973360, 1577975160,
1577977920, 1577978940, 1578193200, 1578193920), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), EndTime = structure(c(1577840460,
1577843820, 1577846640, 1577970180, 1577973360, 1577975160, 1577978580,
1577978940, 1577980680, 1578193920, 1578196620), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -11L), class = "data.frame")
两个数据集如下所示:-
print(df1)
UserID Value Time
1 AAA Group1 2020-01-01 01:00:00
2 AAA Group1 2020-01-01 02:00:00
3 AAA Group2 2020-01-01 03:00:00
4 BBB Group3 2020-01-02 12:00:00
5 BBB Group3 2020-01-02 13:00:00
6 BBB Group1 2020-01-02 14:00:00
7 BBB Group2 2020-01-02 15:00:00
8 CCC Group4 2020-01-02 15:00:00
9 CCC Group5 2020-01-02 16:00:00
10 CCC Group5 2020-01-02 17:00:00
11 CCC Group5 2020-01-02 18:00:00
12 CCC Group5 2020-01-02 19:00:00
13 DDD Group1 2020-01-05 03:00:00
14 DDD Group2 2020-01-05 04:00:00
15 DDD Group2 2020-01-05 05:00:00
16 DDD Group2 2020-01-05 06:00:00
17 DDD Group2 2020-01-05 07:00:00
18 DDD Group2 2020-01-05 08:00:00
print(df2)
UserID StartTime EndTime
1 AAA 2020-01-01 00:53:00 2020-01-01 01:01:00
2 AAA 2020-01-01 01:01:00 2020-01-01 01:57:00
3 AAA 2020-01-01 01:57:00 2020-01-01 02:44:00
4 BBB 2020-01-02 12:03:00 2020-01-02 13:03:00
5 BBB 2020-01-02 13:03:00 2020-01-02 13:56:00
6 BBB 2020-01-02 13:56:00 2020-01-02 14:26:00
7 BBB 2020-01-02 14:26:00 2020-01-02 15:23:00
8 CCC 2020-01-02 15:12:00 2020-01-02 15:29:00
9 CCC 2020-01-02 15:29:00 2020-01-02 15:58:00
10 DDD 2020-01-05 03:00:00 2020-01-05 03:12:00
11 DDD 2020-01-05 03:12:00 2020-01-05 03:57:00
df1 中的时间戳列四舍五入到每小时,而 df2 中有开始时间戳和结束时间戳(两者都是细粒度的,四舍五入到分钟)。 df1 中有一些条目是不正确的,因为它们没有在相应的时间出现在 df2 中。
例如,UserID 为 CCC 的最后一个 EndTime 时间戳在 df2 中是 2020-01-02 15:58:00,但在 df1 中,CCC 仍然出现在 2020-01-02 17:00:00、2020-01-02 18:00:00 和 2020-01-02 19:00:00;UserID 为 DDD 的情况与此类似。
我想做什么
如果某个 UserID 在 df1 中的条目,其 df1$Time 时间戳比该用户在 df2 中最后一个 df2$EndTime 时间戳晚 60 分钟或以上(>= 60 分钟),我希望将该条目的 df1$Value 更改为“NoGroup”。
以下是预期结果的说明性示例:-
UserID Value Time
1 AAA Group1 2020-01-01 01:00:00
2 AAA Group1 2020-01-01 02:00:00
3 AAA Group2 2020-01-01 03:00:00
4 BBB Group3 2020-01-02 12:00:00
5 BBB Group3 2020-01-02 13:00:00
6 BBB Group1 2020-01-02 14:00:00
7 BBB Group2 2020-01-02 15:00:00
8 CCC Group4 2020-01-02 15:00:00
9 CCC Group5 2020-01-02 16:00:00
10 CCC NoGroup 2020-01-02 17:00:00
11 CCC NoGroup 2020-01-02 18:00:00
12 CCC NoGroup 2020-01-02 19:00:00
13 DDD Group1 2020-01-05 03:00:00
14 DDD Group2 2020-01-05 04:00:00
15 DDD NoGroup 2020-01-05 05:00:00
16 DDD NoGroup 2020-01-05 06:00:00
17 DDD NoGroup 2020-01-05 07:00:00
18 DDD NoGroup 2020-01-05 08:00:00
一如既往地感谢任何指点:)
使用 dplyr:
# Relabel df1 rows whose Time lies 60 or more minutes after the user's last
# EndTime in df2; all other rows keep their Value.
df1 %>%
  left_join(
    # Collapse df2 to one row per UserID holding only the latest EndTime, so
    # the join can never duplicate df1 rows (ties on max(EndTime) would do so
    # with filter()) and no StartTime column is carried along.
    df2 %>%
      group_by(UserID) %>%
      summarise(LastEnd = max(EndTime), .groups = "drop"),
    by = "UserID"
  ) %>%
  mutate(
    # Pin the units explicitly: `Time - EndTime` lets R pick secs, mins, hours
    # or days from the data, so a bare `>= 60` changes meaning between inputs.
    Value = if_else(
      difftime(Time, LastEnd, units = "mins") >= 60,
      "NoGroup",
      Value,
      missing = Value # users absent from df2 keep their original Value
    )
  ) %>%
  # Drop the helper column by name rather than by position.
  select(-LastEnd)
首先,将 df2 中每个 UserID 的最后一个 EndTime 连接(join)到 df1;接着,检查每个 Time 是否比该 EndTime 晚 60 分钟或以上,并相应地更改 Value;最后,删除 join 期间添加的列。
# df1: 18 rows of UserID / Value / Time. Time is POSIXct (tzone "UTC") stored
# as epoch seconds; in this sample every value falls exactly on the hour.
df1<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "CCC", "CCC", "CCC", "DDD", "DDD",
"DDD", "DDD", "DDD", "DDD"), Value = c("Group1", "Group1", "Group2",
"Group3", "Group3", "Group1", "Group2", "Group4", "Group5", "Group5",
"Group5", "Group5", "Group1", "Group2", "Group2", "Group2", "Group2",
"Group2"), Time = structure(c(1577840400, 1577844000, 1577847600,
1577966400, 1577970000, 1577973600, 1577977200, 1577977200, 1577980800,
1577984400, 1577988000, 1577991600, 1578193200, 1578196800, 1578200400,
1578204000, 1578207600, 1578211200), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -18L), class = "data.frame")
# df2: 11 rows of UserID / StartTime / EndTime, both POSIXct (tzone "UTC").
# Within each user the rows are consecutive intervals: each StartTime equals
# the previous row's EndTime.
df2<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "DDD", "DDD"), StartTime = structure(c(1577839980,
1577840460, 1577843820, 1577966580, 1577970180, 1577973360, 1577975160,
1577977920, 1577978940, 1578193200, 1578193920), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), EndTime = structure(c(1577840460,
1577843820, 1577846640, 1577970180, 1577973360, 1577975160, 1577978580,
1577978940, 1577980680, 1578193920, 1578196620), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -11L), class = "data.frame")
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
# Latest EndTime recorded for each user in df2 (one row per UserID).
# Summarising before the join keeps the later join one-to-many instead of
# pairing every df1 row with every df2 row of the same user.
last_ends <-
  df2 %>%
  as_tibble() %>%
  group_by(UserID) %>%
  summarise(last_end = max(EndTime), .groups = "drop")
last_ends
#> # A tibble: 4 x 2
#>   UserID last_end
#>   <chr>  <dttm>
#> 1 AAA    2020-01-01 02:44:00
#> 2 BBB    2020-01-02 15:23:00
#> 3 CCC    2020-01-02 15:58:00
#> 4 DDD    2020-01-05 03:57:00

# Decide row by row (not per user): a df1 row becomes "NoGroup" only when its
# own Time is 60 or more minutes after that user's last EndTime.
df1 %>%
  as_tibble() %>%
  left_join(last_ends, by = "UserID") %>%
  mutate(
    # Explicit units keep the threshold in minutes whatever the data range.
    Value = if_else(
      difftime(Time, last_end, units = "mins") >= 60,
      "NoGroup",
      Value,
      missing = Value # users absent from df2 keep their original Value
    )
  ) %>%
  select(-last_end)
#> # A tibble: 18 x 3
#>    UserID Value   Time
#>    <chr>  <chr>   <dttm>
#>  1 AAA    Group1  2020-01-01 01:00:00
#>  2 AAA    Group1  2020-01-01 02:00:00
#>  3 AAA    Group2  2020-01-01 03:00:00
#>  4 BBB    Group3  2020-01-02 12:00:00
#>  5 BBB    Group3  2020-01-02 13:00:00
#>  6 BBB    Group1  2020-01-02 14:00:00
#>  7 BBB    Group2  2020-01-02 15:00:00
#>  8 CCC    Group4  2020-01-02 15:00:00
#>  9 CCC    Group5  2020-01-02 16:00:00
#> 10 CCC    NoGroup 2020-01-02 17:00:00
#> 11 CCC    NoGroup 2020-01-02 18:00:00
#> 12 CCC    NoGroup 2020-01-02 19:00:00
#> 13 DDD    Group1  2020-01-05 03:00:00
#> 14 DDD    Group2  2020-01-05 04:00:00
#> 15 DDD    NoGroup 2020-01-05 05:00:00
#> 16 DDD    NoGroup 2020-01-05 06:00:00
#> 17 DDD    NoGroup 2020-01-05 07:00:00
#> 18 DDD    NoGroup 2020-01-05 08:00:00
由 reprex 包 (v2.0.0) 于 2021-09-17 创建
我正在尝试以有条件的方式更正数据集中的一些错误条目。我需要按组执行此操作,条件基于 2 个不同数据集的 2 个时间戳之间的差异。
以下是我正在处理的数据类型的一些示例:-
# df1: 18 rows of UserID / Value / Time. Time is POSIXct (tzone "UTC") stored
# as epoch seconds; in this sample every value falls exactly on the hour.
df1<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "CCC", "CCC", "CCC", "DDD", "DDD",
"DDD", "DDD", "DDD", "DDD"), Value = c("Group1", "Group1", "Group2",
"Group3", "Group3", "Group1", "Group2", "Group4", "Group5", "Group5",
"Group5", "Group5", "Group1", "Group2", "Group2", "Group2", "Group2",
"Group2"), Time = structure(c(1577840400, 1577844000, 1577847600,
1577966400, 1577970000, 1577973600, 1577977200, 1577977200, 1577980800,
1577984400, 1577988000, 1577991600, 1578193200, 1578196800, 1578200400,
1578204000, 1578207600, 1578211200), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -18L), class = "data.frame")
# df2: 11 rows of UserID / StartTime / EndTime, both POSIXct (tzone "UTC").
# Within each user the rows are consecutive intervals: each StartTime equals
# the previous row's EndTime.
df2<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "DDD", "DDD"), StartTime = structure(c(1577839980,
1577840460, 1577843820, 1577966580, 1577970180, 1577973360, 1577975160,
1577977920, 1577978940, 1578193200, 1578193920), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), EndTime = structure(c(1577840460,
1577843820, 1577846640, 1577970180, 1577973360, 1577975160, 1577978580,
1577978940, 1577980680, 1578193920, 1578196620), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -11L), class = "data.frame")
两个数据集如下所示:-
print(df1)
UserID Value Time
1 AAA Group1 2020-01-01 01:00:00
2 AAA Group1 2020-01-01 02:00:00
3 AAA Group2 2020-01-01 03:00:00
4 BBB Group3 2020-01-02 12:00:00
5 BBB Group3 2020-01-02 13:00:00
6 BBB Group1 2020-01-02 14:00:00
7 BBB Group2 2020-01-02 15:00:00
8 CCC Group4 2020-01-02 15:00:00
9 CCC Group5 2020-01-02 16:00:00
10 CCC Group5 2020-01-02 17:00:00
11 CCC Group5 2020-01-02 18:00:00
12 CCC Group5 2020-01-02 19:00:00
13 DDD Group1 2020-01-05 03:00:00
14 DDD Group2 2020-01-05 04:00:00
15 DDD Group2 2020-01-05 05:00:00
16 DDD Group2 2020-01-05 06:00:00
17 DDD Group2 2020-01-05 07:00:00
18 DDD Group2 2020-01-05 08:00:00
print(df2)
UserID StartTime EndTime
1 AAA 2020-01-01 00:53:00 2020-01-01 01:01:00
2 AAA 2020-01-01 01:01:00 2020-01-01 01:57:00
3 AAA 2020-01-01 01:57:00 2020-01-01 02:44:00
4 BBB 2020-01-02 12:03:00 2020-01-02 13:03:00
5 BBB 2020-01-02 13:03:00 2020-01-02 13:56:00
6 BBB 2020-01-02 13:56:00 2020-01-02 14:26:00
7 BBB 2020-01-02 14:26:00 2020-01-02 15:23:00
8 CCC 2020-01-02 15:12:00 2020-01-02 15:29:00
9 CCC 2020-01-02 15:29:00 2020-01-02 15:58:00
10 DDD 2020-01-05 03:00:00 2020-01-05 03:12:00
11 DDD 2020-01-05 03:12:00 2020-01-05 03:57:00
df1 中的时间戳列四舍五入到每小时,而 df2 中有开始时间戳和结束时间戳(两者都是细粒度的,四舍五入到分钟)。 df1 中有一些条目是不正确的,因为它们没有在相应的时间出现在 df2 中。
例如,UserID 为 CCC 的最后一个 EndTime 时间戳在 df2 中是 2020-01-02 15:58:00,但在 df1 中,CCC 仍然出现在 2020-01-02 17:00:00、2020-01-02 18:00:00 和 2020-01-02 19:00:00;UserID 为 DDD 的情况与此类似。
我想做什么
如果某个 UserID 在 df1 中的条目,其 df1$Time 时间戳比该用户在 df2 中最后一个 df2$EndTime 时间戳晚 60 分钟或以上(>= 60 分钟),我希望将该条目的 df1$Value 更改为“NoGroup”。
以下是预期结果的说明性示例:-
UserID Value Time
1 AAA Group1 2020-01-01 01:00:00
2 AAA Group1 2020-01-01 02:00:00
3 AAA Group2 2020-01-01 03:00:00
4 BBB Group3 2020-01-02 12:00:00
5 BBB Group3 2020-01-02 13:00:00
6 BBB Group1 2020-01-02 14:00:00
7 BBB Group2 2020-01-02 15:00:00
8 CCC Group4 2020-01-02 15:00:00
9 CCC Group5 2020-01-02 16:00:00
10 CCC NoGroup 2020-01-02 17:00:00
11 CCC NoGroup 2020-01-02 18:00:00
12 CCC NoGroup 2020-01-02 19:00:00
13 DDD Group1 2020-01-05 03:00:00
14 DDD Group2 2020-01-05 04:00:00
15 DDD NoGroup 2020-01-05 05:00:00
16 DDD NoGroup 2020-01-05 06:00:00
17 DDD NoGroup 2020-01-05 07:00:00
18 DDD NoGroup 2020-01-05 08:00:00
一如既往地感谢任何指点:)
使用 dplyr:
# Relabel df1 rows whose Time lies 60 or more minutes after the user's last
# EndTime in df2; all other rows keep their Value.
df1 %>%
  left_join(
    # Collapse df2 to one row per UserID holding only the latest EndTime, so
    # the join can never duplicate df1 rows (ties on max(EndTime) would do so
    # with filter()) and no StartTime column is carried along.
    df2 %>%
      group_by(UserID) %>%
      summarise(LastEnd = max(EndTime), .groups = "drop"),
    by = "UserID"
  ) %>%
  mutate(
    # Pin the units explicitly: `Time - EndTime` lets R pick secs, mins, hours
    # or days from the data, so a bare `>= 60` changes meaning between inputs.
    Value = if_else(
      difftime(Time, LastEnd, units = "mins") >= 60,
      "NoGroup",
      Value,
      missing = Value # users absent from df2 keep their original Value
    )
  ) %>%
  # Drop the helper column by name rather than by position.
  select(-LastEnd)
首先,将 df2 中每个 UserID 的最后一个 EndTime 连接(join)到 df1;接着,检查每个 Time 是否比该 EndTime 晚 60 分钟或以上,并相应地更改 Value;最后,删除 join 期间添加的列。
# df1: 18 rows of UserID / Value / Time. Time is POSIXct (tzone "UTC") stored
# as epoch seconds; in this sample every value falls exactly on the hour.
df1<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "CCC", "CCC", "CCC", "DDD", "DDD",
"DDD", "DDD", "DDD", "DDD"), Value = c("Group1", "Group1", "Group2",
"Group3", "Group3", "Group1", "Group2", "Group4", "Group5", "Group5",
"Group5", "Group5", "Group1", "Group2", "Group2", "Group2", "Group2",
"Group2"), Time = structure(c(1577840400, 1577844000, 1577847600,
1577966400, 1577970000, 1577973600, 1577977200, 1577977200, 1577980800,
1577984400, 1577988000, 1577991600, 1578193200, 1578196800, 1578200400,
1578204000, 1578207600, 1578211200), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -18L), class = "data.frame")
# df2: 11 rows of UserID / StartTime / EndTime, both POSIXct (tzone "UTC").
# Within each user the rows are consecutive intervals: each StartTime equals
# the previous row's EndTime.
df2<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "DDD", "DDD"), StartTime = structure(c(1577839980,
1577840460, 1577843820, 1577966580, 1577970180, 1577973360, 1577975160,
1577977920, 1577978940, 1578193200, 1578193920), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), EndTime = structure(c(1577840460,
1577843820, 1577846640, 1577970180, 1577973360, 1577975160, 1577978580,
1577978940, 1577980680, 1578193920, 1578196620), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -11L), class = "data.frame")
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
# Latest EndTime recorded for each user in df2 (one row per UserID).
# Summarising before the join keeps the later join one-to-many instead of
# pairing every df1 row with every df2 row of the same user.
last_ends <-
  df2 %>%
  as_tibble() %>%
  group_by(UserID) %>%
  summarise(last_end = max(EndTime), .groups = "drop")
last_ends
#> # A tibble: 4 x 2
#>   UserID last_end
#>   <chr>  <dttm>
#> 1 AAA    2020-01-01 02:44:00
#> 2 BBB    2020-01-02 15:23:00
#> 3 CCC    2020-01-02 15:58:00
#> 4 DDD    2020-01-05 03:57:00

# Decide row by row (not per user): a df1 row becomes "NoGroup" only when its
# own Time is 60 or more minutes after that user's last EndTime.
df1 %>%
  as_tibble() %>%
  left_join(last_ends, by = "UserID") %>%
  mutate(
    # Explicit units keep the threshold in minutes whatever the data range.
    Value = if_else(
      difftime(Time, last_end, units = "mins") >= 60,
      "NoGroup",
      Value,
      missing = Value # users absent from df2 keep their original Value
    )
  ) %>%
  select(-last_end)
#> # A tibble: 18 x 3
#>    UserID Value   Time
#>    <chr>  <chr>   <dttm>
#>  1 AAA    Group1  2020-01-01 01:00:00
#>  2 AAA    Group1  2020-01-01 02:00:00
#>  3 AAA    Group2  2020-01-01 03:00:00
#>  4 BBB    Group3  2020-01-02 12:00:00
#>  5 BBB    Group3  2020-01-02 13:00:00
#>  6 BBB    Group1  2020-01-02 14:00:00
#>  7 BBB    Group2  2020-01-02 15:00:00
#>  8 CCC    Group4  2020-01-02 15:00:00
#>  9 CCC    Group5  2020-01-02 16:00:00
#> 10 CCC    NoGroup 2020-01-02 17:00:00
#> 11 CCC    NoGroup 2020-01-02 18:00:00
#> 12 CCC    NoGroup 2020-01-02 19:00:00
#> 13 DDD    Group1  2020-01-05 03:00:00
#> 14 DDD    Group2  2020-01-05 04:00:00
#> 15 DDD    NoGroup 2020-01-05 05:00:00
#> 16 DDD    NoGroup 2020-01-05 06:00:00
#> 17 DDD    NoGroup 2020-01-05 07:00:00
#> 18 DDD    NoGroup 2020-01-05 08:00:00
由 reprex 包 (v2.0.0) 于 2021-09-17 创建