使用 R 编辑/合并因子值连续相同的行
Edit / merge rows if consecutive factor with R
我有这个样本
# Sample data (10 rows): two ids ("a", "b"), factor timestamps minTime /
# maxTime, an integer duration.minutes column that contains NAs, and an event
# factor with levels "enter", "exit", "stop" and "trip".
data <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L), .Label = c("a", "b"), class = "factor"), minTime = structure(c(1L,
2L, 3L, 4L, 5L, 7L, 6L, 8L, 9L, 10L), .Label = c("2014-06-06 07:39:50",
"2014-06-07 02:24:32", "2014-06-07 15:14:29", "2014-06-07 2:29",
"2014-06-08 5:40", "2014-06-18 17:54:42", "2014-06-18 2:45",
"2014-06-19 02:37:53", "2014-06-19 19:15", "2014-06-19 22:15"
), class = "factor"), maxTime = structure(c(1L, 3L, 4L, 2L, 5L,
6L, 7L, 8L, 9L, 10L), .Label = c("2014-06-07 01:41:31", "2014-06-07 10:01",
"2014-06-07 14:44:08", "2014-06-07 22:31:02", "2014-06-08 5:50",
"2014-06-18 2:50", "2014-06-19 01:49:05", "2014-06-19 18:51:36",
"2014-06-19 20:15", "2014-06-19 23:15"), class = "factor"), duration.minutes = c(NA,
740L, 437L, 452L, NA, NA, 474L, 974L, 4062L, 353L), event = structure(c(1L,
4L, 4L, 2L, 2L, 1L, 4L, 4L, 3L, 4L), .Label = c("enter", "exit",
"stop", "trip"), class = "factor")), .Names = c("id", "minTime",
"maxTime", "duration.minutes", "event"), class = "data.frame", row.names = c(NA,
-10L))
我想把每个 id 内连续出现的 "trip" 事件合并成一行。
id 为 a 和 b 的连续行程已合并:
- 编辑了 maxTime 以包含整个行程
- duration.minutes 也被编辑了。
例如,这个输出:
# Desired result, printed with dput(): 7 rows in which the two runs of
# consecutive "trip" rows have been collapsed into single rows
# (duration.minutes 1177 and 1448).
dput(output.wanted)
structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 1L, 2L), .Label = c("a",
"b"), class = "factor"), minTime = structure(c(5L, 6L, 7L, 2L,
1L, 3L, 4L), .Label = c("18.6.2014 17:54", "18.6.2014 2:45",
"19.6.2014 19:15", "19.6.2014 22:15", "6.6.2014 7:39", "7.6.2014 2:24",
"7.6.2014 2:29"), class = "factor"), maxTime = structure(c(5L,
7L, 6L, 1L, 2L, 3L, 4L), .Label = c("18.6.2014 2:50", "19.6.2014 18:51",
"19.6.2014 20:15", "19.6.2014 23:15", "7.6.2014 1:41", "7.6.2014 10:01",
"7.6.2014 22:31"), class = "factor"), duration.minutes = c(NA,
1177L, 452L, NA, 1448L, 4062L, 353L), event = structure(c(1L,
4L, 2L, 1L, 4L, 3L, 4L), .Label = c("enter", "exit", "stop",
"trip"), class = "factor")), .Names = c("id", "minTime", "maxTime",
"duration.minutes", "event"), class = "data.frame", row.names = c(NA,
-7L))
关键是:只要同一 id 的事件(trip 或 stop)是连续出现的,我就希望把它们合并;但如果某个 stop 或 trip 只单独出现一次,它应保持不变。
我一直在尝试用 group_by 和 mutate 来做到这一点,但我有点迷茫...
--
我找到了解决方案,但 stop 事件仍然存在一个小问题。
这是一个示例:
# Second example, printed with dput(): 12 rows for a single Ship value, with
# POSIXct minTime / maxTime, duration.minutes stored as a difftime in minutes,
# character event labels and a numeric dist.sailed column that is NA on every
# non-trip row.
> dput(total)
structure(list(Ship = c(205482000, 205482000, 205482000, 205482000,
205482000, 205482000, 205482000, 205482000, 205482000, 205482000,
205482000, 205482000), minTime = structure(c(1401570241, 1401969219,
1401981860, 1402052108, 1402768362, 1402772602, 1402841443, 1402855773,
1403056361, 1403278916, 1403290856, 1403367735), class = c("POSIXct",
"POSIXt")), maxTime = structure(c(1401966639, 1401980399, 1402051849,
1402056430, 1402772313, 1402839873, 1402852433, 1403052550, 1403276355,
1403289496, 1403367596, 1403371285), class = c("POSIXct", "POSIXt"
)), duration.minutes = structure(c(6607, 186, 1166, NA, NA, 1121,
183, 3280, 3667, 176, 1279, NA), class = "difftime", units = "mins"),
event = c("stop", "trip", "trip", "exit", "enter", "trip",
"trip", "stop", "stop", "trip", "trip", "exit"), dist.sailed = c(NA,
50254.2034817555, 349194.108518887, NA, NA, 347816.081064252,
50035.8859874946, NA, NA, 49982.687612038, 351978.737678528,
NA)), .Names = c("Ship", "minTime", "maxTime", "duration.minutes",
"event", "dist.sailed"), class = "data.frame", row.names = c(4L,
5L, 6L, 2L, 1L, 7L, 8L, 9L, 10L, 11L, 12L, 3L))
以下代码没有得到合并后的 stop 总持续时间,而只得到 NA:
# Attempt to merge runs of consecutive "trip" and "stop" rows for each Ship.
total <- total %>%
group_by(Ship) %>%
# rleid() gives every run of identical consecutive event values its own id.
mutate(new_id = data.table::rleid(event)) %>%
group_by(event, new_id, Ship) %>%
# Trip rows receive the summed duration of their run; every row receives the
# last maxTime of its run.
mutate(duration.minutes = ifelse(event == 'trip', sum(duration.minutes), duration.minutes), maxTime = tail(maxTime, 1))%>%
# Same step for stop rows. sum() is called without na.rm, and duration.minutes
# is a difftime in this data.
# NOTE(review): ifelse() does not keep the difftime class of its inputs, which
# may be related to the NA result described for this code -- confirm.
mutate(duration.minutes = ifelse(event == 'stop', sum(duration.minutes), duration.minutes), maxTime = tail(maxTime, 1))%>%
# dist.sailed is assigned twice in this call: first the per-run sum for trip
# rows, then the last value of the run.
mutate(dist.sailed = ifelse(event == 'trip', sum(dist.sailed), dist.sailed), dist.sailed = tail(dist.sailed, 1)) %>%
# Keep only the first row for each distinct duration.minutes value in a group.
filter(!duplicated(duration.minutes)) %>%
# NOTE(review): new_id is still a grouping variable at this point -- check
# that select() really removes it.
select(-new_id)
我在@Sotos 的代码中添加了mutate(duration.minutes = ifelse(event == 'stop', sum(duration.minutes), duration.minutes), maxTime = tail(maxTime, 1))%>%
如果您使用 dplyr 包来合并数据,就必须使用 summarise,而不是 mutate。例如:
# Collapse each event group to a single row with summarise().
# maxTime is a timestamp: sum() is not defined for date-times (and is not
# meaningful for factors), so the latest value is taken with max() instead.
# Durations can simply be added up.
# NOTE(review): max() needs maxTime to be a date-time (e.g. POSIXct) rather
# than a plain factor -- convert it first if necessary.
some.df %>%
  group_by(event) %>%
  summarise(
    max.maxTime = max(maxTime),
    sum.duration = sum(duration.minutes)
  )
mutate 会在您的 data.frame 中添加或更改一列,但结果的行数始终与原始行数相同;而 summarise 之后的行数等于数据集中的分组数。
希望这对您有所帮助。
这有点乱,
library(dplyr)
# Merge each run of consecutive "trip" rows per id into a single row.
data %>%
  group_by(id) %>%
  # rleid() gives every run of identical consecutive events within an id its
  # own run id.
  mutate(new_id = data.table::rleid(event)) %>%
  group_by(event, new_id, id) %>%
  mutate(
    # Trip rows get the total duration of their run; other events keep their
    # own value.
    duration.minutes = ifelse(event == "trip", sum(duration.minutes), duration.minutes),
    # Every row takes the last maxTime of its run (this also applies to
    # non-trip runs such as the two consecutive "exit" rows of id a).
    maxTime = tail(maxTime, 1)
  ) %>%
  # Keep the first row for each distinct duration within a run; a merged trip
  # run therefore collapses to one row.
  filter(!duplicated(duration.minutes)) %>%
  # Drop the grouping before select(): grouping columns are always kept by
  # select(), so without ungroup() the helper column new_id would remain.
  ungroup() %>%
  select(-new_id)
# Result (the helper column new_id is no longer part of the output):
#  id minTime maxTime duration.minutes event
# <fctr> <fctr> <fctr> <int> <fctr>
#1 a 2014-06-06 07:39:50 2014-06-07 01:41:31 NA enter
#2 a 2014-06-07 02:24:32 2014-06-07 22:31:02 1177 trip
#3 a 2014-06-07 2:29 2014-06-08 5:50 452 exit
#4 a 2014-06-08 5:40 2014-06-08 5:50 NA exit
#5 b 2014-06-18 2:45 2014-06-18 2:50 NA enter
#6 b 2014-06-18 17:54:42 2014-06-19 18:51:36 1448 trip
#7 b 2014-06-19 19:15 2014-06-19 20:15 4062 stop
#8 b 2014-06-19 22:15 2014-06-19 23:15 353 trip
如果我们把相同代码中的 trip 换成 stop 来运行,会得到以下结果:
#new_id Ship minTime maxTime duration.minutes event dist.sailed
# <int> <dbl> <time> <time> <dbl> <chr> <dbl>
#1 1 205482000 2014-06-01 00:04:01 2014-06-05 14:10:39 6607 stop NA
#2 2 205482000 2014-06-05 14:53:39 2014-06-06 13:50:49 186 trip 50254.20
#3 2 205482000 2014-06-05 18:24:20 2014-06-06 13:50:49 1166 trip 349194.11
#4 3 205482000 2014-06-06 13:55:08 2014-06-06 15:07:10 NA exit NA
#5 4 205482000 2014-06-14 20:52:42 2014-06-14 21:58:33 NA enter NA
#6 5 205482000 2014-06-14 22:03:22 2014-06-15 20:13:53 1121 trip 347816.08
#7 5 205482000 2014-06-15 17:10:43 2014-06-15 20:13:53 183 trip 50035.89
#8 6 205482000 2014-06-15 21:09:33 2014-06-20 17:59:15 6947 stop NA
#9 7 205482000 2014-06-20 18:41:56 2014-06-21 19:19:56 176 trip 49982.69
#10 7 205482000 2014-06-20 22:00:56 2014-06-21 19:19:56 1279 trip 351978.74
#11 8 205482000 2014-06-21 19:22:15 2014-06-21 20:21:25 NA exit NA
我有这个样本
# Sample data (10 rows): two ids ("a", "b"), factor timestamps minTime /
# maxTime, an integer duration.minutes column that contains NAs, and an event
# factor with levels "enter", "exit", "stop" and "trip".
data <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L), .Label = c("a", "b"), class = "factor"), minTime = structure(c(1L,
2L, 3L, 4L, 5L, 7L, 6L, 8L, 9L, 10L), .Label = c("2014-06-06 07:39:50",
"2014-06-07 02:24:32", "2014-06-07 15:14:29", "2014-06-07 2:29",
"2014-06-08 5:40", "2014-06-18 17:54:42", "2014-06-18 2:45",
"2014-06-19 02:37:53", "2014-06-19 19:15", "2014-06-19 22:15"
), class = "factor"), maxTime = structure(c(1L, 3L, 4L, 2L, 5L,
6L, 7L, 8L, 9L, 10L), .Label = c("2014-06-07 01:41:31", "2014-06-07 10:01",
"2014-06-07 14:44:08", "2014-06-07 22:31:02", "2014-06-08 5:50",
"2014-06-18 2:50", "2014-06-19 01:49:05", "2014-06-19 18:51:36",
"2014-06-19 20:15", "2014-06-19 23:15"), class = "factor"), duration.minutes = c(NA,
740L, 437L, 452L, NA, NA, 474L, 974L, 4062L, 353L), event = structure(c(1L,
4L, 4L, 2L, 2L, 1L, 4L, 4L, 3L, 4L), .Label = c("enter", "exit",
"stop", "trip"), class = "factor")), .Names = c("id", "minTime",
"maxTime", "duration.minutes", "event"), class = "data.frame", row.names = c(NA,
-10L))
我想把每个 id 内连续出现的 "trip" 事件合并成一行。
id 为 a 和 b 的连续行程已合并:
- 编辑了 maxTime 以包含整个行程
- duration.minutes 也被编辑了。
例如,这个输出:
# Desired result, printed with dput(): 7 rows in which the two runs of
# consecutive "trip" rows have been collapsed into single rows
# (duration.minutes 1177 and 1448).
dput(output.wanted)
structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 1L, 2L), .Label = c("a",
"b"), class = "factor"), minTime = structure(c(5L, 6L, 7L, 2L,
1L, 3L, 4L), .Label = c("18.6.2014 17:54", "18.6.2014 2:45",
"19.6.2014 19:15", "19.6.2014 22:15", "6.6.2014 7:39", "7.6.2014 2:24",
"7.6.2014 2:29"), class = "factor"), maxTime = structure(c(5L,
7L, 6L, 1L, 2L, 3L, 4L), .Label = c("18.6.2014 2:50", "19.6.2014 18:51",
"19.6.2014 20:15", "19.6.2014 23:15", "7.6.2014 1:41", "7.6.2014 10:01",
"7.6.2014 22:31"), class = "factor"), duration.minutes = c(NA,
1177L, 452L, NA, 1448L, 4062L, 353L), event = structure(c(1L,
4L, 2L, 1L, 4L, 3L, 4L), .Label = c("enter", "exit", "stop",
"trip"), class = "factor")), .Names = c("id", "minTime", "maxTime",
"duration.minutes", "event"), class = "data.frame", row.names = c(NA,
-7L))
关键是:只要同一 id 的事件(trip 或 stop)是连续出现的,我就希望把它们合并;但如果某个 stop 或 trip 只单独出现一次,它应保持不变。
我一直在尝试用 group_by 和 mutate 来做到这一点,但我有点迷茫...
--
我找到了解决方案,但 stop 事件仍然存在一个小问题。
这是一个示例:
# Second example, printed with dput(): 12 rows for a single Ship value, with
# POSIXct minTime / maxTime, duration.minutes stored as a difftime in minutes,
# character event labels and a numeric dist.sailed column that is NA on every
# non-trip row.
> dput(total)
structure(list(Ship = c(205482000, 205482000, 205482000, 205482000,
205482000, 205482000, 205482000, 205482000, 205482000, 205482000,
205482000, 205482000), minTime = structure(c(1401570241, 1401969219,
1401981860, 1402052108, 1402768362, 1402772602, 1402841443, 1402855773,
1403056361, 1403278916, 1403290856, 1403367735), class = c("POSIXct",
"POSIXt")), maxTime = structure(c(1401966639, 1401980399, 1402051849,
1402056430, 1402772313, 1402839873, 1402852433, 1403052550, 1403276355,
1403289496, 1403367596, 1403371285), class = c("POSIXct", "POSIXt"
)), duration.minutes = structure(c(6607, 186, 1166, NA, NA, 1121,
183, 3280, 3667, 176, 1279, NA), class = "difftime", units = "mins"),
event = c("stop", "trip", "trip", "exit", "enter", "trip",
"trip", "stop", "stop", "trip", "trip", "exit"), dist.sailed = c(NA,
50254.2034817555, 349194.108518887, NA, NA, 347816.081064252,
50035.8859874946, NA, NA, 49982.687612038, 351978.737678528,
NA)), .Names = c("Ship", "minTime", "maxTime", "duration.minutes",
"event", "dist.sailed"), class = "data.frame", row.names = c(4L,
5L, 6L, 2L, 1L, 7L, 8L, 9L, 10L, 11L, 12L, 3L))
以下代码没有得到合并后的 stop 总持续时间,而只得到 NA:
# Attempt to merge runs of consecutive "trip" and "stop" rows for each Ship.
total <- total %>%
group_by(Ship) %>%
# rleid() gives every run of identical consecutive event values its own id.
mutate(new_id = data.table::rleid(event)) %>%
group_by(event, new_id, Ship) %>%
# Trip rows receive the summed duration of their run; every row receives the
# last maxTime of its run.
mutate(duration.minutes = ifelse(event == 'trip', sum(duration.minutes), duration.minutes), maxTime = tail(maxTime, 1))%>%
# Same step for stop rows. sum() is called without na.rm, and duration.minutes
# is a difftime in this data.
# NOTE(review): ifelse() does not keep the difftime class of its inputs, which
# may be related to the NA result described for this code -- confirm.
mutate(duration.minutes = ifelse(event == 'stop', sum(duration.minutes), duration.minutes), maxTime = tail(maxTime, 1))%>%
# dist.sailed is assigned twice in this call: first the per-run sum for trip
# rows, then the last value of the run.
mutate(dist.sailed = ifelse(event == 'trip', sum(dist.sailed), dist.sailed), dist.sailed = tail(dist.sailed, 1)) %>%
# Keep only the first row for each distinct duration.minutes value in a group.
filter(!duplicated(duration.minutes)) %>%
# NOTE(review): new_id is still a grouping variable at this point -- check
# that select() really removes it.
select(-new_id)
我在@Sotos 的代码中添加了mutate(duration.minutes = ifelse(event == 'stop', sum(duration.minutes), duration.minutes), maxTime = tail(maxTime, 1))%>%
如果您使用 dplyr 包来合并数据,就必须使用 summarise,而不是 mutate。例如:
# Collapse each event group to a single row with summarise().
# maxTime is a timestamp: sum() is not defined for date-times (and is not
# meaningful for factors), so the latest value is taken with max() instead.
# Durations can simply be added up.
# NOTE(review): max() needs maxTime to be a date-time (e.g. POSIXct) rather
# than a plain factor -- convert it first if necessary.
some.df %>%
  group_by(event) %>%
  summarise(
    max.maxTime = max(maxTime),
    sum.duration = sum(duration.minutes)
  )
mutate 会在您的 data.frame 中添加或更改一列,但结果的行数始终与原始行数相同;而 summarise 之后的行数等于数据集中的分组数。
希望这对您有所帮助。
这有点乱,
library(dplyr)
# Merge each run of consecutive "trip" rows per id into a single row.
data %>%
  group_by(id) %>%
  # rleid() gives every run of identical consecutive events within an id its
  # own run id.
  mutate(new_id = data.table::rleid(event)) %>%
  group_by(event, new_id, id) %>%
  mutate(
    # Trip rows get the total duration of their run; other events keep their
    # own value.
    duration.minutes = ifelse(event == "trip", sum(duration.minutes), duration.minutes),
    # Every row takes the last maxTime of its run (this also applies to
    # non-trip runs such as the two consecutive "exit" rows of id a).
    maxTime = tail(maxTime, 1)
  ) %>%
  # Keep the first row for each distinct duration within a run; a merged trip
  # run therefore collapses to one row.
  filter(!duplicated(duration.minutes)) %>%
  # Drop the grouping before select(): grouping columns are always kept by
  # select(), so without ungroup() the helper column new_id would remain.
  ungroup() %>%
  select(-new_id)
# Result (the helper column new_id is no longer part of the output):
#  id minTime maxTime duration.minutes event
# <fctr> <fctr> <fctr> <int> <fctr>
#1 a 2014-06-06 07:39:50 2014-06-07 01:41:31 NA enter
#2 a 2014-06-07 02:24:32 2014-06-07 22:31:02 1177 trip
#3 a 2014-06-07 2:29 2014-06-08 5:50 452 exit
#4 a 2014-06-08 5:40 2014-06-08 5:50 NA exit
#5 b 2014-06-18 2:45 2014-06-18 2:50 NA enter
#6 b 2014-06-18 17:54:42 2014-06-19 18:51:36 1448 trip
#7 b 2014-06-19 19:15 2014-06-19 20:15 4062 stop
#8 b 2014-06-19 22:15 2014-06-19 23:15 353 trip
如果我们把相同代码中的 trip 换成 stop 来运行,会得到以下结果:
#new_id Ship minTime maxTime duration.minutes event dist.sailed
# <int> <dbl> <time> <time> <dbl> <chr> <dbl>
#1 1 205482000 2014-06-01 00:04:01 2014-06-05 14:10:39 6607 stop NA
#2 2 205482000 2014-06-05 14:53:39 2014-06-06 13:50:49 186 trip 50254.20
#3 2 205482000 2014-06-05 18:24:20 2014-06-06 13:50:49 1166 trip 349194.11
#4 3 205482000 2014-06-06 13:55:08 2014-06-06 15:07:10 NA exit NA
#5 4 205482000 2014-06-14 20:52:42 2014-06-14 21:58:33 NA enter NA
#6 5 205482000 2014-06-14 22:03:22 2014-06-15 20:13:53 1121 trip 347816.08
#7 5 205482000 2014-06-15 17:10:43 2014-06-15 20:13:53 183 trip 50035.89
#8 6 205482000 2014-06-15 21:09:33 2014-06-20 17:59:15 6947 stop NA
#9 7 205482000 2014-06-20 18:41:56 2014-06-21 19:19:56 176 trip 49982.69
#10 7 205482000 2014-06-20 22:00:56 2014-06-21 19:19:56 1279 trip 351978.74
#11 8 205482000 2014-06-21 19:22:15 2014-06-21 20:21:25 NA exit NA