在 R 中按指定时间长度扩展时间序列的最快方法
Fastest way to expand a time series by specified time lengths in R
我有来自两种类型 activity 记录器的数据。第一个记录器记录记录器处于潮湿或干燥状态的秒数(参见 act1)。第二个记录器每 3 秒采样 wet/dry,并记录每 10 分钟湿润的样本总数。给定 3 秒的采样间隔,在每 10 分钟周期结束时记录的值范围从 0(始终干燥)到 200(始终潮湿),参见 act2。
我想使用最有效的方法对第一个记录器的数据进行整形和重新采样,以复制第二个记录器的格式。
我在此处提供的示例使用了数据的子样本(6 行),但我的实际数据集包含超过一年的观察结果(40,000 多行),目前代码已运行 3 天仍未完成。
# Example data from logger type 1: each row is a run of seconds (Activity)
# during which the logger stayed in the state given by Wet, time-stamped
# with the run's start (Date, POSIXct).
act1 <- structure(list(
Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
class = c("POSIXct", "POSIXt"), tzone = ""),
Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
Wet = c("wet", "dry", "wet", "dry", "wet", "dry")),
row.names = c("2", "3", "4", "5", "6", "7"),
class = "data.frame")
# Example data from logger type 2 (wet-sample counts per 10 minutes).
# NOTE(review): this literal is byte-identical to act1 above, so it does not
# actually illustrate the 0..200-per-10-minute format described in the text —
# presumably a copy-paste slip in the example.
act2 <- structure(list(
Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
class = c("POSIXct", "POSIXt"), tzone = ""),
Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
Wet = c("wet", "dry", "wet", "dry", "wet", "dry")), row.names = c("2", "3", "4", "5", "6", "7"),
class = "data.frame")
使用 lapply,我基于 Activity 列中指定的间隔,扩展了 act1 数据框中的 Date 列(POSIXct 格式),并在 Wet 列中保留了对相应状态的引用。
# Expand each run-length row of act1 into one row per second: a row whose
# Activity is N becomes N consecutive rows, one second apart, sharing the
# same Valid/Wet values.
# NOTE: this row-by-row lapply + rbind is the slow path; the answer below
# replaces it with vectorized rep()/sequence().
act1 <- lapply(seq_len(nrow(act1)), function(x){
data.frame(
Valid = rep(act1[x, 1], act1[x, 3]),
# BUG FIX: the original format string "%Y-%m-%d%H:%M:%S" was missing the
# space between the date and time fields; it only parsed because some
# strptime implementations skip whitespace before numeric conversions.
Date = strptime(act1[x, 2], format = "%Y-%m-%d %H:%M:%S")+(seq_len(act1[x, 3])-1),
Activity = rep(1, act1[x, 3]),
Wet = rep(act1[x, 4], act1[x, 3])
)})
# do.call(rbind, ...) stacks the per-row expansions back into one data frame.
act1 <- as.data.frame(do.call(rbind, act1))
然后我使用 dplyr 和 lubridate 将每个观察值分组到 3 秒的分箱中,并判断每个分箱中的最后一个观察值是否为湿。随后我将得到的观测值按 10 分钟分箱,并汇总其中有多少样本是湿的。
library(dplyr)
library(lubridate)
# Downsample the per-second rows to one reading every 3 s (keeping the last
# observation in each 3-second bin, mimicking logger 2's sampling), then
# count the wet samples inside each 10-minute window.
act1 <- act1 %>%
# floor each timestamp to the start of its 3-second bin
mutate(interval = floor_date(Date, unit="minutes") + seconds(floor(second(Date)/3)*3)) %>%
group_by(interval) %>%
# a bin counts as "wet" iff its latest reading is wet
summarise(Valid = "ok",
Wet = Wet[which(Date==max(Date))]=="wet") %>%
# build 10-minute bins, shifted so they are anchored to the first interval
mutate(int10 = floor_date(interval, unit="hour") +
minutes(floor(minute(interval)/10)*10) +
(min(interval) - min(floor_date(interval, unit="hour") + minutes(floor(minute(interval)/10)*10)))) %>%
group_by(int10) %>%
# Activity = number of wet 3-second samples in the 10-minute window
summarise(Valid = "ok",
Activity = sum(Wet)) %>%
rename(Date = int10) %>%
select(Valid,Date,Activity)
我在此处提供的示例使用了原始数据集的一个子集(6 行),但我的实际数据集包含超过一年的观察结果(40,000 多行),目前代码已运行 3 天仍未完成!
矢量化、rep
、cut
和 seq
应该在您完成此任务的工具箱中。
您的第一个涉及 lapply
的主语句可以缩短——您只是想重复这些行。例如,act1[c(1,1), ]
会将 act1 的第一行返回两次。在您的循环中,您访问了 act1[x, 3]
四次。下面这一行将按所需的次数复制我们想要的行:
# Duplicate each row of act1_copy once per second of its Activity run.
act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), times = act1_copy$Activity), ]
> nrow(act1_copy)
[1] 6
> seq_len(nrow(act1_copy))
[1] 1 2 3 4 5 6
> act1_copy[['Activity']]
[1] 78 6 39 9 15 9
> rep(seq_len(nrow(act1_copy)), act1_copy[['Activity']])
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[60] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[119] 3 3 3 3 3 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6
# or if you're into external packages, this is a lot nicer looking:
# tidyr::uncount() repeats each row `Activity` times (same as the rep() above).
tidyr::uncount(act1_copy, weights = Activity)
下一步是更正秒数并将 Activity
重做为 1
。
# Shift each duplicated row's timestamp by its 0-based offset within the
# run (sequence() yields 1..N per original row), then mark every expanded
# row as a single second of activity.
act1a$Date <- act1a$Date + (sequence(act1_copy$Activity) - 1L)
act1a$Activity <- 1L
现在,让旧记录器数据与新记录器数据匹配(即每 3 秒一条记录)的最后一步是按 3 秒分组。请务必注意,每 3 秒一条相当于每 3 行一条。因此,取决于您对 act$Date
完整性(每秒恰有一条记录)的信任程度,我们可以采用以下两种方法之一:
# Keep the first row of each 3-second bin; cut() bins by actual clock time,
# so this stays correct even if some seconds are missing:
act1b <- act1a[!duplicated(cut(act1a$Date, '3 sec', labels = F)), ]
# or if you're sure there is one reading per second, you can just do once every three rows
act1b <- act1a[seq(from = 1, to = nrow(act1a), by = 3), ]
# what cut() looks like for reference
cut(act1a$Date, '3 sec', labels = F)
[1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10 11 11 11 12 12 12 13 13 13
[40] 14 14 14 15 15 15 16 16 16 17 17 17 18 18 18 19 19 19 20 20 20 21 21 21 22 22 22 23 23 23 24 24 24 25 25 25 26 26 26
[79] 27 27 27 28 28 28 29 29 29 30 30 30 31 31 31 32 32 32 33 33 33 34 34 34 35 35 35 36 36 36 37 37 37 38 38 38 39 39 39
[118] 40 40 40 41 41 41 42 42 42 43 43 43 44 44 44 45 45 45 46 46 46 47 47 47 48 48 48 49 49 49 50 50 50 51 51 51 52 52 52
#or with labels:
cut(act1a$Date, '3 sec')
[1] 2015-03-05 13:11:33 2015-03-05 13:11:33 2015-03-05 13:11:33 2015-03-05 13:11:36 2015-03-05 13:11:36
[6] 2015-03-05 13:11:36 2015-03-05 13:11:39 2015-03-05 13:11:39 2015-03-05 13:11:39 2015-03-05 13:11:42
[11] 2015-03-05 13:11:42 2015-03-05 13:11:42 2015-03-05 13:11:45 2015-03-05 13:11:45 2015-03-05 13:11:45
[16] 2015-03-05 13:11:48 2015-03-05 13:11:48 2015-03-05 13:11:48 2015-03-05 13:11:51 2015-03-05 13:11:51
[21] 2015-03-05 13:11:51 2015-03-05 13:11:54 2015-03-05 13:11:54 2015-03-05 13:11:54 2015-03-05 13:11:57
# truncated for brevity.
最后一步是聚合数据。就像最后一步一样,我们可以使用 cut()
来使用时间分组,或者我们可以再次使用 rep(seq())
来使我们的分组更快一些。
# Option 1: bin by clock time with cut() (robust to gaps in the data):
aggregate(act1b$Wet, list(Date = cut(act1b$Date, '10 min')), FUN = function(x) sum(x == 'wet'))
# Option 2: if you know there is one reading per second, bin positionally —
# 10 * 60 / 3 = 200 rows per 10-minute window, each window labelled with
# its first timestamp:
aggregate(act1b$Wet,
list(Date = rep(act1b$Date[seq(from = 1, to = nrow(act1b), by = 10 * 60 / 3)]
, each = 10 * 60 / 3
, length.out = nrow(act1b)))
, FUN = function(x) sum(x == 'wet'))
把它们放在一起,你得到:
# Putting it all together (base R only):
# 1) expand the run-length rows to one row per second,
# 2) keep every 3rd row (one reading per 3 s),
# 3) count wet readings per 10-minute window (200 rows each).
act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), times = act1_copy$Activity), ]
act1a$Date <- act1a$Date + (sequence(act1_copy$Activity) - 1L)
act1a$Activity <- 1L
act1b <- act1a[seq.int(1L, nrow(act1a), by = 3L), ]
aggregate(
  act1b$Wet,
  list(Date = rep(act1b$Date[seq.int(1L, nrow(act1b), by = 10 * 60 / 3)],
                  each = 10 * 60 / 3,
                  length.out = nrow(act1b))),
  FUN = function(x) sum(x == 'wet')
)
library(tidyr)
library(dplyr)
# Same pipeline in tidyverse style: uncount() expands the rows, slice()
# keeps every 3rd reading, and group_by() labels each 200-row (10-minute)
# window with its first timestamp.
tidyr::uncount(act1_copy, weights = Activity)%>%
# per-row second offsets; relies on uncount() preserving row order
mutate(Activity = 1L
, Date = Date + sequence(act1_copy[['Activity']]) - 1)%>%
slice(seq(from = 1, to = nrow(.), by = 3))%>%
group_by(Date =rep(Date[seq(from = 1, to = nrow(.), by = 10 * 60 / 3)]
, each = 10 * 60 / 3
, length.out = nrow(.)))%>%
summarize(Wet = sum(Wet == 'wet'))
# A tibble: 1 x 2
Date Wet
<dttm> <int>
1 2015-03-05 13:11:33 44
下面是一些性能测试和代码——请注意,我在 data.table
版本的 10 分钟汇总上遇到了问题,所以它的输出与其他方法不完全相同:
Unit: milliseconds
expr min lq mean median uq max neval
cole_base 2.044400 2.174451 2.328061 2.253251 2.340801 6.424400 100
cole_dplyr 3.152901 3.359501 3.502880 3.428101 3.515302 8.248401 100
cole_dt 3.308601 3.541151 3.884475 3.698201 3.796652 13.155701 100
original_all 32.626601 33.061152 34.531462 33.392151 34.237601 50.499501 100
library(microbenchmark)
library(data.table)
library(dplyr)
library(tidyr)
library(lubridate)
# Rebuild the 6-row example data used by all four timed approaches.
act1_copy <- structure(list(
Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
class = c("POSIXct", "POSIXt"), tzone = ""),
Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
Wet = c("wet", "dry", "wet", "dry", "wet", "dry")),
row.names = c("2", "3", "4", "5", "6", "7"),
class = "data.frame")
# data.table version of the same data (copy()-ed inside the benchmark so
# the timed expression does not mutate it).
dt <- as.data.table(act1_copy)
# Benchmark: vectorized base R vs dplyr vs data.table vs the original
# lapply + dplyr pipeline. Each expression expands act1_copy to one row per
# second, downsamples to every 3rd second, and counts wet readings per
# 10-minute window.
microbenchmark( cole_base = {
act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), act1_copy[['Activity']]), ]
act1a[['Date']] <- act1a[['Date']] + sequence(act1_copy[['Activity']]) - 1
# if you know there is definitely one reading per second
# act1a[['Date']] <- act1a[['Date']] + seq_len(nrow(act1a)) - 1
act1a[['Activity']] <- 1L
# act1b <- act1a[!duplicated(cut(act1a$Date, '3 sec', labels = F)), ]
# or if you're sure there is one reading per second, you can just do once every three rows
act1b <- act1a[seq(from = 1, to = nrow(act1a), by = 3), ]
# aggregate(act1b$Wet, list(Date = cut(act1b$Date, '10 min')), FUN = function(x) sum(x == 'wet'))
# or if you know there is one reading per second (200 rows per 10 minutes):
aggregate(act1b$Wet,
list(Date = rep(act1b$Date[seq(from = 1, to = nrow(act1b), by = 10 * 60 / 3)]
, each = 10 * 60 / 3
, length.out = nrow(act1b)))
, FUN = function(x) sum(x == 'wet'))
}
, cole_dplyr = {
tidyr::uncount(act1_copy, weights = Activity)%>%
mutate(Activity = 1L
, Date = Date + sequence(act1_copy[['Activity']]) - 1)%>%
# filter(!duplicated(cut(Date, '3 sec', labels = F)))%>%
slice(seq(from = 1, to = nrow(.), by = 3))%>%
# group_by(Date = cut(Date, '10 min'))%>%
group_by(Date =rep(Date[seq(from = 1, to = nrow(.), by = 10 * 60 / 3)]
, each = 10 * 60 / 3
, length.out = nrow(.)))%>%
summarize(Wet = sum(Wet == 'wet'))
}
, cole_dt = {
# NOTE: this version groups the 10-minute windows with cut(), whose bin
# anchors differ from the positional grouping above, so its output is not
# identical to the other approaches (as mentioned in the text).
copy(dt)[rep(seq_len(.N), Activity)
, .(Date = Date + sequence(act1_copy[['Activity']]) - 1
,Wet, Valid, Activity = 1L)
][seq(from = 1, to = .N, by = 3)
, .(Wet = sum(Wet == 'wet'))
, by = cut(Date, '10 min')]
}
, original_all = {
# The question's original approach: row-by-row expansion with lapply,
# then two rounds of dplyr/lubridate grouping.
act1 <- lapply(1:nrow(act1_copy), function(x){
data.frame(
Valid = rep(act1_copy[x, 1], act1_copy[x, 3]),
# NOTE(review): the format string is missing a space between %d and %H;
# parsing relies on strptime's lenient whitespace handling.
Date = strptime(act1_copy[x, 2], format = "%Y-%m-%d%H:%M:%S")+(seq_len(act1_copy[x, 3])-1),
Activity = rep(1, act1_copy[x, 3]),
Wet = rep(act1_copy[x, 4], act1_copy[x, 3])
)})
act1 <- as.data.frame(do.call(rbind, act1))
act1 <- act1 %>%
mutate(interval = floor_date(Date, unit="minutes") + seconds(floor(second(Date)/3)*3)) %>%
group_by(interval) %>%
summarise(Valid = "ok",
Wet = Wet[which(Date==max(Date))]=="wet") %>%
mutate(int10 = floor_date(interval, unit="hour") +
minutes(floor(minute(interval)/10)*10) +
(min(interval) - min(floor_date(interval, unit="hour") + minutes(floor(minute(interval)/10)*10)))) %>%
group_by(int10) %>%
summarise(Valid = "ok",
Activity = sum(Wet)) %>%
rename(Date = int10) %>%
select(Valid,Date,Activity)
}
)
我有来自两种类型 activity 记录器的数据。第一个记录器记录记录器处于潮湿或干燥状态的秒数(参见 act1)。第二个记录器每 3 秒采样 wet/dry,并记录每 10 分钟湿润的样本总数。给定 3 秒的采样间隔,在每 10 分钟周期结束时记录的值范围从 0(始终干燥)到 200(始终潮湿),参见 act2。
我想使用最有效的方法对第一个记录器的数据进行整形和重新采样,以复制第二个记录器的格式。
我在此处提供的示例使用了数据的子样本(6 行),但我的实际数据集包含超过一年的观察结果(40,000 多行),目前代码已运行 3 天仍未完成。
# Example data from logger type 1: each row is a run of seconds (Activity)
# during which the logger stayed in the state given by Wet, time-stamped
# with the run's start (Date, POSIXct).
act1 <- structure(list(
Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
class = c("POSIXct", "POSIXt"), tzone = ""),
Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
Wet = c("wet", "dry", "wet", "dry", "wet", "dry")),
row.names = c("2", "3", "4", "5", "6", "7"),
class = "data.frame")
# Example data from logger type 2 (wet-sample counts per 10 minutes).
# NOTE(review): this literal is byte-identical to act1 above, so it does not
# actually illustrate the 0..200-per-10-minute format described in the text —
# presumably a copy-paste slip in the example.
act2 <- structure(list(
Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
class = c("POSIXct", "POSIXt"), tzone = ""),
Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
Wet = c("wet", "dry", "wet", "dry", "wet", "dry")), row.names = c("2", "3", "4", "5", "6", "7"),
class = "data.frame")
使用 lapply,我基于 Activity 列中指定的间隔,扩展了 act1 数据框中的 Date 列(POSIXct 格式),并在 Wet 列中保留了对相应状态的引用。
# Expand each run-length row of act1 into one row per second: a row whose
# Activity is N becomes N consecutive rows, one second apart, sharing the
# same Valid/Wet values.
# NOTE: this row-by-row lapply + rbind is the slow path; the answer below
# replaces it with vectorized rep()/sequence().
act1 <- lapply(seq_len(nrow(act1)), function(x){
data.frame(
Valid = rep(act1[x, 1], act1[x, 3]),
# BUG FIX: the original format string "%Y-%m-%d%H:%M:%S" was missing the
# space between the date and time fields; it only parsed because some
# strptime implementations skip whitespace before numeric conversions.
Date = strptime(act1[x, 2], format = "%Y-%m-%d %H:%M:%S")+(seq_len(act1[x, 3])-1),
Activity = rep(1, act1[x, 3]),
Wet = rep(act1[x, 4], act1[x, 3])
)})
# do.call(rbind, ...) stacks the per-row expansions back into one data frame.
act1 <- as.data.frame(do.call(rbind, act1))
然后我使用 dplyr 和 lubridate 将每个观察值分组到 3 秒的分箱中,并判断每个分箱中的最后一个观察值是否为湿。随后我将得到的观测值按 10 分钟分箱,并汇总其中有多少样本是湿的。
library(dplyr)
library(lubridate)
# Downsample the per-second rows to one reading every 3 s (keeping the last
# observation in each 3-second bin, mimicking logger 2's sampling), then
# count the wet samples inside each 10-minute window.
act1 <- act1 %>%
# floor each timestamp to the start of its 3-second bin
mutate(interval = floor_date(Date, unit="minutes") + seconds(floor(second(Date)/3)*3)) %>%
group_by(interval) %>%
# a bin counts as "wet" iff its latest reading is wet
summarise(Valid = "ok",
Wet = Wet[which(Date==max(Date))]=="wet") %>%
# build 10-minute bins, shifted so they are anchored to the first interval
mutate(int10 = floor_date(interval, unit="hour") +
minutes(floor(minute(interval)/10)*10) +
(min(interval) - min(floor_date(interval, unit="hour") + minutes(floor(minute(interval)/10)*10)))) %>%
group_by(int10) %>%
# Activity = number of wet 3-second samples in the 10-minute window
summarise(Valid = "ok",
Activity = sum(Wet)) %>%
rename(Date = int10) %>%
select(Valid,Date,Activity)
我在此处提供的示例使用了原始数据集的一个子集(6 行),但我的实际数据集包含超过一年的观察结果(40,000 多行),目前代码已运行 3 天仍未完成!
矢量化、rep
、cut
和 seq
应该在您完成此任务的工具箱中。
您的第一个涉及 lapply
的主语句可以缩短——您只是想重复这些行。例如,act1[c(1,1), ]
会将 act1 的第一行返回两次。在您的循环中,您访问了 act1[x, 3]
四次。下面这一行将按所需的次数复制我们想要的行:
# Duplicate each row of act1_copy once per second of its Activity run.
act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), times = act1_copy$Activity), ]
> nrow(act1_copy)
[1] 6
> seq_len(nrow(act1_copy))
[1] 1 2 3 4 5 6
> act1_copy[['Activity']]
[1] 78 6 39 9 15 9
> rep(seq_len(nrow(act1_copy)), act1_copy[['Activity']])
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[60] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[119] 3 3 3 3 3 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6
# or if you're into external packages, this is a lot nicer looking:
# tidyr::uncount() repeats each row `Activity` times (same as the rep() above).
tidyr::uncount(act1_copy, weights = Activity)
下一步是更正秒数并将 Activity
重做为 1
。
# Shift each duplicated row's timestamp by its 0-based offset within the
# run (sequence() yields 1..N per original row), then mark every expanded
# row as a single second of activity.
act1a$Date <- act1a$Date + (sequence(act1_copy$Activity) - 1L)
act1a$Activity <- 1L
现在,让旧记录器数据与新记录器数据匹配(即每 3 秒一条记录)的最后一步是按 3 秒分组。请务必注意,每 3 秒一条相当于每 3 行一条。因此,取决于您对 act$Date
完整性(每秒恰有一条记录)的信任程度,我们可以采用以下两种方法之一:
# Keep the first row of each 3-second bin; cut() bins by actual clock time,
# so this stays correct even if some seconds are missing:
act1b <- act1a[!duplicated(cut(act1a$Date, '3 sec', labels = F)), ]
# or if you're sure there is one reading per second, you can just do once every three rows
act1b <- act1a[seq(from = 1, to = nrow(act1a), by = 3), ]
# what cut() looks like for reference
cut(act1a$Date, '3 sec', labels = F)
[1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10 11 11 11 12 12 12 13 13 13
[40] 14 14 14 15 15 15 16 16 16 17 17 17 18 18 18 19 19 19 20 20 20 21 21 21 22 22 22 23 23 23 24 24 24 25 25 25 26 26 26
[79] 27 27 27 28 28 28 29 29 29 30 30 30 31 31 31 32 32 32 33 33 33 34 34 34 35 35 35 36 36 36 37 37 37 38 38 38 39 39 39
[118] 40 40 40 41 41 41 42 42 42 43 43 43 44 44 44 45 45 45 46 46 46 47 47 47 48 48 48 49 49 49 50 50 50 51 51 51 52 52 52
#or with labels:
cut(act1a$Date, '3 sec')
[1] 2015-03-05 13:11:33 2015-03-05 13:11:33 2015-03-05 13:11:33 2015-03-05 13:11:36 2015-03-05 13:11:36
[6] 2015-03-05 13:11:36 2015-03-05 13:11:39 2015-03-05 13:11:39 2015-03-05 13:11:39 2015-03-05 13:11:42
[11] 2015-03-05 13:11:42 2015-03-05 13:11:42 2015-03-05 13:11:45 2015-03-05 13:11:45 2015-03-05 13:11:45
[16] 2015-03-05 13:11:48 2015-03-05 13:11:48 2015-03-05 13:11:48 2015-03-05 13:11:51 2015-03-05 13:11:51
[21] 2015-03-05 13:11:51 2015-03-05 13:11:54 2015-03-05 13:11:54 2015-03-05 13:11:54 2015-03-05 13:11:57
# truncated for brevity.
最后一步是聚合数据。就像最后一步一样,我们可以使用 cut()
来使用时间分组,或者我们可以再次使用 rep(seq())
来使我们的分组更快一些。
# Option 1: bin by clock time with cut() (robust to gaps in the data):
aggregate(act1b$Wet, list(Date = cut(act1b$Date, '10 min')), FUN = function(x) sum(x == 'wet'))
# Option 2: if you know there is one reading per second, bin positionally —
# 10 * 60 / 3 = 200 rows per 10-minute window, each window labelled with
# its first timestamp:
aggregate(act1b$Wet,
list(Date = rep(act1b$Date[seq(from = 1, to = nrow(act1b), by = 10 * 60 / 3)]
, each = 10 * 60 / 3
, length.out = nrow(act1b)))
, FUN = function(x) sum(x == 'wet'))
把它们放在一起,你得到:
# Putting it all together (base R only):
# 1) expand the run-length rows to one row per second,
# 2) keep every 3rd row (one reading per 3 s),
# 3) count wet readings per 10-minute window (200 rows each).
act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), times = act1_copy$Activity), ]
act1a$Date <- act1a$Date + (sequence(act1_copy$Activity) - 1L)
act1a$Activity <- 1L
act1b <- act1a[seq.int(1L, nrow(act1a), by = 3L), ]
aggregate(
  act1b$Wet,
  list(Date = rep(act1b$Date[seq.int(1L, nrow(act1b), by = 10 * 60 / 3)],
                  each = 10 * 60 / 3,
                  length.out = nrow(act1b))),
  FUN = function(x) sum(x == 'wet')
)
library(tidyr)
library(dplyr)
# Same pipeline in tidyverse style: uncount() expands the rows, slice()
# keeps every 3rd reading, and group_by() labels each 200-row (10-minute)
# window with its first timestamp.
tidyr::uncount(act1_copy, weights = Activity)%>%
# per-row second offsets; relies on uncount() preserving row order
mutate(Activity = 1L
, Date = Date + sequence(act1_copy[['Activity']]) - 1)%>%
slice(seq(from = 1, to = nrow(.), by = 3))%>%
group_by(Date =rep(Date[seq(from = 1, to = nrow(.), by = 10 * 60 / 3)]
, each = 10 * 60 / 3
, length.out = nrow(.)))%>%
summarize(Wet = sum(Wet == 'wet'))
# A tibble: 1 x 2
Date Wet
<dttm> <int>
1 2015-03-05 13:11:33 44
下面是一些性能测试和代码——请注意,我在 data.table
版本的 10 分钟汇总上遇到了问题,所以它的输出与其他方法不完全相同:
Unit: milliseconds
expr min lq mean median uq max neval
cole_base 2.044400 2.174451 2.328061 2.253251 2.340801 6.424400 100
cole_dplyr 3.152901 3.359501 3.502880 3.428101 3.515302 8.248401 100
cole_dt 3.308601 3.541151 3.884475 3.698201 3.796652 13.155701 100
original_all 32.626601 33.061152 34.531462 33.392151 34.237601 50.499501 100
library(microbenchmark)
library(data.table)
library(dplyr)
library(tidyr)
library(lubridate)
# Rebuild the 6-row example data used by all four timed approaches.
act1_copy <- structure(list(
Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
class = c("POSIXct", "POSIXt"), tzone = ""),
Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
Wet = c("wet", "dry", "wet", "dry", "wet", "dry")),
row.names = c("2", "3", "4", "5", "6", "7"),
class = "data.frame")
# data.table version of the same data (copy()-ed inside the benchmark so
# the timed expression does not mutate it).
dt <- as.data.table(act1_copy)
# Benchmark: vectorized base R vs dplyr vs data.table vs the original
# lapply + dplyr pipeline. Each expression expands act1_copy to one row per
# second, downsamples to every 3rd second, and counts wet readings per
# 10-minute window.
microbenchmark( cole_base = {
act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), act1_copy[['Activity']]), ]
act1a[['Date']] <- act1a[['Date']] + sequence(act1_copy[['Activity']]) - 1
# if you know there is definitely one reading per second
# act1a[['Date']] <- act1a[['Date']] + seq_len(nrow(act1a)) - 1
act1a[['Activity']] <- 1L
# act1b <- act1a[!duplicated(cut(act1a$Date, '3 sec', labels = F)), ]
# or if you're sure there is one reading per second, you can just do once every three rows
act1b <- act1a[seq(from = 1, to = nrow(act1a), by = 3), ]
# aggregate(act1b$Wet, list(Date = cut(act1b$Date, '10 min')), FUN = function(x) sum(x == 'wet'))
# or if you know there is one reading per second (200 rows per 10 minutes):
aggregate(act1b$Wet,
list(Date = rep(act1b$Date[seq(from = 1, to = nrow(act1b), by = 10 * 60 / 3)]
, each = 10 * 60 / 3
, length.out = nrow(act1b)))
, FUN = function(x) sum(x == 'wet'))
}
, cole_dplyr = {
tidyr::uncount(act1_copy, weights = Activity)%>%
mutate(Activity = 1L
, Date = Date + sequence(act1_copy[['Activity']]) - 1)%>%
# filter(!duplicated(cut(Date, '3 sec', labels = F)))%>%
slice(seq(from = 1, to = nrow(.), by = 3))%>%
# group_by(Date = cut(Date, '10 min'))%>%
group_by(Date =rep(Date[seq(from = 1, to = nrow(.), by = 10 * 60 / 3)]
, each = 10 * 60 / 3
, length.out = nrow(.)))%>%
summarize(Wet = sum(Wet == 'wet'))
}
, cole_dt = {
# NOTE: this version groups the 10-minute windows with cut(), whose bin
# anchors differ from the positional grouping above, so its output is not
# identical to the other approaches (as mentioned in the text).
copy(dt)[rep(seq_len(.N), Activity)
, .(Date = Date + sequence(act1_copy[['Activity']]) - 1
,Wet, Valid, Activity = 1L)
][seq(from = 1, to = .N, by = 3)
, .(Wet = sum(Wet == 'wet'))
, by = cut(Date, '10 min')]
}
, original_all = {
# The question's original approach: row-by-row expansion with lapply,
# then two rounds of dplyr/lubridate grouping.
act1 <- lapply(1:nrow(act1_copy), function(x){
data.frame(
Valid = rep(act1_copy[x, 1], act1_copy[x, 3]),
# NOTE(review): the format string is missing a space between %d and %H;
# parsing relies on strptime's lenient whitespace handling.
Date = strptime(act1_copy[x, 2], format = "%Y-%m-%d%H:%M:%S")+(seq_len(act1_copy[x, 3])-1),
Activity = rep(1, act1_copy[x, 3]),
Wet = rep(act1_copy[x, 4], act1_copy[x, 3])
)})
act1 <- as.data.frame(do.call(rbind, act1))
act1 <- act1 %>%
mutate(interval = floor_date(Date, unit="minutes") + seconds(floor(second(Date)/3)*3)) %>%
group_by(interval) %>%
summarise(Valid = "ok",
Wet = Wet[which(Date==max(Date))]=="wet") %>%
mutate(int10 = floor_date(interval, unit="hour") +
minutes(floor(minute(interval)/10)*10) +
(min(interval) - min(floor_date(interval, unit="hour") + minutes(floor(minute(interval)/10)*10)))) %>%
group_by(int10) %>%
summarise(Valid = "ok",
Activity = sum(Wet)) %>%
rename(Date = int10) %>%
select(Valid,Date,Activity)
}
)