整个 data.frame R 的分段插值
Piecewise interpolation for entire data.frame R
我有一个来自使用特殊压缩算法的来源的数据集。简而言之,只有当斜率(变化率)的变化大于一定百分比(比如 5%)时,才会记录新的测量值。
但是,对于我目前正在进行的分析,我需要定期获取值。我可以使用 approx
、approxfun
或 spline
对不同的变量与时间进行分段插值(下面数据中的 tme
),但我想这样做对于单次拍摄的所有变量(data.table
的列)。
library(data.table)
q = setDT(
structure(list(tme = structure(c(1463172120, 1463173320, 1463175720,
1463180520, 1463182920, 1463187720, 1463188920, 1463190120, 1463191320,
1463192520, 1463202180, 1463203380, 1463204580, 1463205780, 1463206980,
1463208180, 1463218980, 1463233440, 1463244240, 1463245440, 1463246640,
1463247840, 1463249040, 1463250240, 1463251440, 1463252640, 1463253840,
1463255040, 1463256240, 1463316360, 1463317560, 1463318760, 1463319960,
1463321160, 1463322360, 1463323560, 1463324760, 1463325960, 1463327160,
1463328360, 1463329560, 1463330760, 1463331960), class = c("POSIXct",
"POSIXt"), tzone = "America/Montreal"), rh = c(50.36, 47.31,
46.39, 46.99, 47.89, 50.37, 51.29, 51.92, 54.97, 67.64, 69.38,
68.96, 69.89, 56.66, 51.23, 55.38, 64.36, 50.72, 31.33, 31.38,
32.65, 33.15, 33.05, 31.87, 32.58, 32.65, 31.06, 29.82, 28.72,
67.95, 66.68, 64.66, 62.12, 59.86, 58.11, 57.41, 56.5, 56.16,
55.69, 54.57, 53.89, 53.81, 52.01), degc = c(30.0055555555556,
30.3611111111111, 30.6611111111111, 30.5833333333333, 30.2666666666667,
28.6888888888889, 28.2555555555556, 28.0722222222222, 27.4944444444444,
25.0722222222222, 24.8111111111111, 24.7166666666667, 24.1666666666667,
25.4111111111111, 25.5222222222222, 24.3555555555556, 22.7722222222222,
25.5222222222222, 27.8111111111111, 27.9888888888889, 28.0277777777778,
28.1333333333333, 28.5333333333333, 28.7, 28.85, 29.1555555555556,
28.8388888888889, 29.5111111111111, 29.6722222222222, 22.3888888888889,
22.5722222222222, 22.9444444444444, 23.3722222222222, 23.6777777777778,
23.8777777777778, 24.2055555555556, 24.6888888888889, 24.9777777777778,
25.3888888888889, 25.8, 26.1, 26.1555555555556, 26.7388888888889
)), .Names = c("tme", "rh", "degc"), row.names = c(NA, -43L), class = c("data.table",
"data.frame")))
q
是我查询的数据集。以下是适用于单个变量的方法(本例中为 degc
):
interpolate_degc <- approxfun(x = q$tme, y = q$degc, method = "linear")
# To get the uniform samples:
width <- "10 mins"
new_times <- seq.POSIXt(from = q$tme[1], to = q$tme[nrow(q)], by = width)
new_degc <- interpolate_degc(new_times)
我想一次对所有变量执行此操作,最好使用 data.table
。
这似乎有效:
cols = c("rh", "degc")
DT = q[.(seq(min(tme), max(tme), by="10 mins")), on=.(tme)]
DT[, (cols) := lapply(cols, function(z) with(q,
approxfun(x = tme, y = get(z), method = "linear")
)(tme))]
tme rh degc
1: 2016-05-13 16:42:00 50.360 30.00556
2: 2016-05-13 16:52:00 48.835 30.18333
3: 2016-05-13 17:02:00 47.310 30.36111
4: 2016-05-13 17:12:00 47.080 30.43611
5: 2016-05-13 17:22:00 46.850 30.51111
---
263: 2016-05-15 12:22:00 54.026 26.04000
264: 2016-05-15 12:32:00 53.866 26.11667
265: 2016-05-15 12:42:00 53.826 26.14444
266: 2016-05-15 12:52:00 53.270 26.33056
267: 2016-05-15 13:02:00 52.370 26.62222
通常,当您想遍历列时,lapply
或 Map
就可以了。
工作原理: 在 with(q, ...)
、tme
和 get(z)
中引用 q
的列,但在外部其中,我们正在查看 DT
的列(在本例中只是 tme
)。
做同样事情的另一种方法:
q[, {
tt = seq(min(tme), max(tme), by="10 mins")
c(
.(tme = tt),
lapply(.SD, function(z) approxfun(x = tme, y = z, method="linear")(tt))
)
}, .SDcols=cols]
对于时间序列,我喜欢使用专门的包,如 xts
和 zoo
:
library(xts)
ts <- merge(xts(x = q[,-1], order.by = q[,1]), new_times)
head(ts)
#> rh degc
#> 2016-05-13 16:42:00 50.36 30.00556
#> 2016-05-13 16:52:00 NA NA
#> 2016-05-13 17:02:00 47.31 30.36111
#> 2016-05-13 17:12:00 NA NA
#> 2016-05-13 17:22:00 NA NA
#> 2016-05-13 17:32:00 NA NA
head(na.approx(ts))
#> rh degc
#> 2016-05-13 16:42:00 50.360 30.00556
#> 2016-05-13 16:52:00 48.835 30.18333
#> 2016-05-13 17:02:00 47.310 30.36111
#> 2016-05-13 17:12:00 47.080 30.43611
#> 2016-05-13 17:22:00 46.850 30.51111
#> 2016-05-13 17:32:00 46.620 30.58611
head(na.spline(ts))
#> rh degc
#> 2016-05-13 16:42:00 50.36000 30.00556
#> 2016-05-13 16:52:00 48.52407 30.20524
#> 2016-05-13 17:02:00 47.31000 30.36111
#> 2016-05-13 17:12:00 46.62601 30.47791
#> 2016-05-13 17:22:00 46.33972 30.56219
#> 2016-05-13 17:32:00 46.30857 30.62093
我有一个来自使用特殊压缩算法的来源的数据集。简而言之,只有当斜率(变化率)的变化大于一定百分比(比如 5%)时,才会记录新的测量值。
但是,对于我目前正在进行的分析,我需要定期获取值。我可以使用 approx
、approxfun
或 spline
对不同的变量与时间进行分段插值(下面数据中的 tme
),但我想这样做对于单次拍摄的所有变量(data.table
的列)。
library(data.table)
q = setDT(
structure(list(tme = structure(c(1463172120, 1463173320, 1463175720,
1463180520, 1463182920, 1463187720, 1463188920, 1463190120, 1463191320,
1463192520, 1463202180, 1463203380, 1463204580, 1463205780, 1463206980,
1463208180, 1463218980, 1463233440, 1463244240, 1463245440, 1463246640,
1463247840, 1463249040, 1463250240, 1463251440, 1463252640, 1463253840,
1463255040, 1463256240, 1463316360, 1463317560, 1463318760, 1463319960,
1463321160, 1463322360, 1463323560, 1463324760, 1463325960, 1463327160,
1463328360, 1463329560, 1463330760, 1463331960), class = c("POSIXct",
"POSIXt"), tzone = "America/Montreal"), rh = c(50.36, 47.31,
46.39, 46.99, 47.89, 50.37, 51.29, 51.92, 54.97, 67.64, 69.38,
68.96, 69.89, 56.66, 51.23, 55.38, 64.36, 50.72, 31.33, 31.38,
32.65, 33.15, 33.05, 31.87, 32.58, 32.65, 31.06, 29.82, 28.72,
67.95, 66.68, 64.66, 62.12, 59.86, 58.11, 57.41, 56.5, 56.16,
55.69, 54.57, 53.89, 53.81, 52.01), degc = c(30.0055555555556,
30.3611111111111, 30.6611111111111, 30.5833333333333, 30.2666666666667,
28.6888888888889, 28.2555555555556, 28.0722222222222, 27.4944444444444,
25.0722222222222, 24.8111111111111, 24.7166666666667, 24.1666666666667,
25.4111111111111, 25.5222222222222, 24.3555555555556, 22.7722222222222,
25.5222222222222, 27.8111111111111, 27.9888888888889, 28.0277777777778,
28.1333333333333, 28.5333333333333, 28.7, 28.85, 29.1555555555556,
28.8388888888889, 29.5111111111111, 29.6722222222222, 22.3888888888889,
22.5722222222222, 22.9444444444444, 23.3722222222222, 23.6777777777778,
23.8777777777778, 24.2055555555556, 24.6888888888889, 24.9777777777778,
25.3888888888889, 25.8, 26.1, 26.1555555555556, 26.7388888888889
)), .Names = c("tme", "rh", "degc"), row.names = c(NA, -43L), class = c("data.table",
"data.frame")))
q
是我查询的数据集。以下是适用于单个变量的方法(本例中为 degc
):
interpolate_degc <- approxfun(x = q$tme, y = q$degc, method = "linear")
# To get the uniform samples:
width <- "10 mins"
new_times <- seq.POSIXt(from = q$tme[1], to = q$tme[nrow(q)], by = width)
new_degc <- interpolate_degc(new_times)
我想一次对所有变量执行此操作,最好使用 data.table
。
这似乎有效:
cols = c("rh", "degc")
DT = q[.(seq(min(tme), max(tme), by="10 mins")), on=.(tme)]
DT[, (cols) := lapply(cols, function(z) with(q,
approxfun(x = tme, y = get(z), method = "linear")
)(tme))]
tme rh degc
1: 2016-05-13 16:42:00 50.360 30.00556
2: 2016-05-13 16:52:00 48.835 30.18333
3: 2016-05-13 17:02:00 47.310 30.36111
4: 2016-05-13 17:12:00 47.080 30.43611
5: 2016-05-13 17:22:00 46.850 30.51111
---
263: 2016-05-15 12:22:00 54.026 26.04000
264: 2016-05-15 12:32:00 53.866 26.11667
265: 2016-05-15 12:42:00 53.826 26.14444
266: 2016-05-15 12:52:00 53.270 26.33056
267: 2016-05-15 13:02:00 52.370 26.62222
通常,当您想遍历列时,lapply
或 Map
就可以了。
工作原理: 在 with(q, ...)
、tme
和 get(z)
中引用 q
的列,但在外部其中,我们正在查看 DT
的列(在本例中只是 tme
)。
做同样事情的另一种方法:
q[, {
tt = seq(min(tme), max(tme), by="10 mins")
c(
.(tme = tt),
lapply(.SD, function(z) approxfun(x = tme, y = z, method="linear")(tt))
)
}, .SDcols=cols]
对于时间序列,我喜欢使用专门的包,如 xts
和 zoo
:
library(xts)
ts <- merge(xts(x = q[,-1], order.by = q[,1]), new_times)
head(ts)
#> rh degc
#> 2016-05-13 16:42:00 50.36 30.00556
#> 2016-05-13 16:52:00 NA NA
#> 2016-05-13 17:02:00 47.31 30.36111
#> 2016-05-13 17:12:00 NA NA
#> 2016-05-13 17:22:00 NA NA
#> 2016-05-13 17:32:00 NA NA
head(na.approx(ts))
#> rh degc
#> 2016-05-13 16:42:00 50.360 30.00556
#> 2016-05-13 16:52:00 48.835 30.18333
#> 2016-05-13 17:02:00 47.310 30.36111
#> 2016-05-13 17:12:00 47.080 30.43611
#> 2016-05-13 17:22:00 46.850 30.51111
#> 2016-05-13 17:32:00 46.620 30.58611
head(na.spline(ts))
#> rh degc
#> 2016-05-13 16:42:00 50.36000 30.00556
#> 2016-05-13 16:52:00 48.52407 30.20524
#> 2016-05-13 17:02:00 47.31000 30.36111
#> 2016-05-13 17:12:00 46.62601 30.47791
#> 2016-05-13 17:22:00 46.33972 30.56219
#> 2016-05-13 17:32:00 46.30857 30.62093