R:低于可变分位数阈值的滚动平均值
R: Rolling mean on values below variable quantile threshold
我有一个大data.table
。我需要:
- 数字列的滚动分位数
- 应用于低于(移动)分位数阈值的值的数字列的滚动平均值
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
> data
group value
1: A -0.3260365
2: A 0.5524619
3: A -0.6749438
4: A 0.2143595
5: A 0.3107692
6: A 1.1739663
7: A 0.6187899
8: A -0.1127343
9: A 0.9170283
10: A -0.2232594
11: B 0.5264481
12: B -0.7948444
13: B 1.4277555
14: B -1.4668197
15: B -0.2366834
16: B -0.1933380
17: B -0.8497547
为了确定 'value' 列的滚动分位数,我使用了包 caTools
中的函数 runquantile()
,它在我的数据集上执行大约需要 1 分钟。
对于相同的滚动window(此处k=4
),当关注计算时间时,如何获得移动分位数以下的滚动平均值?在示例中,结果应类似于 'mean_below_q'.
列
library(caTools)
data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]
> data
group value rolling_q mean_below_q
1: A -0.3260365 NA NA
2: A 0.5524619 NA NA
3: A -0.6749438 NA NA
4: A 0.2143595 -0.21795730 -0.50049020
5: A 0.3107692 0.23364141 -0.23029220
6: A 1.1739663 0.23364141 -0.23029220
7: A 0.6187899 0.37237334 0.26256430
8: A -0.1127343 0.37237334 0.09901745
9: A 0.9170283 0.67843754 0.25302780
10: A -0.2232594 0.03357052 -0.16799680
11: B 0.5264481 NA NA
12: B -0.7948444 NA NA
13: B 1.4277555 NA NA
14: B -1.4668197 -0.53058593 -1.13083200
15: B -0.2366834 -0.68321222 -1.13083200
16: B -0.1933380 -0.22801430 -0.85175150
17: B -0.8497547 -0.72714047 -1.15828700
非常感谢。
根据@chinsoon12 brilliant 脚本,它解决了问题,并且在大型数据集上大约需要 15 分钟。
# Option 1)
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))
sz <- 4L
prob <- 0.4
#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]
data[, rolling_q := frollapply(value, sz, function(x) {
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
#.(q, sum(x[1L:lo]) / lo)
}), group]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
最后,我尝试将 caTools
包中的 runquantile()
与选项 1) 代码一起用于低于阈值的滚动平均值。它似乎工作正常,在大型数据集上总共花费 2.5 分钟。
# rolling_q via runquantile & mean_below_q via Option 1)
library(data.table)
library(caTools)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))
data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]
sz <- 4L
prob <- 0.4
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
> data
group value rolling_q sr er mean_below_q
1: A -0.3260365 NA -2 1 NA
2: A 0.5524619 NA -1 2 NA
3: A -0.6749438 NA 0 3 NA
4: A 0.2143595 -0.21795730 1 4 -0.50049017
5: A 0.3107692 0.23364141 2 5 -0.23029219
6: A 1.1739663 0.23364141 3 6 -0.23029219
7: A 0.6187899 0.37237334 4 7 0.26256434
8: A -0.1127343 0.37237334 5 8 0.09901745
9: A 0.9170283 0.67843754 6 9 0.25302777
10: A -0.2232594 0.03357052 7 10 -0.16799684
11: B 0.5264481 NA 8 11 NA
12: B -0.7948444 NA 9 12 NA
13: B 1.4277555 NA 10 13 NA
14: B -1.4668197 -0.53058593 11 14 -1.13083206
15: B -0.2366834 -0.68321222 12 15 -1.13083206
16: B -0.1933380 -0.22801430 13 16 -0.85175154
17: B -0.8497547 -0.72714047 14 17 -1.15828722
18: C 0.0000000 NA 15 18 NA
19: C 0.0000000 NA 16 19 NA
20: C 0.0000000 NA 17 20 NA
21: C -0.9000000 0.00000000 18 21 -0.22500000
22: C 0.0000000 0.00000000 19 22 -0.22500000
23: C 0.0000000 0.00000000 20 23 -0.22500000
24: C 0.0000000 0.00000000 21 24 -0.22500000
25: C 0.0000000 0.00000000 22 25 0.00000000
26: C -0.5000000 0.00000000 23 26 -0.12500000
27: C -0.8000000 -0.40000000 24 27 -0.65000000
28: C 0.0000000 -0.40000000 25 28 -0.65000000
29: C 0.0000000 -0.40000000 26 29 -0.65000000
30: C 0.0000000 0.00000000 27 30 -0.20000000
group value rolling_q sr er mean_below_q
我认为有一个错字,因为数据显示的平均值低于百分位数而不是高于百分位数。
这里有 2 个选项:
1) 使用 frollapply
(因为它 returns 长度为 1 的值),因此您需要另一个非等连接来计算低于百分位数的平均值。
data[, rolling_q := frollapply(value, sz, function(x) {
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
#.(q, sum(x[1L:lo]) / lo)
}), group]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
2) 使用 1 个非等连接来计算两个值
data[, c("rolling_q", "mean_below_q") :=
.SD[.SD, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, {
if (.N >= sz) {
x <- x.value
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
.(qs, sum(x[1L:lo]) / lo)
} else
.(NA_real_, NA_real_)
}][, (1L:3L) := NULL]
]
输出:
group value sr er rolling_q mean_below_q
1: A -0.3260365 -2 1 NA NA
2: A 0.5524619 -1 2 NA NA
3: A -0.6749438 0 3 NA NA
4: A 0.2143595 1 4 -0.21795730 -0.50049017
5: A 0.3107692 2 5 0.23364141 -0.23029219
6: A 1.1739663 3 6 0.23364141 -0.23029219
7: A 0.6187899 4 7 0.37237334 0.26256434
8: A -0.1127343 5 8 0.37237334 0.09901745
9: A 0.9170283 6 9 0.67843754 0.25302777
10: A -0.2232594 7 10 0.03357052 -0.16799684
11: B 0.5264481 8 11 NA NA
12: B -0.7948444 9 12 NA NA
13: B 1.4277555 10 13 NA NA
14: B -1.4668197 11 14 -0.53058593 -1.13083206
15: B -0.2366834 12 15 -0.68321222 -1.13083206
16: B -0.1933380 13 16 -0.22801430 -0.85175154
17: B -0.8497547 14 17 -0.72714047 -1.15828722
数据:
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
sz <- 4L
prob <- 0.4
#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]
我有一个大data.table
。我需要:
- 数字列的滚动分位数
- 应用于低于(移动)分位数阈值的值的数字列的滚动平均值
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
> data
group value
1: A -0.3260365
2: A 0.5524619
3: A -0.6749438
4: A 0.2143595
5: A 0.3107692
6: A 1.1739663
7: A 0.6187899
8: A -0.1127343
9: A 0.9170283
10: A -0.2232594
11: B 0.5264481
12: B -0.7948444
13: B 1.4277555
14: B -1.4668197
15: B -0.2366834
16: B -0.1933380
17: B -0.8497547
为了确定 'value' 列的滚动分位数,我使用了包 caTools
中的函数 runquantile()
,它在我的数据集上执行大约需要 1 分钟。
对于相同的滚动window(此处k=4
),当关注计算时间时,如何获得移动分位数以下的滚动平均值?在示例中,结果应类似于 'mean_below_q'.
library(caTools)
data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]
> data
group value rolling_q mean_below_q
1: A -0.3260365 NA NA
2: A 0.5524619 NA NA
3: A -0.6749438 NA NA
4: A 0.2143595 -0.21795730 -0.50049020
5: A 0.3107692 0.23364141 -0.23029220
6: A 1.1739663 0.23364141 -0.23029220
7: A 0.6187899 0.37237334 0.26256430
8: A -0.1127343 0.37237334 0.09901745
9: A 0.9170283 0.67843754 0.25302780
10: A -0.2232594 0.03357052 -0.16799680
11: B 0.5264481 NA NA
12: B -0.7948444 NA NA
13: B 1.4277555 NA NA
14: B -1.4668197 -0.53058593 -1.13083200
15: B -0.2366834 -0.68321222 -1.13083200
16: B -0.1933380 -0.22801430 -0.85175150
17: B -0.8497547 -0.72714047 -1.15828700
非常感谢。
根据@chinsoon12 brilliant 脚本,它解决了问题,并且在大型数据集上大约需要 15 分钟。
# Option 1)
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))
sz <- 4L
prob <- 0.4
#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]
data[, rolling_q := frollapply(value, sz, function(x) {
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
#.(q, sum(x[1L:lo]) / lo)
}), group]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
最后,我尝试将 caTools
包中的 runquantile()
与选项 1) 代码一起用于低于阈值的滚动平均值。它似乎工作正常,在大型数据集上总共花费 2.5 分钟。
# rolling_q via runquantile & mean_below_q via Option 1)
library(data.table)
library(caTools)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))
data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]
sz <- 4L
prob <- 0.4
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
> data
group value rolling_q sr er mean_below_q
1: A -0.3260365 NA -2 1 NA
2: A 0.5524619 NA -1 2 NA
3: A -0.6749438 NA 0 3 NA
4: A 0.2143595 -0.21795730 1 4 -0.50049017
5: A 0.3107692 0.23364141 2 5 -0.23029219
6: A 1.1739663 0.23364141 3 6 -0.23029219
7: A 0.6187899 0.37237334 4 7 0.26256434
8: A -0.1127343 0.37237334 5 8 0.09901745
9: A 0.9170283 0.67843754 6 9 0.25302777
10: A -0.2232594 0.03357052 7 10 -0.16799684
11: B 0.5264481 NA 8 11 NA
12: B -0.7948444 NA 9 12 NA
13: B 1.4277555 NA 10 13 NA
14: B -1.4668197 -0.53058593 11 14 -1.13083206
15: B -0.2366834 -0.68321222 12 15 -1.13083206
16: B -0.1933380 -0.22801430 13 16 -0.85175154
17: B -0.8497547 -0.72714047 14 17 -1.15828722
18: C 0.0000000 NA 15 18 NA
19: C 0.0000000 NA 16 19 NA
20: C 0.0000000 NA 17 20 NA
21: C -0.9000000 0.00000000 18 21 -0.22500000
22: C 0.0000000 0.00000000 19 22 -0.22500000
23: C 0.0000000 0.00000000 20 23 -0.22500000
24: C 0.0000000 0.00000000 21 24 -0.22500000
25: C 0.0000000 0.00000000 22 25 0.00000000
26: C -0.5000000 0.00000000 23 26 -0.12500000
27: C -0.8000000 -0.40000000 24 27 -0.65000000
28: C 0.0000000 -0.40000000 25 28 -0.65000000
29: C 0.0000000 -0.40000000 26 29 -0.65000000
30: C 0.0000000 0.00000000 27 30 -0.20000000
group value rolling_q sr er mean_below_q
我认为有一个错字,因为数据显示的平均值低于百分位数而不是高于百分位数。
这里有 2 个选项:
1) 使用 frollapply
(因为它 returns 长度为 1 的值),因此您需要另一个非等连接来计算低于百分位数的平均值。
data[, rolling_q := frollapply(value, sz, function(x) {
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
#.(q, sum(x[1L:lo]) / lo)
}), group]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
2) 使用 1 个非等连接来计算两个值
data[, c("rolling_q", "mean_below_q") :=
.SD[.SD, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, {
if (.N >= sz) {
x <- x.value
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
.(qs, sum(x[1L:lo]) / lo)
} else
.(NA_real_, NA_real_)
}][, (1L:3L) := NULL]
]
输出:
group value sr er rolling_q mean_below_q
1: A -0.3260365 -2 1 NA NA
2: A 0.5524619 -1 2 NA NA
3: A -0.6749438 0 3 NA NA
4: A 0.2143595 1 4 -0.21795730 -0.50049017
5: A 0.3107692 2 5 0.23364141 -0.23029219
6: A 1.1739663 3 6 0.23364141 -0.23029219
7: A 0.6187899 4 7 0.37237334 0.26256434
8: A -0.1127343 5 8 0.37237334 0.09901745
9: A 0.9170283 6 9 0.67843754 0.25302777
10: A -0.2232594 7 10 0.03357052 -0.16799684
11: B 0.5264481 8 11 NA NA
12: B -0.7948444 9 12 NA NA
13: B 1.4277555 10 13 NA NA
14: B -1.4668197 11 14 -0.53058593 -1.13083206
15: B -0.2366834 12 15 -0.68321222 -1.13083206
16: B -0.1933380 13 16 -0.22801430 -0.85175154
17: B -0.8497547 14 17 -0.72714047 -1.15828722
数据:
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
sz <- 4L
prob <- 0.4
#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]