R:低于可变分位数阈值的滚动平均值

R: Rolling mean on values below variable quantile threshold

我有一个大data.table。我需要:

library(data.table)

set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))

> data
    group      value
 1:     A -0.3260365
 2:     A  0.5524619
 3:     A -0.6749438
 4:     A  0.2143595
 5:     A  0.3107692
 6:     A  1.1739663
 7:     A  0.6187899
 8:     A -0.1127343
 9:     A  0.9170283
10:     A -0.2232594
11:     B  0.5264481
12:     B -0.7948444
13:     B  1.4277555
14:     B -1.4668197
15:     B -0.2366834
16:     B -0.1933380
17:     B -0.8497547

为了确定 'value' 列的滚动分位数,我使用了包 caTools 中的函数 runquantile(),它在我的数据集上执行大约需要 1 分钟。

对于相同的滚动window(此处k=4),当关注计算时间时,如何获得移动分位数以下的滚动平均值?在示例中,结果应类似于 'mean_below_q'.

library(caTools)

data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]

> data
    group      value   rolling_q mean_below_q
 1:     A -0.3260365          NA           NA
 2:     A  0.5524619          NA           NA
 3:     A -0.6749438          NA           NA
 4:     A  0.2143595 -0.21795730  -0.50049020
 5:     A  0.3107692  0.23364141  -0.23029220
 6:     A  1.1739663  0.23364141  -0.23029220
 7:     A  0.6187899  0.37237334   0.26256430
 8:     A -0.1127343  0.37237334   0.09901745
 9:     A  0.9170283  0.67843754   0.25302780
10:     A -0.2232594  0.03357052  -0.16799680
11:     B  0.5264481          NA           NA
12:     B -0.7948444          NA           NA
13:     B  1.4277555          NA           NA
14:     B -1.4668197 -0.53058593  -1.13083200
15:     B -0.2366834 -0.68321222  -1.13083200
16:     B -0.1933380 -0.22801430  -0.85175150
17:     B -0.8497547 -0.72714047  -1.15828700

非常感谢。


根据@chinsoon12 brilliant 脚本,它解决了问题,并且在大型数据集上大约需要 15 分钟。

# Option 1)

library(data.table)

set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))
sz <- 4L
prob <- 0.4

#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]

data[, rolling_q := frollapply(value, sz, function(x) {
        x <- sort(x, partial = unique(c(lo, hi)))
        qs <- x[lo]
        i <- which(index > lo & x[hi] != qs)
        h <- (index - lo)[i]
        qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
        qs
        #.(q, sum(x[1L:lo]) / lo)
    }), group]

data[, mean_below_q := 
    data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
        by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]

最后,我尝试将 caTools 包中的 runquantile() 与选项 1) 代码一起用于低于阈值的滚动平均值。它似乎工作正常,在大型数据集上总共花费 2.5 分钟。

# rolling_q via runquantile & mean_below_q via Option 1)

library(data.table)
library(caTools)

set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))

data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]

sz <- 4L
prob <- 0.4

index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]

data[, mean_below_q := 
    data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
        by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]

> data
    group      value   rolling_q sr er mean_below_q
 1:     A -0.3260365          NA -2  1           NA
 2:     A  0.5524619          NA -1  2           NA
 3:     A -0.6749438          NA  0  3           NA
 4:     A  0.2143595 -0.21795730  1  4  -0.50049017
 5:     A  0.3107692  0.23364141  2  5  -0.23029219
 6:     A  1.1739663  0.23364141  3  6  -0.23029219
 7:     A  0.6187899  0.37237334  4  7   0.26256434
 8:     A -0.1127343  0.37237334  5  8   0.09901745
 9:     A  0.9170283  0.67843754  6  9   0.25302777
10:     A -0.2232594  0.03357052  7 10  -0.16799684
11:     B  0.5264481          NA  8 11           NA
12:     B -0.7948444          NA  9 12           NA
13:     B  1.4277555          NA 10 13           NA
14:     B -1.4668197 -0.53058593 11 14  -1.13083206
15:     B -0.2366834 -0.68321222 12 15  -1.13083206
16:     B -0.1933380 -0.22801430 13 16  -0.85175154
17:     B -0.8497547 -0.72714047 14 17  -1.15828722
18:     C  0.0000000          NA 15 18           NA
19:     C  0.0000000          NA 16 19           NA
20:     C  0.0000000          NA 17 20           NA
21:     C -0.9000000  0.00000000 18 21  -0.22500000
22:     C  0.0000000  0.00000000 19 22  -0.22500000
23:     C  0.0000000  0.00000000 20 23  -0.22500000
24:     C  0.0000000  0.00000000 21 24  -0.22500000
25:     C  0.0000000  0.00000000 22 25   0.00000000
26:     C -0.5000000  0.00000000 23 26  -0.12500000
27:     C -0.8000000 -0.40000000 24 27  -0.65000000
28:     C  0.0000000 -0.40000000 25 28  -0.65000000
29:     C  0.0000000 -0.40000000 26 29  -0.65000000
30:     C  0.0000000  0.00000000 27 30  -0.20000000
    group      value   rolling_q sr er mean_below_q

我认为有一个错字,因为数据显示的平均值低于百分位数而不是高于百分位数。

这里有 2 个选项:

1) 使用 frollapply(因为它 returns 长度为 1 的值),因此您需要另一个非等连接来计算低于百分位数的平均值。

data[, rolling_q := frollapply(value, sz, function(x) {
        x <- sort(x, partial = unique(c(lo, hi)))
        qs <- x[lo]
        i <- which(index > lo & x[hi] != qs)
        h <- (index - lo)[i]
        qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
        qs
        #.(q, sum(x[1L:lo]) / lo)
    }), group]

data[, mean_below_q := 
    data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
        by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]

2) 使用 1 个非等连接来计算两个值

data[, c("rolling_q", "mean_below_q") := 
    .SD[.SD, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
        by=.EACHI, {
            if (.N >= sz) {
                x <- x.value
                x <- sort(x, partial = unique(c(lo, hi)))
                qs <- x[lo]
                i <- which(index > lo & x[hi] != qs)
                h <- (index - lo)[i]
                qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
                qs

                .(qs, sum(x[1L:lo]) / lo)
            } else 
                .(NA_real_, NA_real_)
        }][, (1L:3L) := NULL]
]

输出:

    group      value sr er   rolling_q mean_below_q
 1:     A -0.3260365 -2  1          NA           NA
 2:     A  0.5524619 -1  2          NA           NA
 3:     A -0.6749438  0  3          NA           NA
 4:     A  0.2143595  1  4 -0.21795730  -0.50049017
 5:     A  0.3107692  2  5  0.23364141  -0.23029219
 6:     A  1.1739663  3  6  0.23364141  -0.23029219
 7:     A  0.6187899  4  7  0.37237334   0.26256434
 8:     A -0.1127343  5  8  0.37237334   0.09901745
 9:     A  0.9170283  6  9  0.67843754   0.25302777
10:     A -0.2232594  7 10  0.03357052  -0.16799684
11:     B  0.5264481  8 11          NA           NA
12:     B -0.7948444  9 12          NA           NA
13:     B  1.4277555 10 13          NA           NA
14:     B -1.4668197 11 14 -0.53058593  -1.13083206
15:     B -0.2366834 12 15 -0.68321222  -1.13083206
16:     B -0.1933380 13 16 -0.22801430  -0.85175154
17:     B -0.8497547 14 17 -0.72714047  -1.15828722

数据:

library(data.table)

set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
sz <- 4L
prob <- 0.4

#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]