指定值的最接近后续索引
Closest subsequent index for a specified value
考虑一个向量:
int = c(1, 1, 0, 5, 2, 0, 0, 2)
我想获取指定值的最接近的后续索引(不是差异)。函数的第一个参数应该是向量,而第二个参数应该是想要看到最接近的后续元素的值。
例如,
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
这里向量(1)的第一个元素距离第一个后面的0有两个位置,(3 - 1 = 2),所以应该return2。那么第二个元素就是1位置远离 0 (2 - 1 = 1)。当没有后续值匹配指定值时,return NA(这里是最后一个元素的情况,因为没有后续值是0)。
其他示例:
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3)
# [1] NA NA NA NA NA NA NA NA
这也适用于字符向量:
char = c("A", "B", "C", "A", "A")
f(char, "A")
# [1] 0 2 1 0 0
此处 f
被定义为递归函数,它在 lookup
向量的较短尾部调用自身:
f <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f(lookup[-1], val))
} else {
ind
}
}
寻找从第n个位置到向量末尾的匹配项,然后得到第一个匹配项:
f <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3)
# [1] NA NA NA NA NA NA NA NA
f(char, "A")
# [1] 0 2 1 0 0
找到每个值(数字或字符)的位置
int = c(1, 1, 0, 5, 2, 0, 0, 2)
value = 0
idx = which(int == value)
## [1] 3 6 7
展开索引以指示最接近的感兴趣值,在 int
中的最后一个值之后使用 NA。
nearest = rep(NA, length(int))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx))),
## [1] 3 3 3 6 6 6 7 NA
使用简单的算法找出当前值的索引与最近值的索引之间的差异
abs(seq_along(int) - nearest)
## [1] 2 1 0 2 1 0 0 NA
写成函数
f <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
if (length(idx)) # non-NA values only if `value` in `x`
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
我们有
> f(int, 0)
[1] 2 1 0 2 1 0 0 NA
> f(int, 1)
[1] 0 0 NA NA NA NA NA NA
> f(int, 2)
[1] 4 3 2 1 0 2 1 0
> f(char, "A")
[1] 0 2 1 0 0
> f(char, "B")
[1] 1 0 NA NA NA
> f(char, "C")
[1] 2 1 0 NA NA
该解决方案不涉及递归或 R-level 循环,因此即使对于长向量也应该很快。
这是一种使用 Reduce()
的方法,然后进行一些操作以获得 NA
值。
f <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
f(int, 0)
[1] 2 1 0 2 1 0 0 NA
f(int, 1)
[1] 0 0 NA NA NA NA NA NA
f(int, 2)
[1] 4 3 2 1 0 2 1 0
f(int, 3)
[1] NA NA NA NA NA NA NA NA
char = c("A", "B", "C", "A", "A")
f(char, "A")
[1] 0 2 1 0 0
另一种可能的解决方案,基于purrr::map2_dbl
:
library(purrr)
int = c(1, 1, 0, 5, 2, 0, 0, 2)
f <- function(int, num)
{
n <- length(int)
map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA,
min(which(.x == int[.y:n])) - 1))
}
f(int, 0)
#> [1] 2 1 0 2 1 0 0 NA
f(int, 1)
#> [1] 0 0 NA NA NA NA NA NA
f(int, 2)
#> [1] 4 3 2 1 0 2 1 0
f(int, 3)
#> [1] NA NA NA NA NA NA NA NA
char = c("A", "B", "C", "A", "A")
f(char, "A")
#> [1] 0 2 1 0 0
使用sequence
:
f <- function(v, x){
d = diff(c(0, which(v == x)))
vec <- sequence(d, d-1, by = -1)
length(vec) <- length(int)
vec
}
输出
int = c(1, 1, 0, 5, 2, 0, 0, 2)
char = c("A", "B", "C", "A", "A")
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(char, "A")
# [1] 0 2 1 0 0
基准 (n = 1000):
set.seed(123)
int = sample(0:100, size = 1000, replace = T)
library(microbenchmark)
bm <- microbenchmark(
fSequence(int, 0),
fzx8754(int, 0),
fRecursive(int, 0),
fMartinMorgan(int, 0),
fMap2dbl(int, 0),
fReduce(int, 0),
fAve(int, 0),
fjblood94(int, 0),
times = 10L,
setup = gc(FALSE)
)
autoplot(bm)
Martin Morgan 的解决方案似乎是最快的,其次是此答案的 sequence
解决方案、sbarbit 的递归解决方案和 jblood94 的 for 循环解决方案。
使用的函数:
fSequence <- function(v, x){
vec <- sequence(diff(c(0, which(v == x))), diff(c(0, which(v == x))) - 1, by = -1)
length(vec) <- length(v)
vec
}
fzx8754 <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
fRecursive <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f(lookup[-1], val))
} else {
ind
}
}
fMartinMorgan <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
fMap2dbl <- function(int, num)
{
n <- length(int)
map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA,
min(which(.x == int[.y:n])) - 1))
}
fReduce <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
fAve <- function(init, k) {
ave(
seq_along(init),
c(0, head(cumsum(init == k), -1)),
FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
)
}
fjblood94 <- function(v, val) {
out <- integer(length(v))
if (v[length(v)] != val) out[length(v)] <- NA_integer_
for (i in (length(v) - 1L):1) {
if (v[i] == val) {
out[i] <- 0L
} else {
out[i] <- out[i + 1L] + 1L
}
}
return(out)
}
一个single-passfor
循环简单高效:
f1 <- function(v, val) {
out <- integer(length(v))
if (v[length(v)] != val) out[length(v)] <- NA_integer_
for (i in (length(v) - 1L):1) {
if (v[i] == val) {
out[i] <- 0L
} else {
out[i] <- out[i + 1L] + 1L
}
}
return(out)
}
int <- c(1, 1, 0, 5, 2, 0, 0, 2)
chr <- c("A", "B", "C", "A", "A")
f1(int, 0)
#> [1] 2 1 0 2 1 0 0 NA
f1(chr, "A")
#> [1] 0 2 1 0 0
与其他解决方案的基准对比:
f2 <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
f3 <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f3(lookup[-1], val))
} else {
ind
}
}
f4 <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
f5 <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
microbenchmark::microbenchmark(f1 = {f1(int, 0); f1(chr, "A")},
f2 = {f2(int, 0); f2(chr, "A")},
f3 = {f3(int, 0); f3(chr, "A")},
f4 = {f4(int, 0); f4(chr, "A")},
f5 = {f5(int, 0); f5(chr, "A")},
check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 6.0 7.50 8.990 8.40 9.60 18.3 100
#> f2 54.2 61.45 71.752 65.55 79.40 131.8 100
#> f3 25.5 28.60 33.393 30.75 35.90 105.2 100
#> f4 22.3 26.30 30.599 28.00 32.65 82.4 100
#> f5 59.7 64.55 73.474 69.10 75.70 157.0 100
使用 ave
+ cumsum
的基础 R 选项
f <- function(init, k) {
ave(
seq_along(init),
c(0, head(cumsum(init == k), -1)),
FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
)
}
你会看到
> f(init, 0)
[1] 2 1 0 2 1 0 0 NA
> f(init, 1)
[1] 0 0 NA NA NA NA NA NA
> f(init, 2)
[1] 4 3 2 1 0 2 1 0
> f(init, 3)
[1] NA NA NA NA NA NA NA NA
考虑一个向量:
int = c(1, 1, 0, 5, 2, 0, 0, 2)
我想获取指定值的最接近的后续索引(不是差异)。函数的第一个参数应该是向量,而第二个参数应该是想要看到最接近的后续元素的值。
例如,
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
这里向量(1)的第一个元素距离第一个后面的0有两个位置,(3 - 1 = 2),所以应该return2。那么第二个元素就是1位置远离 0 (2 - 1 = 1)。当没有后续值匹配指定值时,return NA(这里是最后一个元素的情况,因为没有后续值是0)。
其他示例:
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3)
# [1] NA NA NA NA NA NA NA NA
这也适用于字符向量:
char = c("A", "B", "C", "A", "A")
f(char, "A")
# [1] 0 2 1 0 0
此处 f
被定义为递归函数,它在 lookup
向量的较短尾部调用自身:
f <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f(lookup[-1], val))
} else {
ind
}
}
寻找从第n个位置到向量末尾的匹配项,然后得到第一个匹配项:
f <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3)
# [1] NA NA NA NA NA NA NA NA
f(char, "A")
# [1] 0 2 1 0 0
找到每个值(数字或字符)的位置
int = c(1, 1, 0, 5, 2, 0, 0, 2)
value = 0
idx = which(int == value)
## [1] 3 6 7
展开索引以指示最接近的感兴趣值,在 int
中的最后一个值之后使用 NA。
nearest = rep(NA, length(int))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx))),
## [1] 3 3 3 6 6 6 7 NA
使用简单的算法找出当前值的索引与最近值的索引之间的差异
abs(seq_along(int) - nearest)
## [1] 2 1 0 2 1 0 0 NA
写成函数
f <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
if (length(idx)) # non-NA values only if `value` in `x`
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
我们有
> f(int, 0)
[1] 2 1 0 2 1 0 0 NA
> f(int, 1)
[1] 0 0 NA NA NA NA NA NA
> f(int, 2)
[1] 4 3 2 1 0 2 1 0
> f(char, "A")
[1] 0 2 1 0 0
> f(char, "B")
[1] 1 0 NA NA NA
> f(char, "C")
[1] 2 1 0 NA NA
该解决方案不涉及递归或 R-level 循环,因此即使对于长向量也应该很快。
这是一种使用 Reduce()
的方法,然后进行一些操作以获得 NA
值。
f <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
f(int, 0)
[1] 2 1 0 2 1 0 0 NA
f(int, 1)
[1] 0 0 NA NA NA NA NA NA
f(int, 2)
[1] 4 3 2 1 0 2 1 0
f(int, 3)
[1] NA NA NA NA NA NA NA NA
char = c("A", "B", "C", "A", "A")
f(char, "A")
[1] 0 2 1 0 0
另一种可能的解决方案,基于purrr::map2_dbl
:
library(purrr)
int = c(1, 1, 0, 5, 2, 0, 0, 2)
f <- function(int, num)
{
n <- length(int)
map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA,
min(which(.x == int[.y:n])) - 1))
}
f(int, 0)
#> [1] 2 1 0 2 1 0 0 NA
f(int, 1)
#> [1] 0 0 NA NA NA NA NA NA
f(int, 2)
#> [1] 4 3 2 1 0 2 1 0
f(int, 3)
#> [1] NA NA NA NA NA NA NA NA
char = c("A", "B", "C", "A", "A")
f(char, "A")
#> [1] 0 2 1 0 0
使用sequence
:
f <- function(v, x){
d = diff(c(0, which(v == x)))
vec <- sequence(d, d-1, by = -1)
length(vec) <- length(int)
vec
}
输出
int = c(1, 1, 0, 5, 2, 0, 0, 2)
char = c("A", "B", "C", "A", "A")
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(char, "A")
# [1] 0 2 1 0 0
基准 (n = 1000):
set.seed(123)
int = sample(0:100, size = 1000, replace = T)
library(microbenchmark)
bm <- microbenchmark(
fSequence(int, 0),
fzx8754(int, 0),
fRecursive(int, 0),
fMartinMorgan(int, 0),
fMap2dbl(int, 0),
fReduce(int, 0),
fAve(int, 0),
fjblood94(int, 0),
times = 10L,
setup = gc(FALSE)
)
autoplot(bm)
Martin Morgan 的解决方案似乎是最快的,其次是此答案的 sequence
解决方案、sbarbit 的递归解决方案和 jblood94 的 for 循环解决方案。
使用的函数:
fSequence <- function(v, x){
vec <- sequence(diff(c(0, which(v == x))), diff(c(0, which(v == x))) - 1, by = -1)
length(vec) <- length(v)
vec
}
fzx8754 <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
fRecursive <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f(lookup[-1], val))
} else {
ind
}
}
fMartinMorgan <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
fMap2dbl <- function(int, num)
{
n <- length(int)
map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA,
min(which(.x == int[.y:n])) - 1))
}
fReduce <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
fAve <- function(init, k) {
ave(
seq_along(init),
c(0, head(cumsum(init == k), -1)),
FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
)
}
fjblood94 <- function(v, val) {
out <- integer(length(v))
if (v[length(v)] != val) out[length(v)] <- NA_integer_
for (i in (length(v) - 1L):1) {
if (v[i] == val) {
out[i] <- 0L
} else {
out[i] <- out[i + 1L] + 1L
}
}
return(out)
}
一个single-passfor
循环简单高效:
f1 <- function(v, val) {
out <- integer(length(v))
if (v[length(v)] != val) out[length(v)] <- NA_integer_
for (i in (length(v) - 1L):1) {
if (v[i] == val) {
out[i] <- 0L
} else {
out[i] <- out[i + 1L] + 1L
}
}
return(out)
}
int <- c(1, 1, 0, 5, 2, 0, 0, 2)
chr <- c("A", "B", "C", "A", "A")
f1(int, 0)
#> [1] 2 1 0 2 1 0 0 NA
f1(chr, "A")
#> [1] 0 2 1 0 0
与其他解决方案的基准对比:
f2 <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
f3 <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f3(lookup[-1], val))
} else {
ind
}
}
f4 <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
f5 <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
microbenchmark::microbenchmark(f1 = {f1(int, 0); f1(chr, "A")},
f2 = {f2(int, 0); f2(chr, "A")},
f3 = {f3(int, 0); f3(chr, "A")},
f4 = {f4(int, 0); f4(chr, "A")},
f5 = {f5(int, 0); f5(chr, "A")},
check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 6.0 7.50 8.990 8.40 9.60 18.3 100
#> f2 54.2 61.45 71.752 65.55 79.40 131.8 100
#> f3 25.5 28.60 33.393 30.75 35.90 105.2 100
#> f4 22.3 26.30 30.599 28.00 32.65 82.4 100
#> f5 59.7 64.55 73.474 69.10 75.70 157.0 100
使用 ave
+ cumsum
f <- function(init, k) {
ave(
seq_along(init),
c(0, head(cumsum(init == k), -1)),
FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
)
}
你会看到
> f(init, 0)
[1] 2 1 0 2 1 0 0 NA
> f(init, 1)
[1] 0 0 NA NA NA NA NA NA
> f(init, 2)
[1] 4 3 2 1 0 2 1 0
> f(init, 3)
[1] NA NA NA NA NA NA NA NA