指定值的最接近后续索引

Closest subsequent index for a specified value

考虑一个向量:

int = c(1, 1, 0, 5, 2, 0, 0, 2)

我想获取指定值的最接近的后续索引(不是差异)。函数的第一个参数应该是向量,而第二个参数应该是想要看到最接近的后续元素的值。

例如,

f(int, 0)
# [1] 2 1 0 2 1 0 0 NA

这里向量(1)的第一个元素距离第一个后面的0有两个位置,(3 - 1 = 2),所以应该return2。那么第二个元素就是1位置远离 0 (2 - 1 = 1)。当没有后续值匹配指定值时,return NA(这里是最后一个元素的情况,因为没有后续值是0)。

其他示例:

f(int, 1)
# [1] 0 0 NA NA NA NA NA NA

f(int, 2) 
# [1] 4 3 2 1 0 2 1 0

f(int, 3) 
# [1] NA NA NA NA NA NA NA NA

这也适用于字符向量:

char = c("A", "B", "C", "A", "A")

f(char, "A") 
# [1] 0 2 1 0 0

此处 f 被定义为递归函数,它在 lookup 向量的较短尾部调用自身:

f <- function(lookup,val ) {
  ind <- which(lookup == val)[1] -1
  if (length(lookup) > 1) {
    c(ind, f(lookup[-1], val))
  } else {
    ind
  }
}

寻找从第n个位置到向量末尾的匹配项,然后得到第一个匹配项:

f <- function(v, x){
  sapply(seq_along(v), function(i){
    which(v[ i:length(v) ] == x)[ 1 ] - 1
  })
}

f(int, 0)
# [1]  2  1  0  2  1  0  0 NA
f(int, 1)
# [1]  0  0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3) 
# [1] NA NA NA NA NA NA NA NA

f(char, "A") 
# [1] 0 2 1 0 0

找到每个值(数字或字符)的位置

int = c(1, 1, 0, 5, 2, 0, 0, 2)
value = 0
idx = which(int == value)
## [1] 3 6 7

展开索引以指示最接近的感兴趣值,在 int 中的最后一个值之后使用 NA。

nearest = rep(NA, length(int))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx))),
## [1]  3  3  3  6  6  6  7 NA

使用简单的算法找出当前值的索引与最近值的索引之间的差异

abs(seq_along(int) - nearest)
## [1]  2  1  0  2  1  0  0 NA

写成函数

f <- function(x, value) {
    idx = which(x == value)
    nearest = rep(NA, length(x))
    if (length(idx)) # non-NA values only if `value` in `x`
        nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
    abs(seq_along(x) - nearest)
}

我们有

> f(int, 0)
[1]  2  1  0  2  1  0  0 NA
> f(int, 1)
[1]  0  0 NA NA NA NA NA NA
> f(int, 2)
[1] 4 3 2 1 0 2 1 0
> f(char, "A")
[1] 0 2 1 0 0
> f(char, "B")
[1]  1  0 NA NA NA
> f(char, "C")
[1]  2  1  0 NA NA

该解决方案不涉及递归或 R-level 循环,因此即使对于长向量也应该很快。

这是一种使用 Reduce() 的方法,然后进行一些操作以获得 NA 值。

f <- function(vec, value) {
replace(
  Reduce(
    function(x, y)
      x  + (y * x) ,
    vec != value,
    right = TRUE,
    accumulate = TRUE
  ),
  max(tail(which(vec == value), 1), 0) < seq_along(vec),
  NA
)
}

f(int, 0)          
[1]  2  1  0  2  1  0  0 NA

f(int, 1)          
[1]  0  0 NA NA NA NA NA NA

f(int, 2) 
[1] 4 3 2 1 0 2 1 0

f(int, 3) 
[1] NA NA NA NA NA NA NA NA

char = c("A", "B", "C", "A", "A")

f(char, "A") 
[1] 0 2 1 0 0

另一种可能的解决方案,基于purrr::map2_dbl

library(purrr)

int = c(1, 1, 0, 5, 2, 0, 0, 2)

f <- function(int, num)
{
  n <- length(int)
  
  map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA, 
      min(which(.x == int[.y:n])) - 1))
}

f(int, 0)
#> [1]  2  1  0  2  1  0  0 NA

f(int, 1)          
#> [1]  0  0 NA NA NA NA NA NA

f(int, 2) 
#> [1] 4 3 2 1 0 2 1 0

f(int, 3) 
#> [1] NA NA NA NA NA NA NA NA

char = c("A", "B", "C", "A", "A")

f(char, "A") 
#> [1] 0 2 1 0 0

使用sequence:

f <- function(v, x){
  d = diff(c(0, which(v == x)))
  vec <- sequence(d, d-1, by = -1)
  length(vec) <- length(int)
  vec
}

输出

int = c(1, 1, 0, 5, 2, 0, 0, 2)
char = c("A", "B", "C", "A", "A")

f(int, 0)
# [1]  2  1  0  2  1  0  0 NA

f(int, 1)
# [1]  0  0 NA NA NA NA NA NA

f(int, 2)
# [1] 4 3 2 1 0 2 1 0

f(char, "A")
# [1] 0 2 1 0 0

基准 (n = 1000):

set.seed(123)
int = sample(0:100, size = 1000, replace = T)

library(microbenchmark)
bm <- microbenchmark(
  fSequence(int, 0),
  fzx8754(int, 0),
  fRecursive(int, 0), 
  fMartinMorgan(int, 0), 
  fMap2dbl(int, 0),
  fReduce(int, 0),
  fAve(int, 0),
  fjblood94(int, 0),
  times = 10L,
  setup = gc(FALSE)
)
autoplot(bm)

Martin Morgan 的解决方案似乎是最快的,其次是此答案的 sequence 解决方案、sbarbit 的递归解决方案和 jblood94 的 for 循环解决方案。

使用的函数:

fSequence <- function(v, x){
  vec <- sequence(diff(c(0, which(v == x))), diff(c(0, which(v == x))) - 1, by = -1)
  length(vec) <- length(v)
  vec
}

fzx8754 <- function(v, x){
  sapply(seq_along(v), function(i){
    which(v[ i:length(v) ] == x)[ 1 ] - 1
  })
}

fRecursive <- function(lookup,val ) {
  ind <- which(lookup == val)[1] -1
  if (length(lookup) > 1) {
    c(ind, f(lookup[-1], val))
  } else {
    ind
  }
}

fMartinMorgan <- function(x, value) {
  idx = which(x == value)
  nearest = rep(NA, length(x))
  nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
  abs(seq_along(x) - nearest)
}

fMap2dbl <- function(int, num)
{
  n <- length(int)
  
  map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA, 
                              min(which(.x == int[.y:n])) - 1))
}

fReduce <- function(vec, value) {
  replace(
    Reduce(
      function(x, y)
        x  + (y * x) ,
      vec != value,
      right = TRUE,
      accumulate = TRUE
    ),
    max(tail(which(vec == value), 1), 0) < seq_along(vec),
    NA
  )
}

fAve <- function(init, k) {
  ave(
    seq_along(init),
    c(0, head(cumsum(init == k), -1)),
    FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
  )
}

fjblood94 <- function(v, val) {
  out <- integer(length(v))
  if (v[length(v)] != val) out[length(v)] <- NA_integer_
  
  for (i in (length(v) - 1L):1) {
    if (v[i] == val) {
      out[i] <- 0L
    } else {
      out[i] <- out[i + 1L] + 1L
    }
  }
  
  return(out)
}

一个single-passfor循环简单高效:

f1 <- function(v, val) {
  out <- integer(length(v))
  if (v[length(v)] != val) out[length(v)] <- NA_integer_
  
  for (i in (length(v) - 1L):1) {
    if (v[i] == val) {
      out[i] <- 0L
    } else {
      out[i] <- out[i + 1L] + 1L
    }
  }
  
  return(out)
}

int <- c(1, 1, 0, 5, 2, 0, 0, 2)
chr <- c("A", "B", "C", "A", "A")
f1(int, 0)
#> [1]  2  1  0  2  1  0  0 NA
f1(chr, "A")
#> [1] 0 2 1 0 0

与其他解决方案的基准对比:

f2 <- function(v, x){
  sapply(seq_along(v), function(i){
    which(v[ i:length(v) ] == x)[ 1 ] - 1
  })
}

f3 <- function(lookup,val ) {
  ind <- which(lookup == val)[1] -1
  if (length(lookup) > 1) {
    c(ind, f3(lookup[-1], val))
  } else {
    ind
  }
}

f4 <- function(x, value) {
  idx = which(x == value)
  nearest = rep(NA, length(x))
  nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
  abs(seq_along(x) - nearest)
}

f5 <- function(vec, value) {
  replace(
    Reduce(
      function(x, y)
        x  + (y * x) ,
      vec != value,
      right = TRUE,
      accumulate = TRUE
    ),
    max(tail(which(vec == value), 1), 0) < seq_along(vec),
    NA
  )
}

microbenchmark::microbenchmark(f1 = {f1(int, 0); f1(chr, "A")},
                               f2 = {f2(int, 0); f2(chr, "A")},
                               f3 = {f3(int, 0); f3(chr, "A")},
                               f4 = {f4(int, 0); f4(chr, "A")},
                               f5 = {f5(int, 0); f5(chr, "A")},
                               check = "equal")
#> Unit: microseconds
#>  expr  min    lq   mean median    uq   max neval
#>    f1  6.0  7.50  8.990   8.40  9.60  18.3   100
#>    f2 54.2 61.45 71.752  65.55 79.40 131.8   100
#>    f3 25.5 28.60 33.393  30.75 35.90 105.2   100
#>    f4 22.3 26.30 30.599  28.00 32.65  82.4   100
#>    f5 59.7 64.55 73.474  69.10 75.70 157.0   100

使用 ave + cumsum

的基础 R 选项
f <- function(init, k) {
  ave(
    seq_along(init),
    c(0, head(cumsum(init == k), -1)),
    FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
  )
}

你会看到

> f(init, 0)
[1]  2  1  0  2  1  0  0 NA

> f(init, 1)
[1]  0  0 NA NA NA NA NA NA

> f(init, 2)
[1] 4 3 2 1 0 2 1 0

> f(init, 3)
[1] NA NA NA NA NA NA NA NA