R计算连续列中模式匹配的数量

R count number of pattern matches in consecutive columns

我有一个由“0”和“1”组成的数据框,如下所示:

# Example input: 10 rows by 5 columns of 0/1 flags (stored as doubles).
DATA <- data.frame(
  V1 = c(0, 0, 0, 0, 1, 1, 0, 1, 1, 1),
  V2 = c(1, 0, 0, 0, 1, 1, 0, 1, 1, 1),
  V3 = c(0, 0, 0, 0, 1, 0, 0, 1, 1, 1),
  V4 = c(1, 1, 1, 0, 1, 1, 0, 1, 1, 1),
  V5 = c(0, 0, 0, 0, 1, 1, 0, 1, 1, 1)
)

我想知道在每一行中有多少次“0”后跟下一列中的“1”。如果第一列值是“1”,也应该计算在内。

我有一个循环,它将每一行绑定到一个向量中,然后使用 stringi::stri_count_fixed 或 stringr::str_count:

计算 '01' 的数量
  # Positions of the columns that hold the 0/1 values. They are resolved once,
  # before Count is written: a per-row loop that re-reads ncol(DATA) would
  # paste the freshly created Count column onto the end of every later row,
  # and a row ending in 0 followed by a count starting with 1 would then
  # produce a spurious "01" match.
  value_cols <- which(names(DATA) != "Count")

  # Paste all rows at once (paste0 is vectorised over the columns), with an
  # extra 0 at the start in case the first column value is 1. unname() keeps
  # column names from being matched to paste0() arguments.
  STRING <- do.call(paste0, c("0", unname(as.list(DATA[value_cols]))))

  # Count the number of 0-1 transitions in every row and store them in the
  # summary column (str_count is vectorised over its input strings).
  DATA$Count <- stringr::str_count(STRING, pattern = "01")

但是,对于我的真实数据集(3000 - 4000 列),这两个都非常慢。有什么加快速度的想法吗?

期望的输出:

> DATA$Count
[1] 2 1 1 0 1 2 0 1 1 1

使用dplyr:

# Row by row: put a 0 in front of the row's values and count the steps of
# exactly +1 between neighbours, i.e. the 0 -> 1 transitions.
DATA |>
  rowwise() |>
  mutate(
    count = sum(diff(c(0, c_across(everything()))) == 1)
  )

     V1    V2    V3    V4    V5 count
   <dbl> <dbl> <dbl> <dbl> <dbl> <int>
 1     0     1     0     1     0     2
 2     0     0     0     1     0     1
 3     0     0     0     1     0     1
 4     0     0     0     0     0     0
 5     1     1     1     1     1     1
 6     1     1     0     1     1     2
 7     0     0     0     0     0     0
 8     1     1     1     1     1     1
 9     1     1     1     1     1     1
10     1     1     1     1     1     1
library(data.table)

# transpose() turns every original row into one list element; for each of
# them count the positions that hold a 1 while the previous position
# (filled with 0 before the first column) holds a 0.
DATA$Count <- unlist(
  lapply(
    transpose(DATA),
    function(row) sum(row == 1L & shift(row, fill = 0L) == 0L)
  ),
  use.names = FALSE
)

# > DATA
#    V1 V2 V3 V4 V5 Count
# 1   0  1  0  1  0     2
# 2   0  0  0  1  0     1
# 3   0  0  0  1  0     1
# 4   0  0  0  0  0     0
# 5   1  1  1  1  1     1
# 6   1  1  0  1  1     2
# 7   0  0  0  0  0     0
# 8   1  1  1  1  1     1
# 9   1  1  1  1  1     1
# 10  1  1  1  1  1     1

基准:

# Benchmark input: 4000 columns, each holding 100 random 0/1 integers.
df <- data.table::setDF(
  lapply(
    seq_len(4000L),
    function(i) sample(0L:1L, size = 100L, replace = TRUE)
  )
)


# Time the three candidate solutions on the benchmark data frame `df`.
bench::mark(
  # data.table: transpose, then compare each value with its shifted neighbour.
  sindri = {
    lapply(transpose(df), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |> 
      unlist(use.names = FALSE)
  },
  # dplyr: row-wise diff() over all columns.
  tmfmnk = {
    df %>%
      rowwise() %>%
      mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
  },
  # base R: column-shifted data frame subtraction.
  # NOTE(review): this subtracts the right-hand column from the left-hand one,
  # so it counts 1 -> 0 steps (plus all-ones rows) rather than 0 -> 1 steps;
  # its result can differ from the other two for rows that end in 1.
  Yuriy = {
    rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1) + (rowSums(df) == ncol(df))
  },
  # A single run per expression.
  iterations = 1L,
  # Results are not compared with each other (they differ in type and,
  # see the note above, possibly in value).
  check = FALSE,
  # Report timings relative to the fastest expression.
  relative = TRUE
)



  expression   min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                  time           gc              
  <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                  <list>         <list>          
1 sindri       1      1      113.        1         NaN     1     0    12.77ms <NULL> <Rprofmem [559 x 3]>    <bench_tm [1]> <tibble [1 x 3]>
2 tmfmnk     113.   113.       1        31.1       Inf     1     1      1.45s <NULL> <Rprofmem [30,205 x 3]> <bench_tm [1]> <tibble [1 x 3]>
3 Yuriy       35.8   35.8      3.17      1.37      NaN     1     0   456.42ms <NULL> <Rprofmem [8,260 x 3]>  <bench_tm [1]> <tibble [1 x 3]>

基础 R(base R):

# Example input: 10 rows by 5 columns of 0/1 flags.
df <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
                 "V2" = c(1,0,0,0,1,1,0,1,1,1),
                 "V3" = c(0,0,0,0,1,0,0,1,1,1),
                 "V4" = c(1,1,1,0,1,1,0,1,1,1),
                 "V5" = c(0,0,0,0,1,1,0,1,1,1))

# Per row: number of 0 -> 1 steps between neighbouring columns, plus one if
# the first column is already 1.
# The difference is taken as (right column - left column) so that a value of
# 1 marks a 0 -> 1 step. Subtracting the other way round counts 1 -> 0 steps
# and under-counts rows that end in 1 (e.g. row 6: 1 1 0 1 1).
df$Count <-
  rowSums((df[2:ncol(df)] - df[1:(ncol(df) - 1)]) == 1) +
  (df[[1]] == 1)


df
#>    V1 V2 V3 V4 V5 Count
#> 1   0  1  0  1  0     2
#> 2   0  0  0  1  0     1
#> 3   0  0  0  1  0     1
#> 4   0  0  0  0  0     0
#> 5   1  1  1  1  1     1
#> 6   1  1  0  1  1     2
#> 7   0  0  0  0  0     0
#> 8   1  1  1  1  1     1
#> 9   1  1  1  1  1     1
#> 10  1  1  1  1  1     1

reprex package (v2.0.1)

于 2022-05-30 创建

一个可能的 base R 解决方案:

# For every row: the first value (1 if the row starts with a 1) plus the
# number of positive steps between neighbouring columns (0 -> 1).
DATA$Count <- apply(DATA, 1, function(row) {
  rises <- diff(row) > 0
  row[1] + sum(rises)
})
DATA

#>    V1 V2 V3 V4 V5 Count
#> 1   0  1  0  1  0     2
#> 2   0  0  0  1  0     1
#> 3   0  0  0  1  0     1
#> 4   0  0  0  0  0     0
#> 5   1  1  1  1  1     1
#> 6   1  1  0  1  1     2
#> 7   0  0  0  0  0     0
#> 8   1  1  1  1  1     1
#> 9   1  1  1  1  1     1
#> 10  1  1  1  1  1     1