R计算连续列中模式匹配的数量
R count number of pattern matches in consecutive columns
我有一个由“0”和“1”组成的数据框,如下所示:
DATA <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
"V2" = c(1,0,0,0,1,1,0,1,1,1),
"V3" = c(0,0,0,0,1,0,0,1,1,1),
"V4" = c(1,1,1,0,1,1,0,1,1,1),
"V5" = c(0,0,0,0,1,1,0,1,1,1))
我想知道在每一行中有多少次“0”后跟下一列中的“1”。如果第一列值是“1”,也应该计算在内。
我有一个循环,它将每一行绑定到一个向量中,然后使用 stringi::stri_count_fixed
或 stringr::str_count
:
计算 '01' 的数量
for(n in 1:nrow(DATA)) {
# Paste row into a single character vector, with extra 0 at start in case
# the first column value is 1.
STRING <- do.call(paste0, c(0, DATA[n, 1:ncol(DATA)]))
# Count number of 0-1 transitions.
COUNT <- stringr::str_count(STRING, pattern = "01")
# Add this to the summary column.
DATA$Count[n] <- COUNT
}
但是,对于我的真实数据集(3000 - 4000 列),这两个都非常慢。有什么加快速度的想法吗?
期望的输出:
> DATA$Count
[1] 2 1 1 0 1 2 0 1 1 1
使用dplyr
:
DATA %>%
rowwise() %>%
mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
V1 V2 V3 V4 V5 count
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 0 1 0 1 0 2
2 0 0 0 1 0 1
3 0 0 0 1 0 1
4 0 0 0 0 0 0
5 1 1 1 1 1 1
6 1 1 0 1 1 2
7 0 0 0 0 0 0
8 1 1 1 1 1 1
9 1 1 1 1 1 1
10 1 1 1 1 1 1
library(data.table)
DATA$Count =
lapply(transpose(DATA), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |>
unlist(use.names = FALSE)
# > DATA
# V1 V2 V3 V4 V5 Count
# 1 0 1 0 1 0 2
# 2 0 0 0 1 0 1
# 3 0 0 0 1 0 1
# 4 0 0 0 0 0 0
# 5 1 1 1 1 1 1
# 6 1 1 0 1 1 2
# 7 0 0 0 0 0 0
# 8 1 1 1 1 1 1
# 9 1 1 1 1 1 1
# 10 1 1 1 1 1 1
基准:
df = data.table::setDF(lapply(seq_len(4000L), \(x) sample(0L:1L, size = 100L, replace=TRUE)))
bench::mark(
sindri = {
lapply(transpose(df), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |>
unlist(use.names = FALSE)
},
tmfmnk = {
df %>%
rowwise() %>%
mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
},
Yuriy = {
rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1) + (rowSums(df) == ncol(df))
},
iterations = 1L,
check = FALSE,
relative = TRUE
)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 sindri 1 1 113. 1 NaN 1 0 12.77ms <NULL> <Rprofmem [559 x 3]> <bench_tm [1]> <tibble [1 x 3]>
2 tmfmnk 113. 113. 1 31.1 Inf 1 1 1.45s <NULL> <Rprofmem [30,205 x 3]> <bench_tm [1]> <tibble [1 x 3]>
3 Yuriy 35.8 35.8 3.17 1.37 NaN 1 0 456.42ms <NULL> <Rprofmem [8,260 x 3]> <bench_tm [1]> <tibble [1 x 3]>
基础
df <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
"V2" = c(1,0,0,0,1,1,0,1,1,1),
"V3" = c(0,0,0,0,1,0,0,1,1,1),
"V4" = c(1,1,1,0,1,1,0,1,1,1),
"V5" = c(0,0,0,0,1,1,0,1,1,1))
df$Count <-
rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1) +
(rowSums(df) == ncol(df))
df
#> V1 V2 V3 V4 V5 Count
#> 1 0 1 0 1 0 2
#> 2 0 0 0 1 0 1
#> 3 0 0 0 1 0 1
#> 4 0 0 0 0 0 0
#> 5 1 1 1 1 1 1
#> 6 1 1 0 1 1 1
#> 7 0 0 0 0 0 0
#> 8 1 1 1 1 1 1
#> 9 1 1 1 1 1 1
#> 10 1 1 1 1 1 1
由 reprex package (v2.0.1)
于 2022-05-30 创建
一个可能的解决方案,在base R
:
DATA$Count <-
apply(DATA, 1, \(x) x[1] + sum((x[2:length(x)] - x[1:(length(x)-1)]) > 0))
DATA
#> V1 V2 V3 V4 V5 Count
#> 1 0 1 0 1 0 2
#> 2 0 0 0 1 0 1
#> 3 0 0 0 1 0 1
#> 4 0 0 0 0 0 0
#> 5 1 1 1 1 1 1
#> 6 1 1 0 1 1 2
#> 7 0 0 0 0 0 0
#> 8 1 1 1 1 1 1
#> 9 1 1 1 1 1 1
#> 10 1 1 1 1 1 1
我有一个由“0”和“1”组成的数据框,如下所示:
DATA <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
"V2" = c(1,0,0,0,1,1,0,1,1,1),
"V3" = c(0,0,0,0,1,0,0,1,1,1),
"V4" = c(1,1,1,0,1,1,0,1,1,1),
"V5" = c(0,0,0,0,1,1,0,1,1,1))
我想知道在每一行中有多少次“0”后跟下一列中的“1”。如果第一列值是“1”,也应该计算在内。
我有一个循环,它将每一行绑定到一个向量中,然后使用 stringi::stri_count_fixed
或 stringr::str_count
:
for(n in 1:nrow(DATA)) {
# Paste row into a single character vector, with extra 0 at start in case
# the first column value is 1.
STRING <- do.call(paste0, c(0, DATA[n, 1:ncol(DATA)]))
# Count number of 0-1 transitions.
COUNT <- stringr::str_count(STRING, pattern = "01")
# Add this to the summary column.
DATA$Count[n] <- COUNT
}
但是,对于我的真实数据集(3000 - 4000 列),这两个都非常慢。有什么加快速度的想法吗?
期望的输出:
> DATA$Count
[1] 2 1 1 0 1 2 0 1 1 1
使用dplyr
:
DATA %>%
rowwise() %>%
mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
V1 V2 V3 V4 V5 count
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 0 1 0 1 0 2
2 0 0 0 1 0 1
3 0 0 0 1 0 1
4 0 0 0 0 0 0
5 1 1 1 1 1 1
6 1 1 0 1 1 2
7 0 0 0 0 0 0
8 1 1 1 1 1 1
9 1 1 1 1 1 1
10 1 1 1 1 1 1
library(data.table)
DATA$Count =
lapply(transpose(DATA), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |>
unlist(use.names = FALSE)
# > DATA
# V1 V2 V3 V4 V5 Count
# 1 0 1 0 1 0 2
# 2 0 0 0 1 0 1
# 3 0 0 0 1 0 1
# 4 0 0 0 0 0 0
# 5 1 1 1 1 1 1
# 6 1 1 0 1 1 2
# 7 0 0 0 0 0 0
# 8 1 1 1 1 1 1
# 9 1 1 1 1 1 1
# 10 1 1 1 1 1 1
基准:
df = data.table::setDF(lapply(seq_len(4000L), \(x) sample(0L:1L, size = 100L, replace=TRUE)))
bench::mark(
sindri = {
lapply(transpose(df), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |>
unlist(use.names = FALSE)
},
tmfmnk = {
df %>%
rowwise() %>%
mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
},
Yuriy = {
rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1) + (rowSums(df) == ncol(df))
},
iterations = 1L,
check = FALSE,
relative = TRUE
)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 sindri 1 1 113. 1 NaN 1 0 12.77ms <NULL> <Rprofmem [559 x 3]> <bench_tm [1]> <tibble [1 x 3]>
2 tmfmnk 113. 113. 1 31.1 Inf 1 1 1.45s <NULL> <Rprofmem [30,205 x 3]> <bench_tm [1]> <tibble [1 x 3]>
3 Yuriy 35.8 35.8 3.17 1.37 NaN 1 0 456.42ms <NULL> <Rprofmem [8,260 x 3]> <bench_tm [1]> <tibble [1 x 3]>
基础
df <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
"V2" = c(1,0,0,0,1,1,0,1,1,1),
"V3" = c(0,0,0,0,1,0,0,1,1,1),
"V4" = c(1,1,1,0,1,1,0,1,1,1),
"V5" = c(0,0,0,0,1,1,0,1,1,1))
df$Count <-
rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1) +
(rowSums(df) == ncol(df))
df
#> V1 V2 V3 V4 V5 Count
#> 1 0 1 0 1 0 2
#> 2 0 0 0 1 0 1
#> 3 0 0 0 1 0 1
#> 4 0 0 0 0 0 0
#> 5 1 1 1 1 1 1
#> 6 1 1 0 1 1 1
#> 7 0 0 0 0 0 0
#> 8 1 1 1 1 1 1
#> 9 1 1 1 1 1 1
#> 10 1 1 1 1 1 1
由 reprex package (v2.0.1)
于 2022-05-30 创建一个可能的解决方案,在base R
:
DATA$Count <-
apply(DATA, 1, \(x) x[1] + sum((x[2:length(x)] - x[1:(length(x)-1)]) > 0))
DATA
#> V1 V2 V3 V4 V5 Count
#> 1 0 1 0 1 0 2
#> 2 0 0 0 1 0 1
#> 3 0 0 0 1 0 1
#> 4 0 0 0 0 0 0
#> 5 1 1 1 1 1 1
#> 6 1 1 0 1 1 2
#> 7 0 0 0 0 0 0
#> 8 1 1 1 1 1 1
#> 9 1 1 1 1 1 1
#> 10 1 1 1 1 1 1