对于一组固定的模式,如何检测字符串中模式的存在?
How do I detect the presence of a pattern in a string, for a fixed set of patterns?
在变量选择的上下文中,我试图计算在自举迭代中选择变量的次数。下面提供了该问题的简单版本以及我的解决方案 (answer
)。但是当处理 50 或 100 个变量时,我的解决方案很快变得笨拙。
我有一组要计算的变量名 (pred
),所以我认为应该可以根据这些值创建新列,然后检测每个列的相关字符串。但是如果不手动设置列名并粘贴函数,我无法弄清楚如何。一定有更好的方法...
欢迎任何其他解决方案,包括 tidyverse 或 purrr...
library(dplyr)
df <- mtcars
n <- nrow(df)
pred <- colnames(df)[2:length(df)]
target <- "mpg"
mpg_formula <- paste(target, "~", paste(pred, collapse = "+"))
steplm <- data.frame()
bootnum <- 10
for (i in 1:bootnum) {
message("Fitting model ", i, " out of ", bootnum)
data.id <- sample(1:dim(df)[1], replace = T)
fit.lms <- step(lm(mpg_formula, data=df[data.id, ]),
direction = "backward",
trace = 0)
selected.vars <- paste(sort(names(coef(fit.lms)[-1])), collapse = ", ")
step.result <- data.frame("model" = selected.vars,
"nvar" = length(names(coef(fit.lms)[-1])))
steplm <- dplyr::bind_rows(steplm, step.result)
}
steplm %>%
transmute(
steplm %>%
transmute(
cyl = grepl(pattern = "cyl", x = model),
disp = grepl(pattern = "disp", x = model),
hp = grepl(pattern = "hp", x = model),
drat = grepl(pattern = "drat", x = model),
wt = grepl(pattern = "wt", x = model),
qsec = grepl(pattern = "qsec", x = model),
vs = grepl(pattern = "vs", x = model),
am = grepl(pattern = "am", x = model),
gear = grepl(pattern = "gear", x = model),
carb = grepl(pattern = "carb", x = model)
) -> answer
这会产生以下 data.frame(或矩阵),我可以从中对列求和以获得我想要的值(或进行矩阵运算以获得项之间的成对和联合依赖关系)。这只是为了指出下一步需要矩阵格式...
cyl disp hp drat wt qsec vs am gear carb
TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE
TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE
TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE
FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE
TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
您可以使用 sapply
:
sapply(pred, grepl, steplm$model)
# cyl disp hp drat wt qsec vs am gear carb
# [1,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE
# [2,] TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
# [3,] FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
# [4,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
# [5,] FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# [6,] TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
# [7,] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
# [8,] FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
# [9,] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
#[10,] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
sapply
returns 一个矩阵。如果需要数据帧,可以将 data.frame
包装到 sapply
输出。
identical(data.frame(sapply(pred, grepl, steplm$model)), answer)
#[1] TRUE
您可以考虑对 bootstrap 重采样采取不同的方法,more similar to this。
library(tidyverse)
library(rsample)
library(broom)
car_boot <- bootstraps(mtcars, times = 100)
car_boot
#> # Bootstrap sampling
#> # A tibble: 100 x 2
#> splits id
#> <list> <chr>
#> 1 <split [32/10]> Bootstrap001
#> 2 <split [32/12]> Bootstrap002
#> 3 <split [32/12]> Bootstrap003
#> 4 <split [32/12]> Bootstrap004
#> 5 <split [32/13]> Bootstrap005
#> 6 <split [32/9]> Bootstrap006
#> 7 <split [32/11]> Bootstrap007
#> 8 <split [32/10]> Bootstrap008
#> 9 <split [32/13]> Bootstrap009
#> 10 <split [32/14]> Bootstrap010
#> # … with 90 more rows
fit_mpg <- function(split) {
step(lm(mpg ~ ., data = analysis(split)), direction = "backward", trace = 0)
}
boot_models <- car_boot %>%
mutate(model = map(splits, fit_mpg),
coef_info = map(model, tidy))
boot_coefs <- boot_models %>%
unnest(coef_info)
boot_coefs
#> # A tibble: 645 x 8
#> splits id model term estimate std.error statistic p.value
#> <list> <chr> <lis> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 <split [32/… Bootstrap… <lm> (Interc… 6.16 7.09 0.868 3.93e-1
#> 2 <split [32/… Bootstrap… <lm> drat 1.89 1.44 1.31 2.01e-1
#> 3 <split [32/… Bootstrap… <lm> wt -1.92 0.738 -2.61 1.50e-2
#> 4 <split [32/… Bootstrap… <lm> qsec 0.754 0.307 2.45 2.12e-2
#> 5 <split [32/… Bootstrap… <lm> am 3.80 1.49 2.55 1.70e-2
#> 6 <split [32/… Bootstrap… <lm> carb -0.859 0.407 -2.11 4.47e-2
#> 7 <split [32/… Bootstrap… <lm> (Interc… 8.51 4.05 2.10 4.48e-2
#> 8 <split [32/… Bootstrap… <lm> disp 0.00752 0.00474 1.59 1.25e-1
#> 9 <split [32/… Bootstrap… <lm> wt -1.17 0.708 -1.66 1.09e-1
#> 10 <split [32/… Bootstrap… <lm> gear 5.37 0.757 7.10 1.25e-7
#> # … with 635 more rows
boot_coefs %>%
select(id, term) %>%
filter(term != "(Intercept)") %>%
mutate(value = TRUE) %>%
complete(id, term, fill = list(value = FALSE)) %>%
pivot_wider(names_from = term, values_from = value)
#> # A tibble: 100 x 11
#> id am carb cyl disp drat gear hp qsec vs wt
#> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 Bootstrap001 TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
#> 2 Bootstrap002 FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
#> 3 Bootstrap003 TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
#> 4 Bootstrap004 FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
#> 5 Bootstrap005 FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
#> 6 Bootstrap006 FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
#> 7 Bootstrap007 FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
#> 8 Bootstrap008 FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
#> 9 Bootstrap009 TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
#> 10 Bootstrap010 FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
#> # … with 90 more rows
由 reprex package (v2.0.0)
于 2021-04-07 创建
在变量选择的上下文中,我试图计算在自举迭代中选择变量的次数。下面提供了该问题的简单版本以及我的解决方案 (answer
)。但是当处理 50 或 100 个变量时,我的解决方案很快变得笨拙。
我有一组要计算的变量名 (pred
),所以我认为应该可以根据这些值创建新列,然后检测每个列的相关字符串。但是如果不手动设置列名并粘贴函数,我无法弄清楚如何。一定有更好的方法...
欢迎任何其他解决方案,包括 tidyverse 或 purrr...
library(dplyr)
df <- mtcars
n <- nrow(df)
pred <- colnames(df)[2:length(df)]
target <- "mpg"
mpg_formula <- paste(target, "~", paste(pred, collapse = "+"))
steplm <- data.frame()
bootnum <- 10
for (i in 1:bootnum) {
message("Fitting model ", i, " out of ", bootnum)
data.id <- sample(1:dim(df)[1], replace = T)
fit.lms <- step(lm(mpg_formula, data=df[data.id, ]),
direction = "backward",
trace = 0)
selected.vars <- paste(sort(names(coef(fit.lms)[-1])), collapse = ", ")
step.result <- data.frame("model" = selected.vars,
"nvar" = length(names(coef(fit.lms)[-1])))
steplm <- dplyr::bind_rows(steplm, step.result)
}
steplm %>%
transmute(
steplm %>%
transmute(
cyl = grepl(pattern = "cyl", x = model),
disp = grepl(pattern = "disp", x = model),
hp = grepl(pattern = "hp", x = model),
drat = grepl(pattern = "drat", x = model),
wt = grepl(pattern = "wt", x = model),
qsec = grepl(pattern = "qsec", x = model),
vs = grepl(pattern = "vs", x = model),
am = grepl(pattern = "am", x = model),
gear = grepl(pattern = "gear", x = model),
carb = grepl(pattern = "carb", x = model)
) -> answer
这会产生以下 data.frame(或矩阵),我可以从中对列求和以获得我想要的值(或进行矩阵运算以获得项之间的成对和联合依赖关系)。这只是为了指出下一步需要矩阵格式...
cyl disp hp drat wt qsec vs am gear carb
TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE
TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE
TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE
FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE
TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
您可以使用 sapply
:
sapply(pred, grepl, steplm$model)
# cyl disp hp drat wt qsec vs am gear carb
# [1,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE
# [2,] TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
# [3,] FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
# [4,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
# [5,] FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# [6,] TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
# [7,] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
# [8,] FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
# [9,] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
#[10,] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
sapply
returns 一个矩阵。如果需要数据帧,可以将 data.frame
包装到 sapply
输出。
identical(data.frame(sapply(pred, grepl, steplm$model)), answer)
#[1] TRUE
您可以考虑对 bootstrap 重采样采取不同的方法,more similar to this。
library(tidyverse)
library(rsample)
library(broom)
car_boot <- bootstraps(mtcars, times = 100)
car_boot
#> # Bootstrap sampling
#> # A tibble: 100 x 2
#> splits id
#> <list> <chr>
#> 1 <split [32/10]> Bootstrap001
#> 2 <split [32/12]> Bootstrap002
#> 3 <split [32/12]> Bootstrap003
#> 4 <split [32/12]> Bootstrap004
#> 5 <split [32/13]> Bootstrap005
#> 6 <split [32/9]> Bootstrap006
#> 7 <split [32/11]> Bootstrap007
#> 8 <split [32/10]> Bootstrap008
#> 9 <split [32/13]> Bootstrap009
#> 10 <split [32/14]> Bootstrap010
#> # … with 90 more rows
fit_mpg <- function(split) {
step(lm(mpg ~ ., data = analysis(split)), direction = "backward", trace = 0)
}
boot_models <- car_boot %>%
mutate(model = map(splits, fit_mpg),
coef_info = map(model, tidy))
boot_coefs <- boot_models %>%
unnest(coef_info)
boot_coefs
#> # A tibble: 645 x 8
#> splits id model term estimate std.error statistic p.value
#> <list> <chr> <lis> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 <split [32/… Bootstrap… <lm> (Interc… 6.16 7.09 0.868 3.93e-1
#> 2 <split [32/… Bootstrap… <lm> drat 1.89 1.44 1.31 2.01e-1
#> 3 <split [32/… Bootstrap… <lm> wt -1.92 0.738 -2.61 1.50e-2
#> 4 <split [32/… Bootstrap… <lm> qsec 0.754 0.307 2.45 2.12e-2
#> 5 <split [32/… Bootstrap… <lm> am 3.80 1.49 2.55 1.70e-2
#> 6 <split [32/… Bootstrap… <lm> carb -0.859 0.407 -2.11 4.47e-2
#> 7 <split [32/… Bootstrap… <lm> (Interc… 8.51 4.05 2.10 4.48e-2
#> 8 <split [32/… Bootstrap… <lm> disp 0.00752 0.00474 1.59 1.25e-1
#> 9 <split [32/… Bootstrap… <lm> wt -1.17 0.708 -1.66 1.09e-1
#> 10 <split [32/… Bootstrap… <lm> gear 5.37 0.757 7.10 1.25e-7
#> # … with 635 more rows
boot_coefs %>%
select(id, term) %>%
filter(term != "(Intercept)") %>%
mutate(value = TRUE) %>%
complete(id, term, fill = list(value = FALSE)) %>%
pivot_wider(names_from = term, values_from = value)
#> # A tibble: 100 x 11
#> id am carb cyl disp drat gear hp qsec vs wt
#> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 Bootstrap001 TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
#> 2 Bootstrap002 FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
#> 3 Bootstrap003 TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
#> 4 Bootstrap004 FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
#> 5 Bootstrap005 FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
#> 6 Bootstrap006 FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
#> 7 Bootstrap007 FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
#> 8 Bootstrap008 FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
#> 9 Bootstrap009 TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
#> 10 Bootstrap010 FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
#> # … with 90 more rows
由 reprex package (v2.0.0)
于 2021-04-07 创建