R:随机抽样混合变量
R: Randomly Sampling Mixed Variables
我正在使用 R 编程语言。
假设我有以下 10 个变量(num_var_1、num_var_2、num_var_3、num_var_4、num_var_5、factor_var_1、factor_var_2, factor_var_3, factor_var_4, factor_var_5):
# Example data: five numeric and five factor variables with 1000 rows each.
# No seed is set, so every run produces different values.
num_var_1 <- rnorm(1000, 10, 1)
num_var_2 <- rnorm(1000, 10, 5)
num_var_3 <- rnorm(1000, 10, 10)
num_var_4 <- rnorm(1000, 10, 10)
num_var_5 <- rnorm(1000, 10, 10)
# Candidate level labels for the factor variables.
factor_1 <- c("A", "B", "C")
factor_2 <- c("AA", "BB", "CC")
factor_3 <- c("AAA", "BBB", "CCC", "DDD")
factor_4 <- c("AAAA", "BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA", "BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")
# Draw each factor column with unequal level weights.
factor_var_1 <- as.factor(sample(factor_1, 1000, replace = TRUE, prob = c(0.3, 0.5, 0.2)))
factor_var_2 <- as.factor(sample(factor_2, 1000, replace = TRUE, prob = c(0.5, 0.3, 0.2)))
factor_var_3 <- as.factor(sample(factor_3, 1000, replace = TRUE, prob = c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <- as.factor(sample(factor_4, 1000, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1)))
# NOTE: factor_var_5 is drawn from factor_4 (factor_5 stays unused) and its
# weights sum to 0.8; sample() only needs relative weights, so this runs.
factor_var_5 <- as.factor(sample(factor_4, 1000, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1)))
# `id` was never defined, which made data.frame() fail with "object 'id' not
# found"; create it explicitly as the row number.
my_data <- data.frame(
  id = seq_along(num_var_1),
  num_var_1, num_var_2, num_var_3, num_var_4, num_var_5,
  factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5
)
> head(my_data)
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 1 9.439524 5.021006 4.883963 8.496925 11.965498 B AA AAA CCCC AAAA
2 2 9.769823 4.800225 12.369379 6.722429 16.501132 B AA AAA AAAA AAAA
3 3 11.558708 9.910099 4.584108 -4.481653 16.710042 C AA BBB AAAA CCCC
4 4 10.070508 9.339124 22.192276 3.027154 -2.841578 B CC DDD BBBB AAAA
5 5 10.129288 -2.746714 11.741359 35.984902 -10.261096 B AA AAA DDDD DDDD
6 6 11.715065 15.202867 3.847317 9.625850 32.053261 B AA CCC BBBB EEEE
问题:我想做如下(在现实生活中,我只有“my_data”数据集):
- 从 “my_data” 中随机选取若干个变量(选取的变量个数本身也是随机的)
- 对于步骤 1 中选中的“因子变量”,为每个变量随机选取若干个“水平”
- 对于步骤 1 中选中的“数值变量”,在每个变量的取值范围内随机选取一个数值
- 多次重复步骤 1 - 步骤 3
例如,这可能是这样的:
- 迭代 1:num_var_3 (9.1),num_var_5 (2.1),factor_var_2(AA、CC)
- 迭代 2:num_var_1 (5.01),factor_var_3(AAA、CCC、DDD),factor_var_4(CCCC、DDDD),factor_var_5 (EEEEE)
等等
到目前为止我尝试了什么: 我尝试手动执行此操作:
#Iteration 1
# 4 variables are selected
n = sample.int(10, 1)
[1] 4
# which 4 variables are selected (each number corresponds to their position):
sample.int(10, n)
[1] 6 2 1 4
num_var_1
num_var_2
num_var_4
factor_var_1
#select random points for the continuous variables
p1 <- runif(1, min(num_var_1), max(num_var_1))
p2 <- runif(1, min(num_var_2), max(num_var_2))
p4 <- runif(1, min(num_var_4), max(num_var_4))
> p1
[1] 10.6902
> p2
[1] 18.11022
> p4
[1] -4.778462
#select random factor levels for the factor variable
nlevel = nlevels(factor_var_1)
nlevels = sample.int(nlevel, 1)
[1] 2
sample(factor_1, nlevels, replace=TRUE, prob=c(0.3, 0.5, 0.2))
[1] "A" "B"
# Desired Output
Iteration 1: num_var_1 = 10.6902 , num_var_2 = 18.11022 , num_var_4 = -4.778462, factor_var_1 = "A, B"
但这需要很长时间才能完成。
问题:谁能告诉我怎么做(即执行 10 次这样的迭代并记录结果)?
谢谢!
注:数据汇总
> summary(my_data)
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
Min. : 1.0 Min. : 6.658 Min. :-6.007 Min. :-23.775 Min. :-20.301 Min. :-20.59 A:294 AA:513 AAA:514 AAAA:495 AAAA:327
1st Qu.: 250.8 1st Qu.: 9.401 1st Qu.: 6.374 1st Qu.: 2.759 1st Qu.: 2.794 1st Qu.: 3.89 B:507 BB:291 BBB:202 BBBB:190 BBBB:271
Median : 500.5 Median :10.066 Median : 9.978 Median : 10.068 Median : 10.134 Median : 10.25 C:199 CC:196 CCC:199 CCCC: 94 CCCC:125
Mean : 500.5 Mean :10.061 Mean : 9.766 Mean : 9.938 Mean : 9.979 Mean : 10.33 DDD: 85 DDDD:103 DDDD:138
3rd Qu.: 750.2 3rd Qu.:10.716 3rd Qu.:13.188 3rd Qu.: 16.399 3rd Qu.: 17.404 3rd Qu.: 17.14 EEEE:118 EEEE:139
Max. :1000.0 Max. :13.270 Max. :24.805 Max. : 41.441 Max. : 42.262 Max. : 38.80
> str(my_data)
'data.frame': 1000 obs. of 11 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ num_var_1 : num 9.13 9.96 8.2 10.49 9.19 ...
$ num_var_2 : num 19.03 3.31 16.73 20.52 10.35 ...
$ num_var_3 : num 25.45 6.26 24.99 8.11 26.45 ...
$ num_var_4 : num 21.284 2.313 3.203 -0.347 11.847 ...
$ num_var_5 : num 9.26 7.39 -1.4 13.94 10.71 ...
$ factor_var_1: Factor w/ 3 levels "A","B","C": 1 2 1 3 2 1 1 3 3 3 ...
$ factor_var_2: Factor w/ 3 levels "AA","BB","CC": 2 1 3 1 2 1 1 2 2 2 ...
$ factor_var_3: Factor w/ 4 levels "AAA","BBB","CCC",..: 3 1 1 1 4 1 4 4 1 3 ...
$ factor_var_4: Factor w/ 5 levels "AAAA","BBBB",..: 3 1 2 1 1 1 5 1 1 1 ...
$ factor_var_5: Factor w/ 5 levels "AAAA","BBBB",..: 1 2 4 2 1 4 4 3 1 2 ...
重要的是选择变量的比率是恒定的。
你可以通过 sum(1:var_num) / (var_num^2) 计算出这个比率。
下一点是向量化(向量运算)。 (但据我所知,很难向量化你的因子过程,所以我没有这样做)。
当 n、size 大于 1 时,sample 和 runif 会返回向量。它们非常快,所以无论变量是否被选中,我都预先计算了 num_vals(数值变量)的所有 runif 值和 fact_vals(因子变量)的所有 nlevels。
注意:
我在你的例子中使用 sample(factor_var_1, nlevels, replace=TRUE)
而不是 sample(factor_1, nlevels, replace=TRUE, prob=c(0.3, 0.5, 0.2))
。
注 2:
map 和 map2 分别是 sapply 和 mapply 的近亲。
library(dplyr); library(purrr)
# Chance that any single variable is chosen in one iteration.
# One column is excluded from the candidate count (presumably `id` - confirm).
var_num <- ncol(my_data) - 1
# If the number of chosen variables is uniform on 1..var_num, its mean is
# sum(1..var_num) / var_num; dividing by var_num once more gives the
# per-variable ratio.
var_select_ratio <- sum(seq(1, var_num)) / (var_num * var_num)
# Draw one random value per iteration from the observed range of a numeric
# variable, and blank out the iterations in which the variable is not selected.
#
# vec          numeric vector (one column of the data); NAs are ignored.
# iter_num     number of iterations, i.e. the length of the result.
# select_ratio probability that the variable is selected in one iteration.
#              Defaults to the global `var_select_ratio`, so existing
#              two-argument calls behave exactly as before.
#
# Returns a numeric vector of length `iter_num`: a uniform draw between the
# smallest and largest observed value where the variable was selected, NA
# elsewhere. A column without any observed value yields all NA.
num_func <- function(vec, iter_num, select_ratio = var_select_ratio) {
  # No observed values means there is no range to sample from.
  if (all(is.na(vec))) {
    return(rep(NA_real_, iter_num))
  }
  # Drop missing observations first; a single NA would otherwise make both
  # bounds NA and every draw NaN.
  bounds <- range(vec, na.rm = TRUE)
  # Draw for every iteration up front (vectorised), selected or not.
  random_val <- runif(iter_num, bounds[1], bounds[2])
  # 1 keeps the draw, NA masks it once multiplied in.
  is_select <- sample(c(NA, 1), iter_num,
    prob = c(1 - select_ratio, select_ratio), replace = TRUE
  )
  random_val * is_select
}
# For every iteration, draw a random handful of values of a factor variable,
# or NULL when the variable is not selected in that iteration.
#
# vec          factor (one column of the data).
# iter_num     number of iterations, i.e. the length of the returned list.
# select_ratio probability that the variable is selected in one iteration.
#              Defaults to the global `var_select_ratio`, so existing
#              two-argument calls behave exactly as before.
#
# Returns a list of length `iter_num`; each element is either NULL or a factor
# holding the values drawn for that iteration.
fac_func <- function(vec, iter_num, select_ratio = var_select_ratio) {
  n_levels <- length(levels(vec))
  # sample.int() rejects an empty population, so a factor without levels
  # (e.g. an all-NA column) is reported as "never selected".
  if (n_levels == 0L) {
    return(vector("list", iter_num))
  }
  # How many values to draw per iteration: between 1 and the level count.
  # (Named n_draw so the local no longer shadows base::nlevels.)
  n_draw <- sample.int(n_levels, iter_num, replace = TRUE)
  is_select <- sample(c(0, 1), iter_num,
    prob = c(1 - select_ratio, select_ratio), replace = TRUE
  )
  # NOTE: this step isn't vectorised; each iteration needs its own sample().
  map2(n_draw, is_select, function(nl, ic) {
    if (ic == 0) {
      NULL
    } else {
      # Values are drawn from the observations without replacement, so
      # frequent levels are more likely and one level can appear more than
      # once. The draw size is capped at the number of observations so that a
      # factor with more levels than rows cannot make sample() fail.
      sample(vec, min(nl, length(vec)))
    }
  })
}
# Route one column to the sampler matching its type: factors are handled by
# fac_func(), every other column by num_func().
integ_func <- function(vec, iter_num) {
  if (!is.factor(vec)) {
    return(num_func(vec, iter_num))
  }
  fac_func(vec, iter_num)
}
# Fix the RNG state so the sampled table is reproducible.
set.seed(1)
# Drop the id column, run integ_func() on every remaining column for 10
# iterations, and wrap the result in a tibble (only for nicer printing).
res <- as_tibble(
  map(
    select(my_data, -id),
    function(column) integ_func(column, iter_num = 10)
  )
)
# output
> res
# A tibble: 10 × 10
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <list> <list> <list>
1 8.80 25.9 27.2 35.2 1.14 <fct [1]> <NULL> <fct [1]> <fct [1]> <fct [1]>
2 9.53 NA NA NA 18.9 <fct [1]> <fct [2]> <NULL> <NULL> <fct [1]>
3 NA 16.2 24.7 6.71 NA <fct [1]> <fct [1]> <fct [3]> <NULL> <fct [5]>
# Optional: collapse each list-column of drawn factor values into one string
# per row.
res2 <- mutate_if(
  mutate_if(
    res,
    is.list,
    # Sort the values drawn for a cell and join them with single spaces.
    function(list_col) {
      map_chr(list_col, function(drawn) paste(sort(drawn), collapse = " "))
    }
  ),
  is.character,
  # Replace the "" left by empty cells with NA.
  function(chr_col) na_if(chr_col, "")
)
> res2
# A tibble: 10 × 10
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
1 8.80 25.9 27.2 35.2 1.14 B NA BBB AAAA DDDDD
2 9.53 NA NA NA 18.9 B AA BB NA NA BBBBB
3 NA 16.2 24.7 6.71 NA B BB AAA AAA DDD NA AAAAA AAAAA BBBBB BBBBB FFF…
# Data used for the answer.
# (A little modified from the question, e.g. `factor_var_5` is drawn from
# `factor_5` rather than `factor_4`.)
set.seed(1)

# Numeric variables: same mean, increasing spread.
num_var_1 <- rnorm(n = 1000, mean = 10, sd = 1)
num_var_2 <- rnorm(n = 1000, mean = 10, sd = 5)
num_var_3 <- rnorm(n = 1000, mean = 10, sd = 10)
num_var_4 <- rnorm(n = 1000, mean = 10, sd = 10)
num_var_5 <- rnorm(n = 1000, mean = 10, sd = 10)

# Level labels for the factor variables.
factor_1 <- c("A", "B", "C")
factor_2 <- c("AA", "BB", "CC")
factor_3 <- c("AAA", "BBB", "CCC", "DDD")
factor_4 <- c("AAAA", "BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA", "BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")

# Factor variables, each drawn with its own level weights.
factor_var_1 <- factor(
  sample(x = factor_1, size = 1000, replace = TRUE, prob = c(0.3, 0.5, 0.2))
)
factor_var_2 <- factor(
  sample(x = factor_2, size = 1000, replace = TRUE, prob = c(0.5, 0.3, 0.2))
)
factor_var_3 <- factor(
  sample(x = factor_3, size = 1000, replace = TRUE, prob = c(0.5, 0.2, 0.2, 0.1))
)
factor_var_4 <- factor(
  sample(x = factor_4, size = 1000, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
)
factor_var_5 <- factor(
  sample(x = factor_5, size = 1000, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1, 0.2))
)

# Assemble the data set; id is simply the row number.
my_data <- data.frame(
  id = seq_along(num_var_1),
  num_var_1, num_var_2, num_var_3, num_var_4, num_var_5,
  factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5
)
我正在使用 R 编程语言。
假设我有以下 10 个变量(num_var_1、num_var_2、num_var_3、num_var_4、num_var_5、factor_var_1、factor_var_2, factor_var_3, factor_var_4, factor_var_5):
# Example data: five numeric and five factor variables with 1000 rows each.
# No seed is set, so every run produces different values.
num_var_1 <- rnorm(1000, 10, 1)
num_var_2 <- rnorm(1000, 10, 5)
num_var_3 <- rnorm(1000, 10, 10)
num_var_4 <- rnorm(1000, 10, 10)
num_var_5 <- rnorm(1000, 10, 10)
# Candidate level labels for the factor variables.
factor_1 <- c("A", "B", "C")
factor_2 <- c("AA", "BB", "CC")
factor_3 <- c("AAA", "BBB", "CCC", "DDD")
factor_4 <- c("AAAA", "BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA", "BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")
# Draw each factor column with unequal level weights.
factor_var_1 <- as.factor(sample(factor_1, 1000, replace = TRUE, prob = c(0.3, 0.5, 0.2)))
factor_var_2 <- as.factor(sample(factor_2, 1000, replace = TRUE, prob = c(0.5, 0.3, 0.2)))
factor_var_3 <- as.factor(sample(factor_3, 1000, replace = TRUE, prob = c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <- as.factor(sample(factor_4, 1000, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1)))
# NOTE: factor_var_5 is drawn from factor_4 (factor_5 stays unused) and its
# weights sum to 0.8; sample() only needs relative weights, so this runs.
factor_var_5 <- as.factor(sample(factor_4, 1000, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1)))
# `id` was never defined, which made data.frame() fail with "object 'id' not
# found"; create it explicitly as the row number.
my_data <- data.frame(
  id = seq_along(num_var_1),
  num_var_1, num_var_2, num_var_3, num_var_4, num_var_5,
  factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5
)
> head(my_data)
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 1 9.439524 5.021006 4.883963 8.496925 11.965498 B AA AAA CCCC AAAA
2 2 9.769823 4.800225 12.369379 6.722429 16.501132 B AA AAA AAAA AAAA
3 3 11.558708 9.910099 4.584108 -4.481653 16.710042 C AA BBB AAAA CCCC
4 4 10.070508 9.339124 22.192276 3.027154 -2.841578 B CC DDD BBBB AAAA
5 5 10.129288 -2.746714 11.741359 35.984902 -10.261096 B AA AAA DDDD DDDD
6 6 11.715065 15.202867 3.847317 9.625850 32.053261 B AA CCC BBBB EEEE
问题:我想做如下(在现实生活中,我只有“my_data”数据集):
- 从 “my_data” 中随机选取若干个变量(选取的变量个数本身也是随机的)
- 对于步骤 1 中选中的“因子变量”,为每个变量随机选取若干个“水平”
- 对于步骤 1 中选中的“数值变量”,在每个变量的取值范围内随机选取一个数值
- 多次重复步骤 1 - 步骤 3
例如,这可能是这样的:
- 迭代 1:num_var_3 (9.1),num_var_5 (2.1),factor_var_2(AA、CC)
- 迭代 2:num_var_1 (5.01),factor_var_3(AAA、CCC、DDD),factor_var_4(CCCC、DDDD),factor_var_5 (EEEEE)
等等
到目前为止我尝试了什么: 我尝试手动执行此操作:
#Iteration 1
# 4 variables are selected
n = sample.int(10, 1)
[1] 4
# which 4 variables are selected (each number corresponds to their position):
sample.int(10, n)
[1] 6 2 1 4
num_var_1
num_var_2
num_var_4
factor_var_1
#select random points for the continuous variables
p1 <- runif(1, min(num_var_1), max(num_var_1))
p2 <- runif(1, min(num_var_2), max(num_var_2))
p4 <- runif(1, min(num_var_4), max(num_var_4))
> p1
[1] 10.6902
> p2
[1] 18.11022
> p4
[1] -4.778462
#select random factor levels for the factor variable
nlevel = nlevels(factor_var_1)
nlevels = sample.int(nlevel, 1)
[1] 2
sample(factor_1, nlevels, replace=TRUE, prob=c(0.3, 0.5, 0.2))
[1] "A" "B"
# Desired Output
Iteration 1: num_var_1 = 10.6902 , num_var_2 = 18.11022 , num_var_4 = -4.778462, factor_var_1 = "A, B"
但这需要很长时间才能完成。
问题:谁能告诉我怎么做(即执行 10 次这样的迭代并记录结果)?
谢谢!
注:数据汇总
> summary(my_data)
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
Min. : 1.0 Min. : 6.658 Min. :-6.007 Min. :-23.775 Min. :-20.301 Min. :-20.59 A:294 AA:513 AAA:514 AAAA:495 AAAA:327
1st Qu.: 250.8 1st Qu.: 9.401 1st Qu.: 6.374 1st Qu.: 2.759 1st Qu.: 2.794 1st Qu.: 3.89 B:507 BB:291 BBB:202 BBBB:190 BBBB:271
Median : 500.5 Median :10.066 Median : 9.978 Median : 10.068 Median : 10.134 Median : 10.25 C:199 CC:196 CCC:199 CCCC: 94 CCCC:125
Mean : 500.5 Mean :10.061 Mean : 9.766 Mean : 9.938 Mean : 9.979 Mean : 10.33 DDD: 85 DDDD:103 DDDD:138
3rd Qu.: 750.2 3rd Qu.:10.716 3rd Qu.:13.188 3rd Qu.: 16.399 3rd Qu.: 17.404 3rd Qu.: 17.14 EEEE:118 EEEE:139
Max. :1000.0 Max. :13.270 Max. :24.805 Max. : 41.441 Max. : 42.262 Max. : 38.80
> str(my_data)
'data.frame': 1000 obs. of 11 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ num_var_1 : num 9.13 9.96 8.2 10.49 9.19 ...
$ num_var_2 : num 19.03 3.31 16.73 20.52 10.35 ...
$ num_var_3 : num 25.45 6.26 24.99 8.11 26.45 ...
$ num_var_4 : num 21.284 2.313 3.203 -0.347 11.847 ...
$ num_var_5 : num 9.26 7.39 -1.4 13.94 10.71 ...
$ factor_var_1: Factor w/ 3 levels "A","B","C": 1 2 1 3 2 1 1 3 3 3 ...
$ factor_var_2: Factor w/ 3 levels "AA","BB","CC": 2 1 3 1 2 1 1 2 2 2 ...
$ factor_var_3: Factor w/ 4 levels "AAA","BBB","CCC",..: 3 1 1 1 4 1 4 4 1 3 ...
$ factor_var_4: Factor w/ 5 levels "AAAA","BBBB",..: 3 1 2 1 1 1 5 1 1 1 ...
$ factor_var_5: Factor w/ 5 levels "AAAA","BBBB",..: 1 2 4 2 1 4 4 3 1 2 ...
重要的是选择变量的比率是恒定的。
你可以通过 sum(1:var_num) / (var_num^2) 计算出这个比率。
下一点是向量化(向量运算)。 (但据我所知,很难向量化你的因子过程,所以我没有这样做)。
当 n、size 大于 1 时,sample 和 runif 会返回向量。它们非常快,所以无论变量是否被选中,我都预先计算了 num_vals(数值变量)的所有 runif 值和 fact_vals(因子变量)的所有 nlevels。
注意:
我在你的例子中使用 sample(factor_var_1, nlevels, replace=TRUE)
而不是 sample(factor_1, nlevels, replace=TRUE, prob=c(0.3, 0.5, 0.2))
。
注 2:
map 和 map2 分别是 sapply 和 mapply 的近亲。
library(dplyr); library(purrr)
# Chance that any single variable is chosen in one iteration.
# One column is excluded from the candidate count (presumably `id` - confirm).
var_num <- ncol(my_data) - 1
# If the number of chosen variables is uniform on 1..var_num, its mean is
# sum(1..var_num) / var_num; dividing by var_num once more gives the
# per-variable ratio.
var_select_ratio <- sum(seq(1, var_num)) / (var_num * var_num)
# Draw one random value per iteration from the observed range of a numeric
# variable, and blank out the iterations in which the variable is not selected.
#
# vec          numeric vector (one column of the data); NAs are ignored.
# iter_num     number of iterations, i.e. the length of the result.
# select_ratio probability that the variable is selected in one iteration.
#              Defaults to the global `var_select_ratio`, so existing
#              two-argument calls behave exactly as before.
#
# Returns a numeric vector of length `iter_num`: a uniform draw between the
# smallest and largest observed value where the variable was selected, NA
# elsewhere. A column without any observed value yields all NA.
num_func <- function(vec, iter_num, select_ratio = var_select_ratio) {
  # No observed values means there is no range to sample from.
  if (all(is.na(vec))) {
    return(rep(NA_real_, iter_num))
  }
  # Drop missing observations first; a single NA would otherwise make both
  # bounds NA and every draw NaN.
  bounds <- range(vec, na.rm = TRUE)
  # Draw for every iteration up front (vectorised), selected or not.
  random_val <- runif(iter_num, bounds[1], bounds[2])
  # 1 keeps the draw, NA masks it once multiplied in.
  is_select <- sample(c(NA, 1), iter_num,
    prob = c(1 - select_ratio, select_ratio), replace = TRUE
  )
  random_val * is_select
}
# For every iteration, draw a random handful of values of a factor variable,
# or NULL when the variable is not selected in that iteration.
#
# vec          factor (one column of the data).
# iter_num     number of iterations, i.e. the length of the returned list.
# select_ratio probability that the variable is selected in one iteration.
#              Defaults to the global `var_select_ratio`, so existing
#              two-argument calls behave exactly as before.
#
# Returns a list of length `iter_num`; each element is either NULL or a factor
# holding the values drawn for that iteration.
fac_func <- function(vec, iter_num, select_ratio = var_select_ratio) {
  n_levels <- length(levels(vec))
  # sample.int() rejects an empty population, so a factor without levels
  # (e.g. an all-NA column) is reported as "never selected".
  if (n_levels == 0L) {
    return(vector("list", iter_num))
  }
  # How many values to draw per iteration: between 1 and the level count.
  # (Named n_draw so the local no longer shadows base::nlevels.)
  n_draw <- sample.int(n_levels, iter_num, replace = TRUE)
  is_select <- sample(c(0, 1), iter_num,
    prob = c(1 - select_ratio, select_ratio), replace = TRUE
  )
  # NOTE: this step isn't vectorised; each iteration needs its own sample().
  map2(n_draw, is_select, function(nl, ic) {
    if (ic == 0) {
      NULL
    } else {
      # Values are drawn from the observations without replacement, so
      # frequent levels are more likely and one level can appear more than
      # once. The draw size is capped at the number of observations so that a
      # factor with more levels than rows cannot make sample() fail.
      sample(vec, min(nl, length(vec)))
    }
  })
}
# Route one column to the sampler matching its type: factors are handled by
# fac_func(), every other column by num_func().
integ_func <- function(vec, iter_num) {
  if (!is.factor(vec)) {
    return(num_func(vec, iter_num))
  }
  fac_func(vec, iter_num)
}
# Fix the RNG state so the sampled table is reproducible.
set.seed(1)
# Drop the id column, run integ_func() on every remaining column for 10
# iterations, and wrap the result in a tibble (only for nicer printing).
res <- as_tibble(
  map(
    select(my_data, -id),
    function(column) integ_func(column, iter_num = 10)
  )
)
# output
> res
# A tibble: 10 × 10
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <list> <list> <list>
1 8.80 25.9 27.2 35.2 1.14 <fct [1]> <NULL> <fct [1]> <fct [1]> <fct [1]>
2 9.53 NA NA NA 18.9 <fct [1]> <fct [2]> <NULL> <NULL> <fct [1]>
3 NA 16.2 24.7 6.71 NA <fct [1]> <fct [1]> <fct [3]> <NULL> <fct [5]>
# Optional: collapse each list-column of drawn factor values into one string
# per row.
res2 <- mutate_if(
  mutate_if(
    res,
    is.list,
    # Sort the values drawn for a cell and join them with single spaces.
    function(list_col) {
      map_chr(list_col, function(drawn) paste(sort(drawn), collapse = " "))
    }
  ),
  is.character,
  # Replace the "" left by empty cells with NA.
  function(chr_col) na_if(chr_col, "")
)
> res2
# A tibble: 10 × 10
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
1 8.80 25.9 27.2 35.2 1.14 B NA BBB AAAA DDDDD
2 9.53 NA NA NA 18.9 B AA BB NA NA BBBBB
3 NA 16.2 24.7 6.71 NA B BB AAA AAA DDD NA AAAAA AAAAA BBBBB BBBBB FFF…
# Data used for the answer.
# (A little modified from the question, e.g. `factor_var_5` is drawn from
# `factor_5` rather than `factor_4`.)
set.seed(1)

# Numeric variables: same mean, increasing spread.
num_var_1 <- rnorm(n = 1000, mean = 10, sd = 1)
num_var_2 <- rnorm(n = 1000, mean = 10, sd = 5)
num_var_3 <- rnorm(n = 1000, mean = 10, sd = 10)
num_var_4 <- rnorm(n = 1000, mean = 10, sd = 10)
num_var_5 <- rnorm(n = 1000, mean = 10, sd = 10)

# Level labels for the factor variables.
factor_1 <- c("A", "B", "C")
factor_2 <- c("AA", "BB", "CC")
factor_3 <- c("AAA", "BBB", "CCC", "DDD")
factor_4 <- c("AAAA", "BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA", "BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")

# Factor variables, each drawn with its own level weights.
factor_var_1 <- factor(
  sample(x = factor_1, size = 1000, replace = TRUE, prob = c(0.3, 0.5, 0.2))
)
factor_var_2 <- factor(
  sample(x = factor_2, size = 1000, replace = TRUE, prob = c(0.5, 0.3, 0.2))
)
factor_var_3 <- factor(
  sample(x = factor_3, size = 1000, replace = TRUE, prob = c(0.5, 0.2, 0.2, 0.1))
)
factor_var_4 <- factor(
  sample(x = factor_4, size = 1000, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
)
factor_var_5 <- factor(
  sample(x = factor_5, size = 1000, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1, 0.2))
)

# Assemble the data set; id is simply the row number.
my_data <- data.frame(
  id = seq_along(num_var_1),
  num_var_1, num_var_2, num_var_3, num_var_4, num_var_5,
  factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5
)