R - 使用数据框中的所有非零分数作为用 sample() 替换自身的概率
R - Use all non-zero fractions in a data frame as probabilities for replacing themselves with sample()
我有一个名为 my_data
的 R tibble
,它由 (1) 个零或 (2) 个介于零和一之间的数字组成:
> my_data
# A tibble: 30 x 40
s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0 0.969 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 0 0 0
# ... with 20 more rows, and 27 more variables: s14 <dbl>, s15 <dbl>, s16 <dbl>,
# s17 <dbl>, s18 <dbl>, s19 <dbl>, s20 <dbl>, s21 <dbl>, s22 <dbl>, s23 <dbl>,
# s24 <dbl>, s25 <dbl>, s26 <dbl>, s27 <dbl>, s28 <dbl>, s29 <dbl>, s30 <dbl>,
# s31 <dbl>, s32 <dbl>, s33 <dbl>, s34 <dbl>, s35 <dbl>, s36 <dbl>, s37 <dbl>,
# s38 <dbl>, s39 <dbl>, s40 <dbl>
我想以一定的概率将my_data
中的所有非零数字(例如s7
列中的0.969)替换为1,其中数字本身 是概率,否则它们将被替换为 0。例如,number 0.969(在名为 s7
的列中)被 1 替换的概率为 0.969,被 0 替换的概率为 0.031 .
我试过了,但没用:
# Doesn't work:
my_data %>%
mutate_all(function(x) {
case_when(x == 0 ~ 0,
x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x)))
})
我该怎么做?我应该使用 purrr::map()
(如何?)还是其他什么?谢谢!
这里是 my_data
的 dput()
:
structure(list(s1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.956159271283707, 0), s3 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.982878394164842,
0, 0, 0, 0, 0.982878394164842), s4 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.959674748019852, 0, 0, 0, 0, 0, 0, 0,
0, 0.959674748019852, 0, 0, 0, 0, 0), s5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.876892497722416,
0, 0, 0, 0, 0, 0.876892497722416, 0, 0), s6 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.989641778880238, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.989641778880238, 0, 0, 0, 0, 0), s7 = c(0.969355168732184,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.969355168732184,
0, 0, 0, 0, 0, 0, 0, 0), s8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0.991517098892877, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877,
0, 0, 0, 0.991517098892877, 0.991517098892877), s9 = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0, 0), s10 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), s11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.993637560789263,
0), s12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.949340969426271, 0, 0, 0.949340969426271,
0, 0), s13 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918, 0), s14 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.937896138681889,
0, 0, 0.937896138681889, 0, 0, 0, 0, 0), s15 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877), s16 = c(0.956159271283707,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.956159271283707, 0, 0,
0, 0, 0, 0, 0, 0.956159271283707, 0, 0, 0, 0, 0, 0, 0.956159271283707,
0.956159271283707), s17 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.597187792371775, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.597187792371775, 0), s18 = c(0.975209130375021, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.975209130375021, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.975209130375021), s19 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), s20 = c(0.937234650859115, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.937234650859115, 0, 0, 0,
0, 0, 0.937234650859115, 0, 0, 0, 0, 0, 0, 0, 0), s21 = c(0.929770500656618,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618,
0, 0, 0, 0, 0, 0.929770500656618, 0, 0, 0, 0, 0, 0, 0.929770500656618,
0), s22 = c(0.929770500656618, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0, 0, 0, 0,
0, 0, 0, 0.929770500656618), s23 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.921445826350068), s24 = c(0.919919910704918, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), s25 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.993637560789263, 0),
s26 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997, 0.942968974602997
), s27 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.959674748019852,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s28 = c(0.999498946154851,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s29 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.988925875658174, 0), s30 = c(0, 0.975209130375021,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.975209130375021, 0), s31 = c(0.986350500013957,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986350500013957, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986350500013957
), s32 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997),
s33 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.927760110879459,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s34 = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918,
0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918, 0, 0, 0.919919910704918,
0, 0), s35 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.956159271283707,
0, 0, 0, 0, 0, 0, 0, 0.956159271283707, 0, 0, 0, 0, 0, 0,
0.956159271283707, 0), s36 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.991517098892877, 0, 0.991517098892877, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877, 0.991517098892877,
0), s37 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.919919910704918, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918,
0), s38 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s39 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.999972102622884, 0, 0, 0, 0, 0), s40 = c(0.942968974602997,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.942968974602997, 0, 0, 0, 0, 0, 0, 0, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -30L))
你可以试试:
library(tidyverse)
as_tibble(apply(df, c(1,2), function(x) sample(c(0,1),1,prob=c(1-x,x))))
通常不鼓励从矩阵转换为 data.frame,但在这里您似乎确实有一个格式为 data.frame 的矩阵,所以我选择了它。
要避免转换,您可以这样做:
df %>% mutate_all(~ map_dbl(.,~sample(c(0,1),1,prob=c(1-.x,.x))))
以下将在采样前测试该值,但我不确定它会更快还是更快:
df %>% mutate_all(~ map_if(.,~. != 0, ~sample(c(0,1),1,prob=c(1-.x,.x))) %>% unlist)
如果你真的想使用你的自定义函数(case_when
)你可以做
df %>%
rowwise() %>%
mutate_all(function(x) {
case_when(x == 0 ~ 0L,
x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x)))
})
或
f = function(x) {
case_when(x == 0 ~ 0L,
x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x))) }
f = Vectorize(f)
df %>% mutate_all(f)
您的方法有 2 个问题。
1) 您的函数未矢量化,并且正在考虑整个概率列。错误是 Error in mutate_impl(.data, dots) :
Evaluation error: incorrect number of probabilities.
使用 rowwise
或矢量化您的函数将解决此问题。
2) case_when
没有 return 相同类型的值。错误是 Error in mutate_impl(.data, dots) :
Evaluation error: must be type double, not integer.
使用 0L
而不是 0
将解决这个问题。
您正在尝试从二项分布中抽样。幸运的是 rbinom
是关于其 prob
参数的矢量化,您可以避免任何 R 循环(for
、apply
、Vectorize
等)。
m <- as.matrix(DF)
set.seed(42) #for reproducibility
m[m != 0] <- rbinom(sum(m != 0), 1, m[m != 0])
我会使用 runif:
df %>%
map_df(~ if_else(runif(length(.x)) < .x, 1, 0))
我有一个名为 my_data
的 R tibble
,它由 (1) 个零或 (2) 个介于零和一之间的数字组成:
> my_data
# A tibble: 30 x 40
s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0 0.969 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 0 0 0
# ... with 20 more rows, and 27 more variables: s14 <dbl>, s15 <dbl>, s16 <dbl>,
# s17 <dbl>, s18 <dbl>, s19 <dbl>, s20 <dbl>, s21 <dbl>, s22 <dbl>, s23 <dbl>,
# s24 <dbl>, s25 <dbl>, s26 <dbl>, s27 <dbl>, s28 <dbl>, s29 <dbl>, s30 <dbl>,
# s31 <dbl>, s32 <dbl>, s33 <dbl>, s34 <dbl>, s35 <dbl>, s36 <dbl>, s37 <dbl>,
# s38 <dbl>, s39 <dbl>, s40 <dbl>
我想以一定的概率将my_data
中的所有非零数字(例如s7
列中的0.969)替换为1,其中数字本身 是概率,否则它们将被替换为 0。例如,number 0.969(在名为 s7
的列中)被 1 替换的概率为 0.969,被 0 替换的概率为 0.031 .
我试过了,但没用:
# Doesn't work:
my_data %>%
mutate_all(function(x) {
case_when(x == 0 ~ 0,
x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x)))
})
我该怎么做?我应该使用 purrr::map()
(如何?)还是其他什么?谢谢!
这里是 my_data
的 dput()
:
structure(list(s1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.956159271283707, 0), s3 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.982878394164842,
0, 0, 0, 0, 0.982878394164842), s4 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.959674748019852, 0, 0, 0, 0, 0, 0, 0,
0, 0.959674748019852, 0, 0, 0, 0, 0), s5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.876892497722416,
0, 0, 0, 0, 0, 0.876892497722416, 0, 0), s6 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.989641778880238, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.989641778880238, 0, 0, 0, 0, 0), s7 = c(0.969355168732184,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.969355168732184,
0, 0, 0, 0, 0, 0, 0, 0), s8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0.991517098892877, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877,
0, 0, 0, 0.991517098892877, 0.991517098892877), s9 = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0, 0), s10 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), s11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.993637560789263,
0), s12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.949340969426271, 0, 0, 0.949340969426271,
0, 0), s13 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918, 0), s14 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.937896138681889,
0, 0, 0.937896138681889, 0, 0, 0, 0, 0), s15 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877), s16 = c(0.956159271283707,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.956159271283707, 0, 0,
0, 0, 0, 0, 0, 0.956159271283707, 0, 0, 0, 0, 0, 0, 0.956159271283707,
0.956159271283707), s17 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.597187792371775, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.597187792371775, 0), s18 = c(0.975209130375021, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.975209130375021, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.975209130375021), s19 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), s20 = c(0.937234650859115, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.937234650859115, 0, 0, 0,
0, 0, 0.937234650859115, 0, 0, 0, 0, 0, 0, 0, 0), s21 = c(0.929770500656618,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618,
0, 0, 0, 0, 0, 0.929770500656618, 0, 0, 0, 0, 0, 0, 0.929770500656618,
0), s22 = c(0.929770500656618, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0, 0, 0, 0,
0, 0, 0, 0.929770500656618), s23 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.921445826350068), s24 = c(0.919919910704918, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), s25 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.993637560789263, 0),
s26 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997, 0.942968974602997
), s27 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.959674748019852,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s28 = c(0.999498946154851,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s29 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.988925875658174, 0), s30 = c(0, 0.975209130375021,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.975209130375021, 0), s31 = c(0.986350500013957,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986350500013957, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986350500013957
), s32 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997),
s33 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.927760110879459,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s34 = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918,
0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918, 0, 0, 0.919919910704918,
0, 0), s35 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.956159271283707,
0, 0, 0, 0, 0, 0, 0, 0.956159271283707, 0, 0, 0, 0, 0, 0,
0.956159271283707, 0), s36 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.991517098892877, 0, 0.991517098892877, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877, 0.991517098892877,
0), s37 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.919919910704918, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918,
0), s38 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s39 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.999972102622884, 0, 0, 0, 0, 0), s40 = c(0.942968974602997,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.942968974602997, 0, 0, 0, 0, 0, 0, 0, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -30L))
你可以试试:
library(tidyverse)
as_tibble(apply(df, c(1,2), function(x) sample(c(0,1),1,prob=c(1-x,x))))
通常不鼓励从矩阵转换为 data.frame,但在这里您似乎确实有一个格式为 data.frame 的矩阵,所以我选择了它。
要避免转换,您可以这样做:
df %>% mutate_all(~ map_dbl(.,~sample(c(0,1),1,prob=c(1-.x,.x))))
以下将在采样前测试该值,但我不确定它会更快还是更快:
df %>% mutate_all(~ map_if(.,~. != 0, ~sample(c(0,1),1,prob=c(1-.x,.x))) %>% unlist)
如果你真的想使用你的自定义函数(case_when
)你可以做
df %>%
rowwise() %>%
mutate_all(function(x) {
case_when(x == 0 ~ 0L,
x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x)))
})
或
f = function(x) {
case_when(x == 0 ~ 0L,
x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x))) }
f = Vectorize(f)
df %>% mutate_all(f)
您的方法有 2 个问题。
1) 您的函数未矢量化,并且正在考虑整个概率列。错误是 Error in mutate_impl(.data, dots) :
Evaluation error: incorrect number of probabilities.
使用 rowwise
或矢量化您的函数将解决此问题。
2) case_when
没有 return 相同类型的值。错误是 Error in mutate_impl(.data, dots) :
Evaluation error: must be type double, not integer.
使用 0L
而不是 0
将解决这个问题。
您正在尝试从二项分布中抽样。幸运的是 rbinom
是关于其 prob
参数的矢量化,您可以避免任何 R 循环(for
、apply
、Vectorize
等)。
m <- as.matrix(DF)
set.seed(42) #for reproducibility
m[m != 0] <- rbinom(sum(m != 0), 1, m[m != 0])
我会使用 runif:
df %>%
map_df(~ if_else(runif(length(.x)) < .x, 1, 0))