根据公式生成模拟数据

Question

我有一个看起来像这样的数据集

# Example data set as a dput() dump: 134 rows with columns groupid,
# participant_id, attrib1_A and attrib1_B (the attrib columns hold 0/1 values).
# The object carries the classes "grouped_df"/"tbl_df" and a `groups` attribute
# that maps each of the 50 participant_id values to its row indices.
# NOTE(review): printing/using it as a grouped tibble presumably requires
# dplyr to be loaded -- confirm in your session.
d<- structure(list(groupid = c(2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 
1L, 2L, 2L, 2L, 1L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), participant_id = c(1L, 
1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 
7L, 7L, 7L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 
12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 
17L, 17L, 17L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L, 
21L, 22L, 22L, 22L, 23L, 23L, 24L, 24L, 24L, 25L, 25L, 26L, 26L, 
26L, 27L, 27L, 28L, 28L, 28L, 29L, 29L, 29L, 30L, 30L, 31L, 31L, 
31L, 32L, 32L, 32L, 33L, 33L, 34L, 34L, 34L, 35L, 35L, 35L, 36L, 
36L, 36L, 37L, 37L, 37L, 38L, 38L, 38L, 39L, 39L, 39L, 40L, 40L, 
40L, 41L, 41L, 41L, 42L, 42L, 42L, 43L, 43L, 43L, 44L, 44L, 45L, 
45L, 46L, 46L, 47L, 47L, 47L, 48L, 48L, 49L, 49L, 50L, 50L), 
    attrib1_A = c(0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 
    0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 
    0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 
    0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 
    0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 
    1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 
    0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 
    0, 0, 1, 1, 0), attrib1_B = c(1, 0, 0, 0, 0, 0, 0, 1, 1, 
    0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 
    0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
    0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 
    0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 
    0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 
    0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 
    0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -134L), groups = structure(list(
    participant_id = 1:50, .rows = structure(list(1:3, 4:5, 6:8, 
        9:11, 12:14, 15:17, 18:20, 21:22, 23:25, 26:28, 29:31, 
        32:33, 34:36, 37:39, 40:42, 43:44, 45:47, 48:49, 50:52, 
        53:55, 56:58, 59:61, 62:63, 64:66, 67:68, 69:71, 72:73, 
        74:76, 77:79, 80:81, 82:84, 85:87, 88:89, 90:92, 93:95, 
        96:98, 99:101, 102:104, 105:107, 108:110, 111:113, 114:116, 
        117:119, 120:121, 122:123, 124:125, 126:128, 129:130, 
        131:132, 133:134), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -50L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE))

# Groups:   participant_id [4]
   groupid participant_id attrib1_A attrib1_B
     <int>          <int>     <dbl>     <dbl>
 1       2              1         0         1
 2       2              1         1         0
 3       2              1         0         0
 4       1              2         0         0
 5       1              2         1         0
 6       2              3         1         0
 7       2              3         0         0
 8       2              3         0         1
 9       2              4         0         1
10       2              4         1         0

我的目标是生成一个模拟的因变量 Y。Y 是一个只取 1 和 0 两个值的虚拟变量。我希望根据下面的真实数据生成过程，用伯努利分布来生成这个虚拟变量：

$y = a + \mathrm{factor}(\mathrm{attrib1\_A}) + \mathrm{factor}(\mathrm{attrib1\_B})$。attrib1_A 和 attrib1_B 是我感兴趣的关键回归变量。

其中 a 为常数项：当 attrib1_A 和 attrib1_B 都取参考类别时 y=1 的概率。系数 β 分别为：attrib1_A = 0.3，attrib1_B = -0.5。

有人知道我该怎么做吗？

非常感谢您的帮助

Answer 1

你的第一个代码块在我这里无法运行，不过这个练习很简单。在下面的代码中，我只创建了两个具有指定效应的 0/1 变量，然后使用 arm::invlogit 把线性预测值转换成概率，再把这些概率传给二项分布（rbinom）来生成 y。


# Simulate a binary outcome from a logistic data-generating process
#   logit(P(y = 1)) = b0 + b1 * a + b2 * b
# and check that a logistic regression recovers the true coefficients.
n <- 1e5
b0 <- 0    # intercept: log-odds of y = 1 when a = 0 and b = 0 (i.e. p = 0.5)
b1 <- 0.3  # true effect of a
b2 <- -0.5 # true effect of b

# Keep the 0/1 indicators numeric: arithmetic on a factor is not meaningful in
# R (it returns NA with a warning), so the linear predictor needs numbers.
# For a binary 0/1 variable the fitted effect is the same as for a two-level
# factor with reference level 0.
a <- rbinom(n, 1, 0.5)
b <- rbinom(n, 1, 0.5)

# plogis() is base R's inverse logit (the same function as arm::invlogit()).
probs <- plogis(b0 + b1 * a + b2 * b)
y <- rbinom(n, 1, prob = probs)

dat <- data.frame(a, b, y)
m1 <- glm(y ~ a + b, data = dat, family = binomial)
summary(m1)

# 
# Call:
#   glm(formula = y ~ a + b, family = binomial, data = dat)
# 
# Deviance Residuals: 
#   Min       1Q   Median       3Q      Max  
# -1.3056  -1.0828  -0.9662   1.1759   1.4045  
# 
# Coefficients:
#   Estimate Std. Error z value Pr(>|z|)    
# (Intercept)  0.003637   0.010994   0.331    0.741    
# a            0.292889   0.012811  22.862   <2e-16 ***
#   b           -0.523197   0.012812 -40.835   <2e-16 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# (Dispersion parameter for binomial family taken to be 1)
# 
# Null deviance: 138336  on 99999  degrees of freedom
# Residual deviance: 136141  on 99997  degrees of freedom
# AIC: 136147
# 
# Number of Fisher Scoring iterations: 4

Number of Fisher Scoring iterations: 4

根据公式生成模拟数据

Generating simulated data based on a formula

simulation

r

montecarlo