simstudy:genData 产生相同的值
simstudy: genData producing identical values
这是我第一次尝试模拟数据——我们想模拟一个数据集,并选择通过以下代码使用 simstudy 包:
# Covariate definitions: each variable's formula is the sample mean of the
# matching column in `df`. Only varname and formula are supplied here; every
# other defData() argument is left at its default.
def <- defData(
  varname = "median_household_income",
  formula = mean(df$median_household_income)
)
def <- defData(
  def,
  varname = "share_unemployed_seasonal",
  formula = mean(df$share_unemployed_seasonal)
)
def <- defData(
  def,
  varname = "share_population_in_metro_areas",
  formula = mean(df$share_population_in_metro_areas)
)
def <- defData(
  def,
  varname = "share_population_with_high_school_degree",
  formula = mean(df$share_population_with_high_school_degree)
)
def <- defData(
  def,
  varname = "share_non_citizen",
  formula = mean(df$share_non_citizen)
)
def <- defData(
  def,
  varname = "share_white_poverty",
  formula = mean(df$share_white_poverty)
)
def <- defData(
  def,
  varname = "gini_index",
  formula = mean(df$gini_index)
)
def <- defData(
  def,
  varname = "share_non_white",
  formula = mean(df$share_non_white)
)
def <- defData(
  def,
  varname = "share_voters_voted_trump",
  formula = mean(df$share_voters_voted_trump)
)

# Outcome: a linear combination of the covariates defined above. The formula
# string is kept exactly as written, including its embedded line breaks.
def <- defData(
  def,
  varname = "avg_hatecrimes_per_100k_fbi",
  formula = ".0001*median_household_income + 44*share_unemployed_seasonal +
-2.8*share_population_in_metro_areas +
24*share_population_with_high_school_degree + 22*share_non_citizen +
3.2*share_white_poverty + 55*gini_index + -4*share_non_white +
-2.6*share_voters_voted_trump"
)

# Generate 10,000 simulated rows from the definitions accumulated in `def`.
df_sim <- genData(10000, def)
输出如下所示:
head(df_sim)
id median_household_income share_unemployed_seasonal share_population_in_metro_areas
1: 1 55223.61 0.04956863 0.7501961
2: 2 55223.61 0.04956863 0.7501961
3: 3 55223.61 0.04956863 0.7501961
4: 4 55223.61 0.04956863 0.7501961
5: 5 55223.61 0.04956863 0.7501961
6: 6 55223.61 0.04956863 0.7501961
为什么生成的值全都相同?据我理解,默认情况下变量是按正态分布生成的。非常感谢任何帮助!
看起来您使用的是 simstudy 包。如果查看 defData 函数的文档 (link here),您会发现 defData 函数的 variance 参数默认为零。方差为零的正态分布只会取到均值本身,因此每个观测值都相同。如果希望各观测值互不相同,需要将该参数设置为大于 0 的数。
defData 函数的默认行为(各参数的默认值)如下:
defData(dtDefs = NULL, varname, formula, variance = 0,
dist = "normal", link = "identity", id = "id")
所以您可能需要运行类似下面这样的命令:
# Same definition as in the question, but with a non-zero variance so the
# simulated values are no longer all identical.
def <- defData(
  varname = "median_household_income",
  formula = mean(df$median_household_income),
  variance = 1
)
这是我第一次尝试模拟数据——我们想模拟一个数据集,并选择通过以下代码使用 simstudy 包:
# Covariate definitions: each variable's formula is the sample mean of the
# matching column in `df`. Only varname and formula are supplied here; every
# other defData() argument is left at its default.
def <- defData(
  varname = "median_household_income",
  formula = mean(df$median_household_income)
)
def <- defData(
  def,
  varname = "share_unemployed_seasonal",
  formula = mean(df$share_unemployed_seasonal)
)
def <- defData(
  def,
  varname = "share_population_in_metro_areas",
  formula = mean(df$share_population_in_metro_areas)
)
def <- defData(
  def,
  varname = "share_population_with_high_school_degree",
  formula = mean(df$share_population_with_high_school_degree)
)
def <- defData(
  def,
  varname = "share_non_citizen",
  formula = mean(df$share_non_citizen)
)
def <- defData(
  def,
  varname = "share_white_poverty",
  formula = mean(df$share_white_poverty)
)
def <- defData(
  def,
  varname = "gini_index",
  formula = mean(df$gini_index)
)
def <- defData(
  def,
  varname = "share_non_white",
  formula = mean(df$share_non_white)
)
def <- defData(
  def,
  varname = "share_voters_voted_trump",
  formula = mean(df$share_voters_voted_trump)
)

# Outcome: a linear combination of the covariates defined above. The formula
# string is kept exactly as written, including its embedded line breaks.
def <- defData(
  def,
  varname = "avg_hatecrimes_per_100k_fbi",
  formula = ".0001*median_household_income + 44*share_unemployed_seasonal +
-2.8*share_population_in_metro_areas +
24*share_population_with_high_school_degree + 22*share_non_citizen +
3.2*share_white_poverty + 55*gini_index + -4*share_non_white +
-2.6*share_voters_voted_trump"
)

# Generate 10,000 simulated rows from the definitions accumulated in `def`.
df_sim <- genData(10000, def)
输出如下所示:
head(df_sim)
id median_household_income share_unemployed_seasonal share_population_in_metro_areas
1: 1 55223.61 0.04956863 0.7501961
2: 2 55223.61 0.04956863 0.7501961
3: 3 55223.61 0.04956863 0.7501961
4: 4 55223.61 0.04956863 0.7501961
5: 5 55223.61 0.04956863 0.7501961
6: 6 55223.61 0.04956863 0.7501961
为什么生成的值全都相同?据我理解,默认情况下变量是按正态分布生成的。非常感谢任何帮助!
看起来您使用的是 simstudy 包。如果查看 defData 函数的文档 (link here),您会发现 defData 函数的 variance 参数默认为零。方差为零的正态分布只会取到均值本身,因此每个观测值都相同。如果希望各观测值互不相同,需要将该参数设置为大于 0 的数。
defData 函数的默认行为(各参数的默认值)如下:
defData(dtDefs = NULL, varname, formula, variance = 0,
dist = "normal", link = "identity", id = "id")
所以您可能需要运行类似下面这样的命令:
def <- defData(varname='median_household_income',
formula=mean(df$median_household_income),
variance = 1)