每个人重复一个模型 100 次
Repeat a model 100 times per individual
我想在 R 中每个人重复一个模型 (GAM) 100 次。然后我想对这些模型进行模型平均并获得每个人的平均系数。我有一个包含一组个体 (dist_FS.utm) 的响应 + 解释变量的数据集。
> head(dist_FS.utm)
id min_dist_to_FS used min_dist_exp x_utm y_utm
1 41 918.8052 1 0.4581917 465801.8 5033858
2 41 1863.5125 1 0.7114719 465682.4 5041281
3 41 830.5054 1 0.4253230 460491.4 5040223
4 41 3381.4481 1 0.8951711 464405.4 5039687
5 41 3392.7368 1 0.8959575 464442.1 5059669
6 41 654.1306 1 0.3535795 464495.0 5061919
我已经检查了一些问题,例如:simulate a linear model 100 times and How to repeat a process N times? 但我还是没能做到。
到目前为止,我已经这样做了:
for (i in (unique(dist_FS.utm$id))) {
db <- dist_FS.utm[dist_FS.utm$id==i,]
gam_id <- gam(used~min_dist_exp + s(x_utm, y_utm), data = db, family=binomial)
gam_id_100 <- t(sapply(1:100,gam_id))
a <- model.avg(gam_id_100)
}
我得到这个错误:
Error in get(as.character(FUN), mode = "function", envir = envir) :
object 'gam_id' of mode 'function' was not found
我认为这是因为 gam_id
不是函数 (https://www.guru99.com/r-apply-sapply-tapply.html). I was trying to write this as a function and then use replicate(), but I am not sure how, because I need to split by individual... (I'm not very familiar with functions and loops, but saw that here: https://nicercode.github.io/guides/repeating-things/)
有谁知道我怎样才能有效地做到这一点?
根据我上面写的代码,即使它可以工作,我不认为我会得到每个人的平均系数,对吧?如果是这样,我怎样才能得到它?也许我必须将它们存储在某个地方...
编辑
>dput(head(dist_FS.utm,30))
structure(list(X = c(1L, 15L, 25L, 32L, 33L, 34L, 38L, 39L, 40L,
41L, 42L, 43L, 44L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L,
55L, 56L, 58L, 59L, 60L, 61L, 62L, 73L), animals_id = c(41L,
41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L,
41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L,
41L, 41L, 41L), min_dist_to_FS = c(918.80522737075, 1863.51246503695,
830.505382951001, 3381.44807737323, 3392.73683300527, 654.130586700104,
2566.31958442098, 1053.65311284759, 966.144306603196, 873.484749339232,
121.806430036914, 1387.64623589338, 1004.18377421608, 1549.18381508086,
1419.58318230344, 1467.30800101015, 1982.15665056485, 1574.24870423625,
1649.08689860009, 1509.82936705109, 1522.69465586994, 1549.32812543367,
1433.34459545157, 1399.18930656071, 1516.77860478692, 1517.30360715202,
1535.19659260561, 1527.56075938311, 1507.97021104668, 955.188805770983
), used = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), min_dist_exp = c(0.458191730619659, 0.711471904832608, 0.425322974299617,
0.895171109000169, 0.895957464432406, 0.353579527899345, 0.819447766862565,
0.504796567587344, 0.475032147975436, 0.441563479599525, 0.0780321158438149,
0.60369059194633, 0.488184247651747, 0.644171214938928, 0.612043470340838,
0.624198588744711, 0.733424689932805, 0.650070609306066, 0.667109263467795,
0.634707248445273, 0.63782846850107, 0.6442054635922, 0.615588176935089,
0.606730152341633, 0.636396514046123, 0.63652381717982, 0.640835984546634,
0.639002059809323, 0.634253983612545, 0.471181990354149), x_utm = c(455801.848652911,
455682.436631772, 450491.389615233, 454405.43316214, 454442.094012587,
454495.000195656, 455008.476539838, 449097.624664053, 450540.707026459,
450500.924335471, 449766.142753517, 448660.380605766, 448291.700201206,
447688.093883708, 447676.546228742, 447828.232148529, 451793.759557934,
447638.292913575, 447576.821010766, 447710.347453009, 447691.498899187,
447679.063697279, 447658.862064349, 447704.743534774, 447700.077591867,
447683.441775526, 447696.249109853, 447715.294788822, 447692.191785761,
449126.687863692), y_utm = c(5043858.23523823, 5051281.39009194,
5050222.99885332, 5049686.7739777, 5049669.10780019, 5041919.21100737,
5046290.39022494, 5048739.43310135, 5050065.94922413, 5050163.4816897,
5050810.6018219, 5051476.22456451, 5050628.56392999, 5048652.92037187,
5048857.81952291, 5048635.6397179, 5050709.27628448, 5048667.78083257,
5048623.8530378, 5048686.25515273, 5048686.96825384, 5048661.69552229,
5048857.78968914, 5048855.73937433, 5048686.70765026, 5048702.95718389,
5048664.51836415, 5048656.5817114, 5048707.32827632, 5045708.60831853
)), row.names = c(NA, 30L), class = "data.frame")
考虑 by
(tapply
的面向对象包装器),它可以通过一个或多个因素对数据帧进行子集化,并将子集传递给方法。对于该方法,在 sapply
中指定 function
或使用其包装器 replicate
,它不需要输入参数,但会重新 运行 任何表达式 n 次。
此外,而不是 model.avg
从 gam
模型对象中提取特定系数,然后对所有 100 个结果进行平均。请注意:您没有指定 运行 平均值的确切系数。下面仅提取 min_dist_exp
个术语。
gam_id_avg_coeff_list <- by(dist_FS.utm, dist_FS.utm$animals_id, function(sub) {
gam_formula <- used ~ min_dist_exp + s(x_utm, y_utm)
gam_mod_100 <- replicate(100, {
mod <- summary(gam(gam_formula, data=sub, family=binomial))
mod$coefficients["min_dist_exp", "Estimate"] # EXTRACT SPECIFIC COEFF
})
# ALTERNATIVELY WITH sapply
# gam_mod_100 <- sapply(1:100, function(i) {
# mod <- summary(gam(gam_formula, data=sub, family=binomial))
# mod$coefficients["min_dist_exp", "Estimate"] # EXTRACT SPECIFIC COEFF
# })
return(mean(gam_mod_100, na.rm=TRUE)) # RETURN MEAN OF COEFFICIENTS
})
gam_id_avg_coeff_list
# CONVERT TO NAMED VECTOR
gam_id_avg_coeff_vec <- c(gam_id_avg_coeff_list)
# CONVERT TO NAMED MATRIX
gam_id_avg_coeff_matrix <- matrix(gam_id_avg_coeff_list,
dimnames = list(names(gam_id_avg_coeff_list), "mean_100"))
我想在 R 中每个人重复一个模型 (GAM) 100 次。然后我想对这些模型进行模型平均并获得每个人的平均系数。我有一个包含一组个体 (dist_FS.utm) 的响应 + 解释变量的数据集。
> head(dist_FS.utm)
id min_dist_to_FS used min_dist_exp x_utm y_utm
1 41 918.8052 1 0.4581917 465801.8 5033858
2 41 1863.5125 1 0.7114719 465682.4 5041281
3 41 830.5054 1 0.4253230 460491.4 5040223
4 41 3381.4481 1 0.8951711 464405.4 5039687
5 41 3392.7368 1 0.8959575 464442.1 5059669
6 41 654.1306 1 0.3535795 464495.0 5061919
我已经检查了一些问题,例如:simulate a linear model 100 times and How to repeat a process N times? 但我还是没能做到。
到目前为止,我已经这样做了:
for (i in (unique(dist_FS.utm$id))) {
db <- dist_FS.utm[dist_FS.utm$id==i,]
gam_id <- gam(used~min_dist_exp + s(x_utm, y_utm), data = db, family=binomial)
gam_id_100 <- t(sapply(1:100,gam_id))
a <- model.avg(gam_id_100)
}
我得到这个错误:
Error in get(as.character(FUN), mode = "function", envir = envir) :
object 'gam_id' of mode 'function' was not found
我认为这是因为 gam_id
不是函数 (https://www.guru99.com/r-apply-sapply-tapply.html). I was trying to write this as a function and then use replicate(), but I am not sure how, because I need to split by individual... (I'm not very familiar with functions and loops, but saw that here: https://nicercode.github.io/guides/repeating-things/)
有谁知道我怎样才能有效地做到这一点?
根据我上面写的代码,即使它可以工作,我不认为我会得到每个人的平均系数,对吧?如果是这样,我怎样才能得到它?也许我必须将它们存储在某个地方...
编辑
>dput(head(dist_FS.utm,30)) structure(list(X = c(1L, 15L, 25L, 32L, 33L, 34L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 58L, 59L, 60L, 61L, 62L, 73L), animals_id = c(41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L), min_dist_to_FS = c(918.80522737075, 1863.51246503695, 830.505382951001, 3381.44807737323, 3392.73683300527, 654.130586700104, 2566.31958442098, 1053.65311284759, 966.144306603196, 873.484749339232, 121.806430036914, 1387.64623589338, 1004.18377421608, 1549.18381508086, 1419.58318230344, 1467.30800101015, 1982.15665056485, 1574.24870423625, 1649.08689860009, 1509.82936705109, 1522.69465586994, 1549.32812543367, 1433.34459545157, 1399.18930656071, 1516.77860478692, 1517.30360715202, 1535.19659260561, 1527.56075938311, 1507.97021104668, 955.188805770983 ), used = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), min_dist_exp = c(0.458191730619659, 0.711471904832608, 0.425322974299617, 0.895171109000169, 0.895957464432406, 0.353579527899345, 0.819447766862565, 0.504796567587344, 0.475032147975436, 0.441563479599525, 0.0780321158438149, 0.60369059194633, 0.488184247651747, 0.644171214938928, 0.612043470340838, 0.624198588744711, 0.733424689932805, 0.650070609306066, 0.667109263467795, 0.634707248445273, 0.63782846850107, 0.6442054635922, 0.615588176935089, 0.606730152341633, 0.636396514046123, 0.63652381717982, 0.640835984546634, 0.639002059809323, 0.634253983612545, 0.471181990354149), x_utm = c(455801.848652911, 455682.436631772, 450491.389615233, 454405.43316214, 454442.094012587, 454495.000195656, 455008.476539838, 449097.624664053, 450540.707026459, 450500.924335471, 449766.142753517, 448660.380605766, 448291.700201206, 447688.093883708, 447676.546228742, 447828.232148529, 451793.759557934, 447638.292913575, 447576.821010766, 447710.347453009, 447691.498899187, 447679.063697279, 447658.862064349, 447704.743534774, 447700.077591867, 447683.441775526, 447696.249109853, 447715.294788822, 447692.191785761, 449126.687863692), y_utm = c(5043858.23523823, 5051281.39009194, 5050222.99885332, 5049686.7739777, 5049669.10780019, 5041919.21100737, 5046290.39022494, 5048739.43310135, 5050065.94922413, 5050163.4816897, 5050810.6018219, 5051476.22456451, 5050628.56392999, 5048652.92037187, 5048857.81952291, 5048635.6397179, 5050709.27628448, 5048667.78083257, 5048623.8530378, 5048686.25515273, 5048686.96825384, 5048661.69552229, 5048857.78968914, 5048855.73937433, 5048686.70765026, 5048702.95718389, 5048664.51836415, 5048656.5817114, 5048707.32827632, 5045708.60831853 )), row.names = c(NA, 30L), class = "data.frame")
考虑 by
(tapply
的面向对象包装器),它可以通过一个或多个因素对数据帧进行子集化,并将子集传递给方法。对于该方法,在 sapply
中指定 function
或使用其包装器 replicate
,它不需要输入参数,但会重新 运行 任何表达式 n 次。
此外,而不是 model.avg
从 gam
模型对象中提取特定系数,然后对所有 100 个结果进行平均。请注意:您没有指定 运行 平均值的确切系数。下面仅提取 min_dist_exp
个术语。
gam_id_avg_coeff_list <- by(dist_FS.utm, dist_FS.utm$animals_id, function(sub) {
gam_formula <- used ~ min_dist_exp + s(x_utm, y_utm)
gam_mod_100 <- replicate(100, {
mod <- summary(gam(gam_formula, data=sub, family=binomial))
mod$coefficients["min_dist_exp", "Estimate"] # EXTRACT SPECIFIC COEFF
})
# ALTERNATIVELY WITH sapply
# gam_mod_100 <- sapply(1:100, function(i) {
# mod <- summary(gam(gam_formula, data=sub, family=binomial))
# mod$coefficients["min_dist_exp", "Estimate"] # EXTRACT SPECIFIC COEFF
# })
return(mean(gam_mod_100, na.rm=TRUE)) # RETURN MEAN OF COEFFICIENTS
})
gam_id_avg_coeff_list
# CONVERT TO NAMED VECTOR
gam_id_avg_coeff_vec <- c(gam_id_avg_coeff_list)
# CONVERT TO NAMED MATRIX
gam_id_avg_coeff_matrix <- matrix(gam_id_avg_coeff_list,
dimnames = list(names(gam_id_avg_coeff_list), "mean_100"))