R 中的分层抽样——样本量问题
Stratified Sampling in R- sample size issues
我正在尝试使用 splitstackshape 包中的 stratified() 函数在 R 中进行分层抽样。我有四个层(标记为 1:4)。当设置 size = 1 时,它会为每个层返回一行(太棒了!)。但是,我无法让样本量只增加一行。
我想要选取 5 行:其中 4 行分别来自层 1:4,第五行来自层 #1(在我的研究地点中覆盖面积最大的层);理想情况下,这应当以无放回的方式完成,这样从层 #1 抽到的第二行就会与第一行不同。
将 size 设置为 1 到 1.99 之间的任意值时,始终返回每层 1 行(共 4 行)。设置 size = 2 则返回 8 行(每层 2 行)。
数据框
# Example data for the question (dput-style literal): 26 rows, one per
# transect_number (1:26), all with waterbody "Homer".
# `cluster` is the stratum column. In this data it takes the values 1, 3, 4
# and 5 (9, 2, 8 and 7 rows respectively), i.e. four strata.
# The remaining columns (BLG, GSF, LMB, YLB, BLC, WHC, RSF, CCF, BLB) are
# numeric; presumably per-transect measurements by species code -- TODO confirm.
homer_join_strat<- structure(list(cluster = c(4L, 3L, 4L, 5L, 5L, 4L, 4L, 4L, 5L,
5L, 5L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 5L, 1L, 1L, 5L, 4L, 4L,
4L), waterbody = c("Homer", "Homer", "Homer", "Homer", "Homer",
"Homer", "Homer", "Homer", "Homer", "Homer", "Homer", "Homer",
"Homer", "Homer", "Homer", "Homer", "Homer", "Homer", "Homer",
"Homer", "Homer", "Homer", "Homer", "Homer", "Homer", "Homer"
), transect_number = 1:26, BLG = c(75, 38.4204909284952, 77.634011090573,
82.1917808219178, 119.341563786008, 22.5422667501565, 155.275381552754,
81.1332904056665, 37.037037037037, 73.2824427480916, 71.608040201005,
208.806818181818, 116.504854368932, 119.775421085465, 104.408352668213,
117.391304347826, 12.0603015075377, 93.5593220338983, 166.795366795367,
20, 91.566265060241, 70.8860759493671, 0, 44.8765893792072, 0,
3.96563119629874), GSF = c(0, 6.4034151547492, 11.090573012939,
16.4383561643836, 4.11522633744856, 0, 0, 0, 0, 4.58015267175573,
0, 21.3068181818182, 0, 0, 6.96055684454756, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), LMB = c(51.3157894736842, 83.2443970117396, 73.9371534195933,
71.2328767123288, 28.8065843621399, 37.5704445835942, 59.721300597213,
38.6349001931745, 66.6666666666667, 77.8625954198473, 67.8391959798995,
63.9204545454545, 46.6019417475728, 22.4578914535246, 6.96055684454756,
13.0434782608696, 24.1206030150754, 40.6779661016949, 60.2316602316602,
56, 28.9156626506024, 55.6962025316456, 20.2360876897133, 31.413612565445,
0, 31.7250495703899), YLB = c(0, 0, 14.7874306839187, 0, 4.11522633744856,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.63320463320463, 0, 0,
0, 0, 0, 0, 0), BLC = c(7.89473684210526, 12.8068303094984, 7.39371534195933,
10.958904109589, 0, 0, 3.9814200398142, 0, 3.7037037037037, 13.7404580152672,
11.3065326633166, 12.7840909090909, 3.88349514563107, 3.74298190892077,
0, 0, 0, 4.06779661016949, 0, 16, 9.63855421686747, 5.06329113924051,
20.2360876897133, 22.4382946896036, 0, 7.93126239259749), WHC = c(0,
0, 0, 0, 0, 0, 3.9814200398142, 0, 7.40740740740741, 0, 0, 0,
0, 7.48596381784155, 0, 0, 0, 0, 0, 0, 0, 10.126582278481, 0,
4.48765893792072, 0, 0), RSF = c(0, 0, 0, 0, 0, 0, 11.9442601194426,
0, 0, 13.7404580152672, 0, 0, 0, 0, 0, 0, 0, 0, 27.7992277992278,
0, 4.81927710843374, 0, 0, 4.48765893792072, 0, 0), CCF = c(0,
0, 0, 0, 0, 0, 0, 0, 3.7037037037037, 0, 0, 0, 3.88349514563107,
0, 0, 0, 0, 8.13559322033898, 0, 0, 0, 0, 0, 0, 0, 0), BLB = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8.52272727272727, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-26L))
代码
# Fractional sizes: per the splitstackshape documentation, a size below 1 is
# taken as the proportion to sample from each stratum of `cluster`.
stratified(homer_join_strat, group = "cluster", size = 0.06)
stratified(homer_join_strat, group = "cluster", size = 0.07)
stratified(homer_join_strat, group = "cluster", size = 0.09)
# A single whole-number size is applied to every stratum; the question
# reports that this call returns 8 rows (2 per stratum).
stratified(homer_join_strat, group = "cluster", size = 2)
有没有人用这个函数选取过行数既不等于层数、也不是层数整数倍的样本?
我可以通过分别设置 size = .06、.07、.09 来选取不同数量的行(这也是为什么我以为 1 到 1.99 之间的数字会让我选出 5 行)。
提前致谢!
您正在使用的函数的文档中写道:“如果 size 是一个命名向量,该函数将检查向量的长度是否与组数匹配以及名称是否与组名匹配”——文档随后给出了一个正好对应您这个问题的示例。
# Build a 100-row demo data frame; column D holds the strata (CA / NY / TX).
# Columns are generated in the same order as before so the random draws match.
DF <- data.frame(
  ID = seq_len(100),
  A = sample(c("AA", "BB", "CC", "DD", "EE"), size = 100, replace = TRUE),
  B = rnorm(100),
  C = abs(round(rnorm(100), digits = 1)),
  D = sample(c("CA", "NY", "TX"), size = 100, replace = TRUE),
  E = sample(c("M", "F"), size = 100, replace = TRUE)
)
# A named size vector requests a separate sample size for each stratum:
# 5 rows from CA, 3 from NY and 2 from TX.
stratified(DF, group = "D", size = c(CA = 5, NY = 3, TX = 2))
我正在尝试使用 splitstackshape 包中的 stratified() 函数在 R 中进行分层抽样。我有四个层(标记为 1:4)。当设置 size = 1 时,它会为每个层返回一行(太棒了!)。但是,我无法让样本量只增加一行。
我想要选取 5 行:其中 4 行分别来自层 1:4,第五行来自层 #1(在我的研究地点中覆盖面积最大的层);理想情况下,这应当以无放回的方式完成,这样从层 #1 抽到的第二行就会与第一行不同。
将 size 设置为 1 到 1.99 之间的任意值时,始终返回每层 1 行(共 4 行)。设置 size = 2 则返回 8 行(每层 2 行)。
数据框
# Example data for the question (dput-style literal): 26 rows, one per
# transect_number (1:26), all with waterbody "Homer".
# `cluster` is the stratum column. In this data it takes the values 1, 3, 4
# and 5 (9, 2, 8 and 7 rows respectively), i.e. four strata.
# The remaining columns (BLG, GSF, LMB, YLB, BLC, WHC, RSF, CCF, BLB) are
# numeric; presumably per-transect measurements by species code -- TODO confirm.
homer_join_strat<- structure(list(cluster = c(4L, 3L, 4L, 5L, 5L, 4L, 4L, 4L, 5L,
5L, 5L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 5L, 1L, 1L, 5L, 4L, 4L,
4L), waterbody = c("Homer", "Homer", "Homer", "Homer", "Homer",
"Homer", "Homer", "Homer", "Homer", "Homer", "Homer", "Homer",
"Homer", "Homer", "Homer", "Homer", "Homer", "Homer", "Homer",
"Homer", "Homer", "Homer", "Homer", "Homer", "Homer", "Homer"
), transect_number = 1:26, BLG = c(75, 38.4204909284952, 77.634011090573,
82.1917808219178, 119.341563786008, 22.5422667501565, 155.275381552754,
81.1332904056665, 37.037037037037, 73.2824427480916, 71.608040201005,
208.806818181818, 116.504854368932, 119.775421085465, 104.408352668213,
117.391304347826, 12.0603015075377, 93.5593220338983, 166.795366795367,
20, 91.566265060241, 70.8860759493671, 0, 44.8765893792072, 0,
3.96563119629874), GSF = c(0, 6.4034151547492, 11.090573012939,
16.4383561643836, 4.11522633744856, 0, 0, 0, 0, 4.58015267175573,
0, 21.3068181818182, 0, 0, 6.96055684454756, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), LMB = c(51.3157894736842, 83.2443970117396, 73.9371534195933,
71.2328767123288, 28.8065843621399, 37.5704445835942, 59.721300597213,
38.6349001931745, 66.6666666666667, 77.8625954198473, 67.8391959798995,
63.9204545454545, 46.6019417475728, 22.4578914535246, 6.96055684454756,
13.0434782608696, 24.1206030150754, 40.6779661016949, 60.2316602316602,
56, 28.9156626506024, 55.6962025316456, 20.2360876897133, 31.413612565445,
0, 31.7250495703899), YLB = c(0, 0, 14.7874306839187, 0, 4.11522633744856,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.63320463320463, 0, 0,
0, 0, 0, 0, 0), BLC = c(7.89473684210526, 12.8068303094984, 7.39371534195933,
10.958904109589, 0, 0, 3.9814200398142, 0, 3.7037037037037, 13.7404580152672,
11.3065326633166, 12.7840909090909, 3.88349514563107, 3.74298190892077,
0, 0, 0, 4.06779661016949, 0, 16, 9.63855421686747, 5.06329113924051,
20.2360876897133, 22.4382946896036, 0, 7.93126239259749), WHC = c(0,
0, 0, 0, 0, 0, 3.9814200398142, 0, 7.40740740740741, 0, 0, 0,
0, 7.48596381784155, 0, 0, 0, 0, 0, 0, 0, 10.126582278481, 0,
4.48765893792072, 0, 0), RSF = c(0, 0, 0, 0, 0, 0, 11.9442601194426,
0, 0, 13.7404580152672, 0, 0, 0, 0, 0, 0, 0, 0, 27.7992277992278,
0, 4.81927710843374, 0, 0, 4.48765893792072, 0, 0), CCF = c(0,
0, 0, 0, 0, 0, 0, 0, 3.7037037037037, 0, 0, 0, 3.88349514563107,
0, 0, 0, 0, 8.13559322033898, 0, 0, 0, 0, 0, 0, 0, 0), BLB = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8.52272727272727, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-26L))
代码
# Fractional sizes: per the splitstackshape documentation, a size below 1 is
# taken as the proportion to sample from each stratum of `cluster`.
stratified(homer_join_strat, group = "cluster", size = 0.06)
stratified(homer_join_strat, group = "cluster", size = 0.07)
stratified(homer_join_strat, group = "cluster", size = 0.09)
# A single whole-number size is applied to every stratum; the question
# reports that this call returns 8 rows (2 per stratum).
stratified(homer_join_strat, group = "cluster", size = 2)
有没有人用这个函数选取过行数既不等于层数、也不是层数整数倍的样本?
我可以通过分别设置 size = .06、.07、.09 来选取不同数量的行(这也是为什么我以为 1 到 1.99 之间的数字会让我选出 5 行)。
提前致谢!
您正在使用的函数的文档中写道:“如果 size 是一个命名向量,该函数将检查向量的长度是否与组数匹配以及名称是否与组名匹配”——文档随后给出了一个正好对应您这个问题的示例。
# Build a 100-row demo data frame; column D holds the strata (CA / NY / TX).
# Columns are generated in the same order as before so the random draws match.
DF <- data.frame(
  ID = seq_len(100),
  A = sample(c("AA", "BB", "CC", "DD", "EE"), size = 100, replace = TRUE),
  B = rnorm(100),
  C = abs(round(rnorm(100), digits = 1)),
  D = sample(c("CA", "NY", "TX"), size = 100, replace = TRUE),
  E = sample(c("M", "F"), size = 100, replace = TRUE)
)
# A named size vector requests a separate sample size for each stratum:
# 5 rows from CA, 3 from NY and 2 from TX.
stratified(DF, group = "D", size = c(CA = 5, NY = 3, TX = 2))