基于另一个数据集创建分层样本
Creating stratified sample based on another dataset
我有一个数据库如下:
DT <- structure(list(Year = c(2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006,
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006,
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007), Type = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3), Value = c(0.504376244608734, 0.544791523560323, 0.536356351248399,
0.186754489979335, 0.0145059662169885, 0.552467068108315, 0.728991908748136,
0.0782701833265232, 0.0770140143185365, 0.745720346755096, 0.182549844851049,
0.0037854136407528, 0.892426526130476, 0.670307075099745, 0.0787676704471466,
0.243642889274613, 0.61622932816441, 0.773909954748003, 0.0368627127466908,
0.864836276200213, 0.363247130858897, 0.170719500081567, 0.458862115912474,
0.764369844834086, 0.22138732039061, 0.950217140815184, 0.119026355092504,
0.806698643902745, 0.809697143416323, 0.0161168403745759, 0.56149794546334,
0.0663374185634651, 0.851044662622003, 0.144127493261805, 0.646129610173195,
0.180326314861961, 0.346305710081752, 0.689186084156133, 0.0902438913162577,
0.493067567084055, 0.829728867159447, 0.212655417404949, 0.873112880345332,
0.57019799015934, 0.666924788035991, 0.421470848297274, 0.137822577124685,
0.646797965126931, 0.00186628356193685, 0.220630784144145, 0.636097250212043,
0.337161167241577, 0.763014675300797, 0.0290609945874959, 0.179775595422681,
0.926270372245386, 0.14413707866326, 0.308460218540821, 0.505730133160804,
0.92831463570813, 0.2406601397661, 0.469013177711661, 0.0514836845684897,
0.8773477591701, 0.988870207825279, 0.0409427390691713, 0.345261503182235,
0.457678159145652, 0.928521904779235, 0.981654149874765, 0.165376851871405,
0.657749413049735, 0.645610554242246, 0.288901032482677, 0.903464871012278,
0.91288926903878, 0.331819964874993, 0.451775254733976, 0.561567931867726,
0.934770693643712, 0.0515071551015609, 0.0772762108900331, 0.233674539049138,
0.636764452840065, 0.673165028674493, 0.806944576060158, 0.763410488346345,
0.661058275398286, 0.275215831961986, 0.821051953775588), Value2 = c(0.898973133700585,
0.0043728119746469, 0.90370150590114, 0.664255277142381, 0.478255150030532,
0.428181937562552, 0.0547471373342867, 0.382060484866744, 0.467990590870777,
0.44613758335896, 0.767317422802576, 0.378150639908367, 0.490578474103678,
0.677901331005272, 0.287571260541928, 0.201396158908221, 0.504989505596871,
0.854550423135574, 0.545208640791417, 0.951248990134053, 0.958420479001103,
0.916437669811835, 0.299402641214852, 0.966388390213139, 0.511359402704707,
0.0867219533353825, 0.88481040004275, 0.158676351804193, 0.0723357399252373,
0.605048894989562, 0.60104443547608, 0.608164723564692, 0.309073275149768,
0.183031315824665, 0.495737621177827, 0.981936843144856, 0.601436476710344,
0.442362735422709, 0.497899316486054, 0.0545162134700136, 0.572666465987199,
0.0134330483790179, 0.494252845049882, 0.752561338910785, 0.269231150235318,
0.580397043886635, 0.00438648885146109, 0.974859546601355, 0.964309270817873,
0.740961468264743, 0.966289928060099, 0.165450408579171, 0.457088887715921,
0.725271665700556, 0.611801886877621, 0.693114823445831, 0.509441044895801,
0.668642268489104, 0.0769213109282016, 0.0106313240133811, 0.653738670103508,
0.515077318720933, 0.0355798295524966, 0.916849288357794, 0.489540407953311,
0.355080030655249, 0.0584185346727107, 0.117505910926226, 0.840486642923002,
0.0919621689925281, 0.513293731647231, 0.813987689492758, 0.520895630669219,
0.417642884334403, 0.549898208275446, 0.190152036926942, 0.730222922437507,
0.247328458018061, 0.587109508511267, 0.850096530635719, 0.929032051736368,
0.929910983683225, 0.461558252621238, 0.106247873795127, 0.177666580357953,
0.85962988262837, 0.531897323076434, 0.105528819826748, 0.0349104003049517,
0.180758384726269), Value3 = c(0.728747048185938, 0.136214396563203,
0.0552254916905935, 0.888943411458351, 0.593186561829418, 0.142192475897417,
0.397839605231809, 0.128332683559321, 0.818143628566787, 0.675081193031822,
0.267554700398382, 0.289692778583473, 0.395043380675461, 0.582592369450023,
0.999361780203229, 0.421977850130829, 0.723404859329269, 0.333410997686596,
0.545945290276875, 0.510878802866974, 0.746682101648222, 0.625853669469718,
0.0366957172106372, 0.417685335838607, 0.106323486037796, 0.0127310987059773,
0.291264331038641, 0.690392584005106, 0.0367947033685097, 0.287721087095362,
0.389582158765541, 0.179954765659721, 0.688980485242488, 0.492296704771236,
0.177765364735501, 0.311877860895471, 0.402659917512069, 0.579307427105039,
0.588566648357923, 0.741057591300206, 0.111932877257211, 0.515443723005798,
0.679584351614947, 0.0197622696399569, 0.0326379476305644, 0.736148474541639,
0.0115696238487739, 0.0530159587501624, 0.710708890129421, 0.537042840144158,
0.0277825198238522, 0.851349803530179, 0.448963399024373, 0.42841165712813,
0.0615511042450435, 0.210541933956987, 0.983517611560273, 0.533691182135933,
0.61993895519575, 0.136074538018663, 0.716185070081669, 0.67982888131481,
0.186059692566576, 0.0129160598675656, 0.832257317305668, 0.0269936347869698,
0.579065014243438, 0.857987264303428, 0.270050217297758, 0.606374993010002,
0.565105220120649, 0.977264711860796, 0.14241840012272, 0.942496958955904,
0.652070963472916, 0.912867524689929, 0.0249357414986835, 0.87704909395977,
0.72849611059358, 0.525707690655331, 0.290223239565496, 0.992723233891769,
0.178173444691217, 0.0292681960925434, 0.65696953770876, 0.452973377851251,
0.471917712361899, 0.117830393053313, 0.126107861454795, 0.0848074010166607
)), row.names = c(NA, -90L), class = c("tbl_df", "tbl", "data.frame"
))
使用 DT
数据库,我想根据以下数据创建分层子样本:
Ratios <- structure(list(State = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "C", "C",
"C", "C", "C", "C", "C", "C"), Year = c(2005, 2005, 2005, 2006,
2006, 2006, 2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006,
2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006, 2007, 2007,
2007), Type = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), ratio = c(0.2, 0.2, 0.6,
0.45, 0.2, 0.35, 0.1, 0.1, 0.8, 0.05, 0.5, 0.45, 0.3, 0.5, 0.2,
0.3, 0.2, 0.5, 0.35, 0.45, 0.2, 0.35, 0.2, 0.45, 0.2, 0.2, 0.6
)), row.names = c(NA, -27L), class = c("tbl_df", "tbl", "data.frame"
))
图 1。Ratios
我正在想办法解决这个问题。为了获得最大可能的样本量,我需要根据比率计算出最大可能的子样本。图 1. Ratios
显示了 2005 年国家 A 的比率。
我首先要确定 DT
中的比率:
# This is just to determine the ratio of `Type` in the data `DT`
DT_1 <- DT %>% count(Year, Type) %>% group_by(Year)
DT_2 <- DT %>% count(Year, Type) %>% group_by(Year) %>% mutate(n = n/sum(n))
DT_RAT <- cbind(DT_1 , DT_2)
setDT(DT_RAT)[,Year1:=NULL]
setDT(DT_RAT)[,Type1:=NULL]
rm(DT_1 , DT_2)
现在我应该根据这些信息创建示例。我必须从最大的份额开始(对于国家 A,这是 Type==1
。)所以对于样本 DT_A
,样本的一半应该是 Type
= 1。从 DT_RAT
我知道这相当于 Type 1
的 13 次观察(类型 1 观察的最大数量)。我想做类似的事情:
DT_A <- DT[,.SD[sample(.N, min(13,.N))],by = Type==1]
但我不允许这样做。谁能帮我实现这个目标?
编辑:
DT_A
(状态 A 的 DT
的子样本)的期望结果如下。
2005年DT_A的- 0.5应该是
Type 1
(见图1)
2005 年 DT_A 的 - 0.45 应该包括
Type 2
2005 年 DT_A 的 - 0.05 应包括
Type 3
DT 在 2005 年对 Type 1
进行了 13 次观察,因此我们将所有这些都考虑在内。因为这13个应该是样本的一半。总样本是 26 个观察值
2005 年还有 13 个 Type 2
的观测值,但由于只有 0.45 个应该由 Type 2
组成,我们取 12 个。只有 5% 应该是 Type 3
,所以这是剩下的 1 个观测值。
- 13 = 0.5 的 DT_A 2005 年应包括
Type 1
(见图 1)
- 12 = 2005 年 DT_A 的 0.45 应该包括
Type 2
- 1 = 0.05 of DT_A in 2005 应该包括
Type 3
我认为这条管道可以满足您的需求:
library(tidyr)
library(dplyr)
library(purrr)
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size)) %>%
mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>%
select(State, Year, Type, sample) %>%
arrange(State, Year, Type)
输出
# A tibble: 27 x 4
State Year Type sample
<chr> <dbl> <dbl> <list>
1 A 2005 1 <tibble [1 x 3]>
2 A 2005 2 <tibble [1 x 3]>
3 A 2005 3 <tibble [4 x 3]>
4 A 2006 1 <tibble [4 x 3]>
5 A 2006 2 <tibble [2 x 3]>
6 A 2006 3 <tibble [3 x 3]>
7 A 2007 1 <tibble [1 x 3]>
8 A 2007 2 <tibble [1 x 3]>
9 A 2007 3 <tibble [5 x 3]>
10 B 2005 1 <tibble [1 x 3]>
# ... with 17 more rows
您可以进一步向该管道追加一个 %>% unnest(sample)
,但我会保留它,因为您从未指定您的预期输出。这种格式也方便我给大家解释背后的逻辑。
关键是您需要为 State, Year, Type
的每个组合确定合适的样本量。我用来确定样本量的公式是
sample_size = pmax(1, round(min(n / ratio) * ratio))
例如,设n = c(13, 13, 4)
,与您的示例显示的数字相同。您想要根据 ratio = c(0.5, 0.45, 0.05)
对样本进行切片。那么,您可以拥有的最大样本量是 min(n / ratio)
,即 min(c(26, 28.88889, 80)) = 26
。任何大于此的数字都是无效的,因为您没有足够的观察值来抽样(在这种情况下对于类型 2 和 3)。然后我们使用总样本量乘以比率来计算每种类型的样本量。我们必须 round
结果,因为只允许使用自然数;我们还需要对每种类型至少进行一次观察。这就是为什么您会看到 pmax(1, ...)
.
对于您 post 中显示的示例,此公式 returns
> pmax(1, round(min(c(13, 13, 4) / c(0.5, 0.45, 0.05)) * c(0.5, 0.45, 0.05)))
[1] 13 12 1
有了这个设置,我们就可以开始你想要的采样程序了。我们先
> DT %>%
count(Year, Type)
# A tibble: 9 x 3
Year Type n
<dbl> <dbl> <int>
1 2005 1 13
2 2005 2 13
3 2005 3 4
4 2006 1 16
5 2006 2 11
6 2006 3 3
7 2007 1 11
8 2007 2 14
9 2007 3 5
以便我们知道每个 Year
和 Type
有多少观察值。那么,
> DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio)))
# A tibble: 27 x 6
# Groups: State, Year [9]
Year Type n State ratio sample_size
<dbl> <dbl> <int> <chr> <dbl> <dbl>
1 2005 1 13 A 0.2 1
2 2005 1 13 B 0.05 1
3 2005 1 13 B 0.35 3
4 2005 2 13 A 0.2 1
5 2005 2 13 B 0.5 4
6 2005 2 13 C 0.45 9
7 2005 3 4 A 0.6 4
8 2005 3 4 B 0.45 4
9 2005 3 4 C 0.2 4
10 2006 1 16 A 0.45 4
# ... with 17 more rows
以便我们知道每个 Year
、Type
和 State
的适当样本量。接下来,
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size))
# A tibble: 27 x 5
State Year Type sample_size data
<chr> <dbl> <dbl> <dbl> <list>
1 A 2005 1 1 <tibble [13 x 3]>
2 B 2005 1 1 <tibble [13 x 3]>
3 B 2005 1 3 <tibble [13 x 3]>
4 A 2005 2 1 <tibble [13 x 3]>
5 B 2005 2 4 <tibble [13 x 3]>
6 C 2005 2 9 <tibble [13 x 3]>
7 A 2005 3 4 <tibble [4 x 3]>
8 B 2005 3 4 <tibble [4 x 3]>
9 C 2005 3 4 <tibble [4 x 3]>
10 A 2006 1 4 <tibble [16 x 3]>
# ... with 17 more rows
以便每个 Year
和 Type
的样本大小和数据都匹配。最后,我们使用slice_sample
函数对每组的数据进行采样,select我们需要的列,然后排列数据进行展示。
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size)) %>%
mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>%
select(State, Year, Type, sample) %>%
arrange(State, Year, Type)
此管道的输出显示在此 post 的开头。我认为您也可以通过 data.table
方式获得相同的结果。但是,我会这样保留它,因为此管道更适合说明目的。
我有一个数据库如下:
DT <- structure(list(Year = c(2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006,
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006,
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007), Type = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3), Value = c(0.504376244608734, 0.544791523560323, 0.536356351248399,
0.186754489979335, 0.0145059662169885, 0.552467068108315, 0.728991908748136,
0.0782701833265232, 0.0770140143185365, 0.745720346755096, 0.182549844851049,
0.0037854136407528, 0.892426526130476, 0.670307075099745, 0.0787676704471466,
0.243642889274613, 0.61622932816441, 0.773909954748003, 0.0368627127466908,
0.864836276200213, 0.363247130858897, 0.170719500081567, 0.458862115912474,
0.764369844834086, 0.22138732039061, 0.950217140815184, 0.119026355092504,
0.806698643902745, 0.809697143416323, 0.0161168403745759, 0.56149794546334,
0.0663374185634651, 0.851044662622003, 0.144127493261805, 0.646129610173195,
0.180326314861961, 0.346305710081752, 0.689186084156133, 0.0902438913162577,
0.493067567084055, 0.829728867159447, 0.212655417404949, 0.873112880345332,
0.57019799015934, 0.666924788035991, 0.421470848297274, 0.137822577124685,
0.646797965126931, 0.00186628356193685, 0.220630784144145, 0.636097250212043,
0.337161167241577, 0.763014675300797, 0.0290609945874959, 0.179775595422681,
0.926270372245386, 0.14413707866326, 0.308460218540821, 0.505730133160804,
0.92831463570813, 0.2406601397661, 0.469013177711661, 0.0514836845684897,
0.8773477591701, 0.988870207825279, 0.0409427390691713, 0.345261503182235,
0.457678159145652, 0.928521904779235, 0.981654149874765, 0.165376851871405,
0.657749413049735, 0.645610554242246, 0.288901032482677, 0.903464871012278,
0.91288926903878, 0.331819964874993, 0.451775254733976, 0.561567931867726,
0.934770693643712, 0.0515071551015609, 0.0772762108900331, 0.233674539049138,
0.636764452840065, 0.673165028674493, 0.806944576060158, 0.763410488346345,
0.661058275398286, 0.275215831961986, 0.821051953775588), Value2 = c(0.898973133700585,
0.0043728119746469, 0.90370150590114, 0.664255277142381, 0.478255150030532,
0.428181937562552, 0.0547471373342867, 0.382060484866744, 0.467990590870777,
0.44613758335896, 0.767317422802576, 0.378150639908367, 0.490578474103678,
0.677901331005272, 0.287571260541928, 0.201396158908221, 0.504989505596871,
0.854550423135574, 0.545208640791417, 0.951248990134053, 0.958420479001103,
0.916437669811835, 0.299402641214852, 0.966388390213139, 0.511359402704707,
0.0867219533353825, 0.88481040004275, 0.158676351804193, 0.0723357399252373,
0.605048894989562, 0.60104443547608, 0.608164723564692, 0.309073275149768,
0.183031315824665, 0.495737621177827, 0.981936843144856, 0.601436476710344,
0.442362735422709, 0.497899316486054, 0.0545162134700136, 0.572666465987199,
0.0134330483790179, 0.494252845049882, 0.752561338910785, 0.269231150235318,
0.580397043886635, 0.00438648885146109, 0.974859546601355, 0.964309270817873,
0.740961468264743, 0.966289928060099, 0.165450408579171, 0.457088887715921,
0.725271665700556, 0.611801886877621, 0.693114823445831, 0.509441044895801,
0.668642268489104, 0.0769213109282016, 0.0106313240133811, 0.653738670103508,
0.515077318720933, 0.0355798295524966, 0.916849288357794, 0.489540407953311,
0.355080030655249, 0.0584185346727107, 0.117505910926226, 0.840486642923002,
0.0919621689925281, 0.513293731647231, 0.813987689492758, 0.520895630669219,
0.417642884334403, 0.549898208275446, 0.190152036926942, 0.730222922437507,
0.247328458018061, 0.587109508511267, 0.850096530635719, 0.929032051736368,
0.929910983683225, 0.461558252621238, 0.106247873795127, 0.177666580357953,
0.85962988262837, 0.531897323076434, 0.105528819826748, 0.0349104003049517,
0.180758384726269), Value3 = c(0.728747048185938, 0.136214396563203,
0.0552254916905935, 0.888943411458351, 0.593186561829418, 0.142192475897417,
0.397839605231809, 0.128332683559321, 0.818143628566787, 0.675081193031822,
0.267554700398382, 0.289692778583473, 0.395043380675461, 0.582592369450023,
0.999361780203229, 0.421977850130829, 0.723404859329269, 0.333410997686596,
0.545945290276875, 0.510878802866974, 0.746682101648222, 0.625853669469718,
0.0366957172106372, 0.417685335838607, 0.106323486037796, 0.0127310987059773,
0.291264331038641, 0.690392584005106, 0.0367947033685097, 0.287721087095362,
0.389582158765541, 0.179954765659721, 0.688980485242488, 0.492296704771236,
0.177765364735501, 0.311877860895471, 0.402659917512069, 0.579307427105039,
0.588566648357923, 0.741057591300206, 0.111932877257211, 0.515443723005798,
0.679584351614947, 0.0197622696399569, 0.0326379476305644, 0.736148474541639,
0.0115696238487739, 0.0530159587501624, 0.710708890129421, 0.537042840144158,
0.0277825198238522, 0.851349803530179, 0.448963399024373, 0.42841165712813,
0.0615511042450435, 0.210541933956987, 0.983517611560273, 0.533691182135933,
0.61993895519575, 0.136074538018663, 0.716185070081669, 0.67982888131481,
0.186059692566576, 0.0129160598675656, 0.832257317305668, 0.0269936347869698,
0.579065014243438, 0.857987264303428, 0.270050217297758, 0.606374993010002,
0.565105220120649, 0.977264711860796, 0.14241840012272, 0.942496958955904,
0.652070963472916, 0.912867524689929, 0.0249357414986835, 0.87704909395977,
0.72849611059358, 0.525707690655331, 0.290223239565496, 0.992723233891769,
0.178173444691217, 0.0292681960925434, 0.65696953770876, 0.452973377851251,
0.471917712361899, 0.117830393053313, 0.126107861454795, 0.0848074010166607
)), row.names = c(NA, -90L), class = c("tbl_df", "tbl", "data.frame"
))
使用 DT
数据库,我想根据以下数据创建分层子样本:
Ratios <- structure(list(State = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "C", "C",
"C", "C", "C", "C", "C", "C"), Year = c(2005, 2005, 2005, 2006,
2006, 2006, 2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006,
2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006, 2007, 2007,
2007), Type = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), ratio = c(0.2, 0.2, 0.6,
0.45, 0.2, 0.35, 0.1, 0.1, 0.8, 0.05, 0.5, 0.45, 0.3, 0.5, 0.2,
0.3, 0.2, 0.5, 0.35, 0.45, 0.2, 0.35, 0.2, 0.45, 0.2, 0.2, 0.6
)), row.names = c(NA, -27L), class = c("tbl_df", "tbl", "data.frame"
))
图 1。Ratios
我正在想办法解决这个问题。为了获得最大可能的样本量,我需要根据比率计算出最大可能的子样本。图 1. Ratios
显示了 2005 年国家 A 的比率。
我首先要确定 DT
中的比率:
# This is just to determine the ratio of `Type` in the data `DT`
DT_1 <- DT %>% count(Year, Type) %>% group_by(Year)
DT_2 <- DT %>% count(Year, Type) %>% group_by(Year) %>% mutate(n = n/sum(n))
DT_RAT <- cbind(DT_1 , DT_2)
setDT(DT_RAT)[,Year1:=NULL]
setDT(DT_RAT)[,Type1:=NULL]
rm(DT_1 , DT_2)
现在我应该根据这些信息创建示例。我必须从最大的份额开始(对于国家 A,这是 Type==1
。)所以对于样本 DT_A
,样本的一半应该是 Type
= 1。从 DT_RAT
我知道这相当于 Type 1
的 13 次观察(类型 1 观察的最大数量)。我想做类似的事情:
DT_A <- DT[,.SD[sample(.N, min(13,.N))],by = Type==1]
但我不允许这样做。谁能帮我实现这个目标?
编辑:
DT_A
(状态 A 的 DT
的子样本)的期望结果如下。
-
2005年DT_A的
- 0.5应该是
Type 1
(见图1)
2005 年 DT_A 的 - 0.45 应该包括
Type 2
2005 年 DT_A 的 - 0.05 应包括
Type 3
DT 在 2005 年对 Type 1
进行了 13 次观察,因此我们将所有这些都考虑在内。因为这13个应该是样本的一半。总样本是 26 个观察值
2005 年还有 13 个 Type 2
的观测值,但由于只有 0.45 个应该由 Type 2
组成,我们取 12 个。只有 5% 应该是 Type 3
,所以这是剩下的 1 个观测值。
- 13 = 0.5 的 DT_A 2005 年应包括
Type 1
(见图 1) - 12 = 2005 年 DT_A 的 0.45 应该包括
Type 2
- 1 = 0.05 of DT_A in 2005 应该包括
Type 3
我认为这条管道可以满足您的需求:
library(tidyr)
library(dplyr)
library(purrr)
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size)) %>%
mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>%
select(State, Year, Type, sample) %>%
arrange(State, Year, Type)
输出
# A tibble: 27 x 4
State Year Type sample
<chr> <dbl> <dbl> <list>
1 A 2005 1 <tibble [1 x 3]>
2 A 2005 2 <tibble [1 x 3]>
3 A 2005 3 <tibble [4 x 3]>
4 A 2006 1 <tibble [4 x 3]>
5 A 2006 2 <tibble [2 x 3]>
6 A 2006 3 <tibble [3 x 3]>
7 A 2007 1 <tibble [1 x 3]>
8 A 2007 2 <tibble [1 x 3]>
9 A 2007 3 <tibble [5 x 3]>
10 B 2005 1 <tibble [1 x 3]>
# ... with 17 more rows
您可以进一步向该管道追加一个 %>% unnest(sample)
,但我会保留它,因为您从未指定您的预期输出。这种格式也方便我给大家解释背后的逻辑。
关键是您需要为 State, Year, Type
的每个组合确定合适的样本量。我用来确定样本量的公式是
sample_size = pmax(1, round(min(n / ratio) * ratio))
例如,设n = c(13, 13, 4)
,与您的示例显示的数字相同。您想要根据 ratio = c(0.5, 0.45, 0.05)
对样本进行切片。那么,您可以拥有的最大样本量是 min(n / ratio)
,即 min(c(26, 28.88889, 80)) = 26
。任何大于此的数字都是无效的,因为您没有足够的观察值来抽样(在这种情况下对于类型 2 和 3)。然后我们使用总样本量乘以比率来计算每种类型的样本量。我们必须 round
结果,因为只允许使用自然数;我们还需要对每种类型至少进行一次观察。这就是为什么您会看到 pmax(1, ...)
.
对于您 post 中显示的示例,此公式 returns
> pmax(1, round(min(c(13, 13, 4) / c(0.5, 0.45, 0.05)) * c(0.5, 0.45, 0.05)))
[1] 13 12 1
有了这个设置,我们就可以开始你想要的采样程序了。我们先
> DT %>%
count(Year, Type)
# A tibble: 9 x 3
Year Type n
<dbl> <dbl> <int>
1 2005 1 13
2 2005 2 13
3 2005 3 4
4 2006 1 16
5 2006 2 11
6 2006 3 3
7 2007 1 11
8 2007 2 14
9 2007 3 5
以便我们知道每个 Year
和 Type
有多少观察值。那么,
> DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio)))
# A tibble: 27 x 6
# Groups: State, Year [9]
Year Type n State ratio sample_size
<dbl> <dbl> <int> <chr> <dbl> <dbl>
1 2005 1 13 A 0.2 1
2 2005 1 13 B 0.05 1
3 2005 1 13 B 0.35 3
4 2005 2 13 A 0.2 1
5 2005 2 13 B 0.5 4
6 2005 2 13 C 0.45 9
7 2005 3 4 A 0.6 4
8 2005 3 4 B 0.45 4
9 2005 3 4 C 0.2 4
10 2006 1 16 A 0.45 4
# ... with 17 more rows
以便我们知道每个 Year
、Type
和 State
的适当样本量。接下来,
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size))
# A tibble: 27 x 5
State Year Type sample_size data
<chr> <dbl> <dbl> <dbl> <list>
1 A 2005 1 1 <tibble [13 x 3]>
2 B 2005 1 1 <tibble [13 x 3]>
3 B 2005 1 3 <tibble [13 x 3]>
4 A 2005 2 1 <tibble [13 x 3]>
5 B 2005 2 4 <tibble [13 x 3]>
6 C 2005 2 9 <tibble [13 x 3]>
7 A 2005 3 4 <tibble [4 x 3]>
8 B 2005 3 4 <tibble [4 x 3]>
9 C 2005 3 4 <tibble [4 x 3]>
10 A 2006 1 4 <tibble [16 x 3]>
# ... with 17 more rows
以便每个 Year
和 Type
的样本大小和数据都匹配。最后,我们使用slice_sample
函数对每组的数据进行采样,select我们需要的列,然后排列数据进行展示。
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size)) %>%
mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>%
select(State, Year, Type, sample) %>%
arrange(State, Year, Type)
此管道的输出显示在此 post 的开头。我认为您也可以通过 data.table
方式获得相同的结果。但是,我会这样保留它,因为此管道更适合说明目的。