基于另一个数据集创建分层样本

Question

我有一个数据库如下：

DT <- structure(list(Year = c(2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007), Type = c(1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 
3), Value = c(0.504376244608734, 0.544791523560323, 0.536356351248399, 
0.186754489979335, 0.0145059662169885, 0.552467068108315, 0.728991908748136, 
0.0782701833265232, 0.0770140143185365, 0.745720346755096, 0.182549844851049, 
0.0037854136407528, 0.892426526130476, 0.670307075099745, 0.0787676704471466, 
0.243642889274613, 0.61622932816441, 0.773909954748003, 0.0368627127466908, 
0.864836276200213, 0.363247130858897, 0.170719500081567, 0.458862115912474, 
0.764369844834086, 0.22138732039061, 0.950217140815184, 0.119026355092504, 
0.806698643902745, 0.809697143416323, 0.0161168403745759, 0.56149794546334, 
0.0663374185634651, 0.851044662622003, 0.144127493261805, 0.646129610173195, 
0.180326314861961, 0.346305710081752, 0.689186084156133, 0.0902438913162577, 
0.493067567084055, 0.829728867159447, 0.212655417404949, 0.873112880345332, 
0.57019799015934, 0.666924788035991, 0.421470848297274, 0.137822577124685, 
0.646797965126931, 0.00186628356193685, 0.220630784144145, 0.636097250212043, 
0.337161167241577, 0.763014675300797, 0.0290609945874959, 0.179775595422681, 
0.926270372245386, 0.14413707866326, 0.308460218540821, 0.505730133160804, 
0.92831463570813, 0.2406601397661, 0.469013177711661, 0.0514836845684897, 
0.8773477591701, 0.988870207825279, 0.0409427390691713, 0.345261503182235, 
0.457678159145652, 0.928521904779235, 0.981654149874765, 0.165376851871405, 
0.657749413049735, 0.645610554242246, 0.288901032482677, 0.903464871012278, 
0.91288926903878, 0.331819964874993, 0.451775254733976, 0.561567931867726, 
0.934770693643712, 0.0515071551015609, 0.0772762108900331, 0.233674539049138, 
0.636764452840065, 0.673165028674493, 0.806944576060158, 0.763410488346345, 
0.661058275398286, 0.275215831961986, 0.821051953775588), Value2 = c(0.898973133700585, 
0.0043728119746469, 0.90370150590114, 0.664255277142381, 0.478255150030532, 
0.428181937562552, 0.0547471373342867, 0.382060484866744, 0.467990590870777, 
0.44613758335896, 0.767317422802576, 0.378150639908367, 0.490578474103678, 
0.677901331005272, 0.287571260541928, 0.201396158908221, 0.504989505596871, 
0.854550423135574, 0.545208640791417, 0.951248990134053, 0.958420479001103, 
0.916437669811835, 0.299402641214852, 0.966388390213139, 0.511359402704707, 
0.0867219533353825, 0.88481040004275, 0.158676351804193, 0.0723357399252373, 
0.605048894989562, 0.60104443547608, 0.608164723564692, 0.309073275149768, 
0.183031315824665, 0.495737621177827, 0.981936843144856, 0.601436476710344, 
0.442362735422709, 0.497899316486054, 0.0545162134700136, 0.572666465987199, 
0.0134330483790179, 0.494252845049882, 0.752561338910785, 0.269231150235318, 
0.580397043886635, 0.00438648885146109, 0.974859546601355, 0.964309270817873, 
0.740961468264743, 0.966289928060099, 0.165450408579171, 0.457088887715921, 
0.725271665700556, 0.611801886877621, 0.693114823445831, 0.509441044895801, 
0.668642268489104, 0.0769213109282016, 0.0106313240133811, 0.653738670103508, 
0.515077318720933, 0.0355798295524966, 0.916849288357794, 0.489540407953311, 
0.355080030655249, 0.0584185346727107, 0.117505910926226, 0.840486642923002, 
0.0919621689925281, 0.513293731647231, 0.813987689492758, 0.520895630669219, 
0.417642884334403, 0.549898208275446, 0.190152036926942, 0.730222922437507, 
0.247328458018061, 0.587109508511267, 0.850096530635719, 0.929032051736368, 
0.929910983683225, 0.461558252621238, 0.106247873795127, 0.177666580357953, 
0.85962988262837, 0.531897323076434, 0.105528819826748, 0.0349104003049517, 
0.180758384726269), Value3 = c(0.728747048185938, 0.136214396563203, 
0.0552254916905935, 0.888943411458351, 0.593186561829418, 0.142192475897417, 
0.397839605231809, 0.128332683559321, 0.818143628566787, 0.675081193031822, 
0.267554700398382, 0.289692778583473, 0.395043380675461, 0.582592369450023, 
0.999361780203229, 0.421977850130829, 0.723404859329269, 0.333410997686596, 
0.545945290276875, 0.510878802866974, 0.746682101648222, 0.625853669469718, 
0.0366957172106372, 0.417685335838607, 0.106323486037796, 0.0127310987059773, 
0.291264331038641, 0.690392584005106, 0.0367947033685097, 0.287721087095362, 
0.389582158765541, 0.179954765659721, 0.688980485242488, 0.492296704771236, 
0.177765364735501, 0.311877860895471, 0.402659917512069, 0.579307427105039, 
0.588566648357923, 0.741057591300206, 0.111932877257211, 0.515443723005798, 
0.679584351614947, 0.0197622696399569, 0.0326379476305644, 0.736148474541639, 
0.0115696238487739, 0.0530159587501624, 0.710708890129421, 0.537042840144158, 
0.0277825198238522, 0.851349803530179, 0.448963399024373, 0.42841165712813, 
0.0615511042450435, 0.210541933956987, 0.983517611560273, 0.533691182135933, 
0.61993895519575, 0.136074538018663, 0.716185070081669, 0.67982888131481, 
0.186059692566576, 0.0129160598675656, 0.832257317305668, 0.0269936347869698, 
0.579065014243438, 0.857987264303428, 0.270050217297758, 0.606374993010002, 
0.565105220120649, 0.977264711860796, 0.14241840012272, 0.942496958955904, 
0.652070963472916, 0.912867524689929, 0.0249357414986835, 0.87704909395977, 
0.72849611059358, 0.525707690655331, 0.290223239565496, 0.992723233891769, 
0.178173444691217, 0.0292681960925434, 0.65696953770876, 0.452973377851251, 
0.471917712361899, 0.117830393053313, 0.126107861454795, 0.0848074010166607
)), row.names = c(NA, -90L), class = c("tbl_df", "tbl", "data.frame"
))

使用 DT 数据库，我想根据以下数据创建分层子样本：

Ratios <- structure(list(State = c("A", "A", "A", "A", "A", "A", "A", "A", 
"A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "C", "C", 
"C", "C", "C", "C", "C", "C"), Year = c(2005, 2005, 2005, 2006, 
2006, 2006, 2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006, 
2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006, 2007, 2007, 
2007), Type = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), ratio = c(0.2, 0.2, 0.6, 
0.45, 0.2, 0.35, 0.1, 0.1, 0.8, 0.05, 0.5, 0.45, 0.3, 0.5, 0.2, 
0.3, 0.2, 0.5, 0.35, 0.45, 0.2, 0.35, 0.2, 0.45, 0.2, 0.2, 0.6
)), row.names = c(NA, -27L), class = c("tbl_df", "tbl", "data.frame"
))

图 1。Ratios

我正在想办法解决这个问题。为了获得最大可能的样本量，我需要根据比率计算出最大可能的子样本。图 1. Ratios 显示了 2005 年国家 A 的比率。

我首先要确定 DT 中的比率：

# This is just to determine the ratio of `Type` in the data `DT`
DT_1 <- DT %>%  count(Year, Type) %>% group_by(Year)
DT_2 <- DT %>%  count(Year, Type) %>% group_by(Year) %>% mutate(n = n/sum(n))
DT_RAT <- cbind(DT_1 , DT_2)
setDT(DT_RAT)[,Year1:=NULL]
setDT(DT_RAT)[,Type1:=NULL]
rm(DT_1 , DT_2)

现在我应该根据这些信息创建示例。我必须从最大的份额开始（对于国家 A，这是 Type==1。）所以对于样本 DT_A，样本的一半应该是 Type = 1。从 DT_RAT 我知道这相当于 Type 1 的 13 次观察（类型 1 观察的最大数量）。我想做类似的事情：

DT_A <- DT[,.SD[sample(.N, min(13,.N))],by = Type==1]

但我不允许这样做。谁能帮我实现这个目标？

编辑：

DT_A（状态 A 的 DT 的子样本）的期望结果如下。

0.5应该是Type 1（见图1）
0.45 应该包括 Type 2
0.05 应包括 Type 3

DT 在 2005 年对 Type 1 进行了 13 次观察，因此我们将所有这些都考虑在内。因为这13个应该是样本的一半。总样本是 26 个观察值 2005 年还有 13 个 Type 2 的观测值，但由于只有 0.45 个应该由 Type 2 组成，我们取 12 个。只有 5% 应该是 Type 3，所以这是剩下的 1 个观测值。

13 = 0.5 的 DT_A 2005 年应包括 Type 1（见图 1）
12 = 2005 年 DT_A 的 0.45 应该包括 Type 2
1 = 0.05 of DT_A in 2005 应该包括 Type 3

Answer 1

我认为这条管道可以满足您的需求：

library(tidyr)
library(dplyr)
library(purrr)

DT %>% 
  count(Year, Type) %>% 
  right_join(Ratios, by = c("Year", "Type")) %>% 
  group_by(State, Year) %>% 
  mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>% 
  ungroup() %>% 
  select(State, Year, Type, sample_size) %>% 
  left_join(DT, by = c("Year","Type")) %>% 
  nest(data = -c(State, Year, Type, sample_size)) %>% 
  mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>% 
  select(State, Year, Type, sample) %>% 
  arrange(State, Year, Type)

输出

# A tibble: 27 x 4
   State  Year  Type sample          
   <chr> <dbl> <dbl> <list>          
 1 A      2005     1 <tibble [1 x 3]>
 2 A      2005     2 <tibble [1 x 3]>
 3 A      2005     3 <tibble [4 x 3]>
 4 A      2006     1 <tibble [4 x 3]>
 5 A      2006     2 <tibble [2 x 3]>
 6 A      2006     3 <tibble [3 x 3]>
 7 A      2007     1 <tibble [1 x 3]>
 8 A      2007     2 <tibble [1 x 3]>
 9 A      2007     3 <tibble [5 x 3]>
10 B      2005     1 <tibble [1 x 3]>
# ... with 17 more rows

您可以进一步向该管道追加一个 %>% unnest(sample)，但我会保留它，因为您从未指定您的预期输出。这种格式也方便我给大家解释背后的逻辑。

关键是您需要为 State, Year, Type 的每个组合确定合适的样本量。我用来确定样本量的公式是

sample_size = pmax(1, round(min(n / ratio) * ratio))

例如，设n = c(13, 13, 4)，与您的示例显示的数字相同。您想要根据 ratio = c(0.5, 0.45, 0.05) 对样本进行切片。那么，您可以拥有的最大样本量是 min(n / ratio)，即 min(c(26, 28.88889, 80)) = 26。任何大于此的数字都是无效的，因为您没有足够的观察值来抽样（在这种情况下对于类型 2 和 3）。然后我们使用总样本量乘以比率来计算每种类型的样本量。我们必须 round 结果，因为只允许使用自然数；我们还需要对每种类型至少进行一次观察。这就是为什么您会看到 pmax(1, ...).

对于您 post 中显示的示例，此公式 returns

> pmax(1, round(min(c(13, 13, 4) / c(0.5, 0.45, 0.05)) * c(0.5, 0.45, 0.05)))
[1] 13 12  1

有了这个设置，我们就可以开始你想要的采样程序了。我们先

> DT %>% 
      count(Year, Type)

# A tibble: 9 x 3
   Year  Type     n
  <dbl> <dbl> <int>
1  2005     1    13
2  2005     2    13
3  2005     3     4
4  2006     1    16
5  2006     2    11
6  2006     3     3
7  2007     1    11
8  2007     2    14
9  2007     3     5

以便我们知道每个 Year 和 Type 有多少观察值。那么，

> DT %>% 
    count(Year, Type) %>% 
    right_join(Ratios, by = c("Year", "Type")) %>% 
    group_by(State, Year) %>% 
    mutate(sample_size = pmax(1, round(min(n / ratio) * ratio)))

# A tibble: 27 x 6
# Groups:   State, Year [9]
    Year  Type     n State ratio sample_size
   <dbl> <dbl> <int> <chr> <dbl>       <dbl>
 1  2005     1    13 A      0.2            1
 2  2005     1    13 B      0.05           1
 3  2005     1    13 B      0.35           3
 4  2005     2    13 A      0.2            1
 5  2005     2    13 B      0.5            4
 6  2005     2    13 C      0.45           9
 7  2005     3     4 A      0.6            4
 8  2005     3     4 B      0.45           4
 9  2005     3     4 C      0.2            4
10  2006     1    16 A      0.45           4
# ... with 17 more rows

以便我们知道每个 Year、Type 和 State 的适当样本量。接下来，

DT %>% 
  count(Year, Type) %>% 
  right_join(Ratios, by = c("Year", "Type")) %>% 
  group_by(State, Year) %>% 
  mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>% 
  ungroup() %>% 
  select(State, Year, Type, sample_size) %>% 
  left_join(DT, by = c("Year","Type")) %>% 
  nest(data = -c(State, Year, Type, sample_size))

# A tibble: 27 x 5
   State  Year  Type sample_size data             
   <chr> <dbl> <dbl>       <dbl> <list>           
 1 A      2005     1           1 <tibble [13 x 3]>
 2 B      2005     1           1 <tibble [13 x 3]>
 3 B      2005     1           3 <tibble [13 x 3]>
 4 A      2005     2           1 <tibble [13 x 3]>
 5 B      2005     2           4 <tibble [13 x 3]>
 6 C      2005     2           9 <tibble [13 x 3]>
 7 A      2005     3           4 <tibble [4 x 3]> 
 8 B      2005     3           4 <tibble [4 x 3]> 
 9 C      2005     3           4 <tibble [4 x 3]> 
10 A      2006     1           4 <tibble [16 x 3]>
# ... with 17 more rows

以便每个 Year 和 Type 的样本大小和数据都匹配。最后，我们使用slice_sample函数对每组的数据进行采样，select我们需要的列，然后排列数据进行展示。

DT %>% 
  count(Year, Type) %>% 
  right_join(Ratios, by = c("Year", "Type")) %>% 
  group_by(State, Year) %>% 
  mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>% 
  ungroup() %>% 
  select(State, Year, Type, sample_size) %>% 
  left_join(DT, by = c("Year","Type")) %>% 
  nest(data = -c(State, Year, Type, sample_size)) %>% 
  mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>% 
  select(State, Year, Type, sample) %>% 
  arrange(State, Year, Type)

此管道的输出显示在此 post 的开头。我认为您也可以通过 data.table 方式获得相同的结果。但是，我会这样保留它，因为此管道更适合说明目的。

基于另一个数据集创建分层样本

Creating stratified sample based on another dataset

r

sample

data.table