在 R 中有条件地删除观察结果(在使用 MatchIt 包之后)
Removing observations conditionally (after use of MatchIt package) in R
我已经使用包 MatchIt
对治疗组 (treat = 1
) 和对照组 (treat = 0
) 进行了精确匹配——匹配是通过 age
.变量 subclass
显示匹配的单位。
如果与多个对照相匹配,我希望为每个治疗单位随机选择一个对照单位。重要的是它是随机的。
如果我有多个处理单元只与 1 个控制匹配(subclass
4 的情况),我想丢弃这样的控制单元,以便为每个子类保留相同数量的控制和单元.
最后,我希望 treat = 1 和 treat = 0 的观察次数相等。
我的真实数据集很大,包含超过一百万个子类。
structure(list(id = c("NSW1", "NSW57", "PSID6", "PSID84", "PSID147",
"PSID349", "PSID361", "PSID400", "NSW2", "NSW6", "NSW9", "NSW60",
"NSW77", "NSW80", "NSW127", "NSW161", "NSW169", "NSW177", "NSW179",
"PSID15", "PSID31", "PSID41", "PSID62", "PSID92", "PSID93", "PSID150",
"PSID167", "PSID178", "PSID254", "PSID292", "PSID300", "PSID308",
"PSID309", "PSID314", "PSID330", "NSW3", "NSW55", "NSW109", "PSID1",
"PSID69", "PSID91", "PSID165", "PSID166", "PSID302", "PSID378",
"ASID9033", "ASID9034", "ASID9036"), treat = c(1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), age = c(37L,
37L, 37L, 37L, 37L, 37L, 37L, 37L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 29L, 29L, 29L), race = c("black", "black",
"black", "hispan", "white", "white", "white", "black", "hispan",
"black", "black", "white", "black", "black", "black", "black",
"black", "hispan", "white", "black", "hispan", "black", "white",
"white", "white", "hispan", "white", "white", "white", "white",
"black", "black", "white", "white", "black", "black", "black",
"black", "white", "black", "white", "white", "white", "white",
"white", "black", "white", "black"), married = c(1L, 0L, 1L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L), subclass = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L)), class = "data.frame", row.names = c(NA,
-48L))
这是使用 group_split
和 map_dfr
的方法(可能有点复杂)。
library(tidyverse)
df %>%
group_split(subclass) %>%
map_dfr(~ if(sum(.x$treat) > (nrow(.x) / 2)) bind_rows(.x[.x$treat == 0, ], sample_n(.x[.x$treat == 1, ], nrow(.x[.x$treat == 0, ])))
else if(sum(.x$treat) < (nrow(.x) / 2)) bind_rows(.x[.x$treat == 1, ], sample_n(.x[.x$treat == 0, ], nrow(.x[.x$treat == 1, ])))
else .x)
# A tibble: 34 x 6
id treat age race married subclass
<chr> <int> <int> <chr> <int> <int>
1 NSW1 1 37 black 1 1
2 NSW57 1 37 black 0 1
3 PSID400 0 37 black 0 1
4 PSID84 0 37 hispan 0 1
5 NSW2 1 22 hispan 0 2
6 NSW6 1 22 black 0 2
7 NSW9 1 22 black 0 2
8 NSW60 1 22 white 0 2
9 NSW77 1 22 black 0 2
10 NSW80 1 22 black 0 2
# ... with 24 more rows
这是一种简单的方法
library(tidyverse)
set.seed(999)
mydata %>%
mutate(r = runif(n = nrow(mydata))) %>%
arrange(r) %>%
group_by(treat, subclass) %>%
mutate(max_r = max(r)) %>%
filter(r == max_r) %>% select(-c(r, max_r)) -> mydata.filtered
我先创建一个随机数r
,然后我根据r
排列数据。此后,我为每个子类 x 处理单元计算 max(r)
,并删除 max(r) != r
.
中的所有内容
这导致每个子类有 1 个处理和 1 个 non-treated obs。
> table(mydata.filtered$treat, mydata.filtered$subclass)
1 2 3 4
0 1 1 1 1
1 1 1 1 1
数据
mydata<- structure(list(id = c("NSW1", "NSW57", "PSID6", "PSID84", "PSID147",
"PSID349", "PSID361", "PSID400", "NSW2", "NSW6", "NSW9", "NSW60",
"NSW77", "NSW80", "NSW127", "NSW161", "NSW169", "NSW177", "NSW179",
"PSID15", "PSID31", "PSID41", "PSID62", "PSID92", "PSID93", "PSID150",
"PSID167", "PSID178", "PSID254", "PSID292", "PSID300", "PSID308",
"PSID309", "PSID314", "PSID330", "NSW3", "NSW55", "NSW109", "PSID1",
"PSID69", "PSID91", "PSID165", "PSID166", "PSID302", "PSID378",
"ASID9033", "ASID9034", "ASID9036"), treat = c(1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), age = c(37L,
37L, 37L, 37L, 37L, 37L, 37L, 37L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 29L, 29L, 29L), race = c("black", "black",
"black", "hispan", "white", "white", "white", "black", "hispan",
"black", "black", "white", "black", "black", "black", "black",
"black", "hispan", "white", "black", "hispan", "black", "white",
"white", "white", "hispan", "white", "white", "white", "white",
"black", "black", "white", "white", "black", "black", "black", "black", "white", "black", "white", "white", "white", "white",
"white", "black", "white", "black"), married = c(1L, 0L, 1L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L), subclass = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L)), class = "data.frame", row.names = c(NA,
-48L))
?MatchIt
似乎还提供了 ratio
参数,可用于在匹配函数调用中强制进行一对一匹配。
另一种(基础 R)方法:
md <- do.call("rbind", unname(lapply(split(md, ~subclass),
function(x) {
x[c(which(x$treat == 1)[1],
which(x$treat == 0)[1]),]
})))
从每个子类中获取第一个处理过的和第一个控制单元,然后 rbind
将它们放在一起。如果您的数据是随机排序的,这相当于随机选择一个处理单元和一个控制单元。
我已经使用包 MatchIt
对治疗组 (treat = 1
) 和对照组 (treat = 0
) 进行了精确匹配——匹配是通过 age
.变量 subclass
显示匹配的单位。
如果与多个对照相匹配,我希望为每个治疗单位随机选择一个对照单位。重要的是它是随机的。
如果我有多个处理单元只与 1 个控制匹配(subclass
4 的情况),我想丢弃这样的控制单元,以便为每个子类保留相同数量的控制和单元.
最后,我希望 treat = 1 和 treat = 0 的观察次数相等。
我的真实数据集很大,包含超过一百万个子类。
structure(list(id = c("NSW1", "NSW57", "PSID6", "PSID84", "PSID147",
"PSID349", "PSID361", "PSID400", "NSW2", "NSW6", "NSW9", "NSW60",
"NSW77", "NSW80", "NSW127", "NSW161", "NSW169", "NSW177", "NSW179",
"PSID15", "PSID31", "PSID41", "PSID62", "PSID92", "PSID93", "PSID150",
"PSID167", "PSID178", "PSID254", "PSID292", "PSID300", "PSID308",
"PSID309", "PSID314", "PSID330", "NSW3", "NSW55", "NSW109", "PSID1",
"PSID69", "PSID91", "PSID165", "PSID166", "PSID302", "PSID378",
"ASID9033", "ASID9034", "ASID9036"), treat = c(1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), age = c(37L,
37L, 37L, 37L, 37L, 37L, 37L, 37L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 29L, 29L, 29L), race = c("black", "black",
"black", "hispan", "white", "white", "white", "black", "hispan",
"black", "black", "white", "black", "black", "black", "black",
"black", "hispan", "white", "black", "hispan", "black", "white",
"white", "white", "hispan", "white", "white", "white", "white",
"black", "black", "white", "white", "black", "black", "black",
"black", "white", "black", "white", "white", "white", "white",
"white", "black", "white", "black"), married = c(1L, 0L, 1L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L), subclass = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L)), class = "data.frame", row.names = c(NA,
-48L))
这是使用 group_split
和 map_dfr
的方法(可能有点复杂)。
library(tidyverse)
df %>%
group_split(subclass) %>%
map_dfr(~ if(sum(.x$treat) > (nrow(.x) / 2)) bind_rows(.x[.x$treat == 0, ], sample_n(.x[.x$treat == 1, ], nrow(.x[.x$treat == 0, ])))
else if(sum(.x$treat) < (nrow(.x) / 2)) bind_rows(.x[.x$treat == 1, ], sample_n(.x[.x$treat == 0, ], nrow(.x[.x$treat == 1, ])))
else .x)
# A tibble: 34 x 6
id treat age race married subclass
<chr> <int> <int> <chr> <int> <int>
1 NSW1 1 37 black 1 1
2 NSW57 1 37 black 0 1
3 PSID400 0 37 black 0 1
4 PSID84 0 37 hispan 0 1
5 NSW2 1 22 hispan 0 2
6 NSW6 1 22 black 0 2
7 NSW9 1 22 black 0 2
8 NSW60 1 22 white 0 2
9 NSW77 1 22 black 0 2
10 NSW80 1 22 black 0 2
# ... with 24 more rows
这是一种简单的方法
library(tidyverse)
set.seed(999)
mydata %>%
mutate(r = runif(n = nrow(mydata))) %>%
arrange(r) %>%
group_by(treat, subclass) %>%
mutate(max_r = max(r)) %>%
filter(r == max_r) %>% select(-c(r, max_r)) -> mydata.filtered
我先创建一个随机数r
,然后我根据r
排列数据。此后,我为每个子类 x 处理单元计算 max(r)
,并删除 max(r) != r
.
这导致每个子类有 1 个处理和 1 个 non-treated obs。
> table(mydata.filtered$treat, mydata.filtered$subclass)
1 2 3 4
0 1 1 1 1
1 1 1 1 1
数据
mydata<- structure(list(id = c("NSW1", "NSW57", "PSID6", "PSID84", "PSID147",
"PSID349", "PSID361", "PSID400", "NSW2", "NSW6", "NSW9", "NSW60",
"NSW77", "NSW80", "NSW127", "NSW161", "NSW169", "NSW177", "NSW179",
"PSID15", "PSID31", "PSID41", "PSID62", "PSID92", "PSID93", "PSID150",
"PSID167", "PSID178", "PSID254", "PSID292", "PSID300", "PSID308",
"PSID309", "PSID314", "PSID330", "NSW3", "NSW55", "NSW109", "PSID1",
"PSID69", "PSID91", "PSID165", "PSID166", "PSID302", "PSID378",
"ASID9033", "ASID9034", "ASID9036"), treat = c(1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), age = c(37L,
37L, 37L, 37L, 37L, 37L, 37L, 37L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 29L, 29L, 29L), race = c("black", "black",
"black", "hispan", "white", "white", "white", "black", "hispan",
"black", "black", "white", "black", "black", "black", "black",
"black", "hispan", "white", "black", "hispan", "black", "white",
"white", "white", "hispan", "white", "white", "white", "white",
"black", "black", "white", "white", "black", "black", "black", "black", "white", "black", "white", "white", "white", "white",
"white", "black", "white", "black"), married = c(1L, 0L, 1L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L), subclass = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L)), class = "data.frame", row.names = c(NA,
-48L))
?MatchIt
似乎还提供了 ratio
参数,可用于在匹配函数调用中强制进行一对一匹配。
另一种(基础 R)方法:
md <- do.call("rbind", unname(lapply(split(md, ~subclass),
function(x) {
x[c(which(x$treat == 1)[1],
which(x$treat == 0)[1]),]
})))
从每个子类中获取第一个处理过的和第一个控制单元,然后 rbind
将它们放在一起。如果您的数据是随机排序的,这相当于随机选择一个处理单元和一个控制单元。