基于大数据集重采样的双重差分(diff-in-diff)估计
Diff-in-diff estimation with resampling from large dataset
我有一个大型数据集,可以对其进行双重差分(diff-in-diff)估计。由于数据集的性质,我的 t 统计量的分母被夸大了,系数(虚假地)表现出统计显著性。
我想逐步减少数据集中的观测数量,在每一步都重复抽样很多次,并在每次抽样后重新估计交互项系数及其标准误。
然后我想计算每一步中系数估计值和标准误的平均值,并将它们绘制在图表上,以显示从哪个样本量开始(如果存在的话)系数在统计上与零不再有显著差异。
我的代码后面有一个玩具示例。
- 我不确定这是解决问题的最有效方法
- 我无法检索并绘制置信区间
- 鉴于存在不同的群体,我不确定抽样是否具有代表性。
玩具示例(Creds Torres-Reyna - 2015)
library(foreign)
library(dplyr)
library(ggplot2)
# Resampling experiment on the diff-in-diff interaction term.
# For every subsample size (multiples of 5 strictly below nrow(mydata)) the
# model is refitted on `n_reps` random subsamples drawn without replacement;
# the estimate and standard error of `did` are then averaged per size.
# NOTE(review): `mydata` (columns y, treated, time, did) is not created in
# this snippet and is assumed to exist already -- confirm before running.
n_reps <- 10
sample_sizes <- seq(5, nrow(mydata) - 1, 5)

# Refit the regression on one random subsample of `size` rows.
# Returns a one-row data frame: c = estimate and s = standard error of the
# fourth coefficient (`did`), i = subsample size.
fit_one_draw <- function(size) {
  mydata_temp <- mydata[sample(nrow(mydata), size), ]
  didreg <- lm(y ~ treated + time + did, data = mydata_temp)
  coefs <- summary(didreg)$coefficients
  # summary() omits coefficients that could not be estimated, so small
  # subsamples may yield fewer than four rows; record NA for those draws.
  has_did <- nrow(coefs) >= 4
  data.frame(
    c = if (has_did) coefs[4, 1] else NA_real_,
    s = if (has_did) coefs[4, 2] else NA_real_,
    i = size
  )
}

# One row per draw, collected in a list and bound once instead of growing a
# data frame with rbind() inside nested loops. Draw order (all repetitions of
# the smallest size first) matches the original nested loops.
draw_sizes <- rep(sample_sizes, each = n_reps)
df_0 <- do.call(rbind, lapply(draw_sizes, fit_one_draw))

# Average estimate and standard error per subsample size. Output columns are
# named with `=`; using `<-` inside summarise() does not name them.
df_0 <- df_0 %>%
  group_by(i) %>%
  summarise(c = mean(c, na.rm = TRUE), s = mean(s, na.rm = TRUE))
View(df_0)
最后我是这样解决的:
这是最有效的方法吗?
library(foreign)
library(dplyr)
# Load the Panel101 example data and build the diff-in-diff indicators.
mydata <- read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time <- ifelse(mydata$year >= 1994, 1, 0)
mydata$treated <- ifelse(mydata$country %in% c("E", "F", "G"), 1, 0)
mydata$did <- mydata$time * mydata$treated

# Resampling design: subsample sizes are the multiples of 5 strictly below
# nrow(mydata); each size is drawn `n_reps` times without replacement.
n_reps <- 100
sample_sizes <- seq(5, nrow(mydata) - 1, 5)

# Refit the regression on one random subsample of `size` rows.
# Returns a one-row data frame: c = estimate and s = standard error of the
# fourth coefficient (`did`), i = subsample size.
fit_one_draw <- function(size) {
  mydata_temp <- mydata[sample(nrow(mydata), size), ]
  didreg <- lm(y ~ treated + time + did, data = mydata_temp)
  coefs <- summary(didreg)$coefficients
  # summary() omits coefficients that could not be estimated, so small
  # subsamples may yield fewer than four rows; record NA for those draws.
  has_did <- nrow(coefs) >= 4
  data.frame(
    c = if (has_did) coefs[4, 1] else NA_real_,
    s = if (has_did) coefs[4, 2] else NA_real_,
    i = size
  )
}

# Build every draw in a list and bind once; calling rbind() inside the loops
# copies the accumulated data frame on each iteration.
draw_sizes <- rep(sample_sizes, each = n_reps)
df_0 <- do.call(rbind, lapply(draw_sizes, fit_one_draw))

# Mean estimate and mean standard error per subsample size. The band is the
# mean estimate plus/minus one mean standard error.
df_0 <- df_0 %>%
  group_by(i) %>%
  summarise(c = mean(c, na.rm = TRUE), s = mean(s, na.rm = TRUE)) %>%
  mutate(upper = c + s, lower = c - s)
df <- df_0

# Fixed vertical padding around the range of the mean estimates.
y_pad <- 5000000000
plot(
  df$i, df$c,
  ylim = c(min(df$c, na.rm = TRUE) - y_pad, max(df$c, na.rm = TRUE) + y_pad),
  type = "l"
)
polygon(
  c(df$i, rev(df$i)), c(df$lower, rev(df$upper)),
  col = "grey75", border = FALSE
)
# Redraw the estimate line so it sits on top of the shaded band.
lines(df$i, df$c, lwd = 2)
考虑以下使用基础 R 函数重构的代码:`within`、`%in%`、嵌套的 `lapply`、`setNames`、`aggregate` 和 `do.call`。这种方法避免了在循环中反复调用 `rbind`,并且无需频繁使用 `$` 引用列,使代码更加紧凑。
library(foreign)
# Load the Panel101 example data and add the diff-in-diff indicators.
mydata <- read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata <- within(mydata, {
  time <- ifelse(year >= 1994, 1, 0)
  treated <- ifelse(country %in% c("E", "F", "G"), 1, 0)
  did <- time * treated
})

# Resampling design: subsample sizes are the multiples of 5 strictly below
# nrow(mydata); each size is drawn `n_reps` times without replacement.
n_reps <- 100
sample_sizes <- seq(5, nrow(mydata) - 1, 5)

# OUTER LIST: one data frame per subsample size
df_0_list <- lapply(sample_sizes, function(index) {
  # INNER LIST: one single-row data frame per random draw
  df_1_list <- lapply(seq_len(n_reps), function(j) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg <- lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    # Position 4 is the `did` term; indexing past the end of a shorter
    # coefficient vector yields NA, which marks draws where it was dropped.
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    setNames(data.frame(t(new_line)), c("c", "s", "i"))
  })
  # APPEND ALL INNER DFS
  do.call(rbind, df_1_list)
})

# APPEND ALL OUTER DFS
df_0 <- do.call(rbind, df_0_list)

# AGGREGATE WITH NEW COLUMNS
# Note: the formula method of aggregate() defaults to na.action = na.omit, so
# draws with a missing estimate or standard error are dropped before averaging.
df_0 <- within(
  aggregate(cbind(c, s) ~ i, df_0, function(x) mean(x, na.rm = TRUE)),
  {
    upper <- c + s
    lower <- c - s
  }
)

# RUN PLOT
# with() evaluates the plotting calls against the columns of df_0 without
# returning (and printing) the data frame, which within() would do.
with(df_0, {
  plot(i, c, ylim = c(min(c) - 5000000000, max(c) + 5000000000), type = "l",
       cex.lab = 0.75, cex.axis = 0.75, cex.main = 0.75, cex.sub = 0.75)
  polygon(c(i, rev(i)), c(lower, rev(upper)),
          col = "grey75", border = FALSE)
  # Redraw the estimate line so it sits on top of the shaded band.
  lines(i, c, lwd = 2)
})
我有一个大型数据集,可以对其进行双重差分(diff-in-diff)估计。由于数据集的性质,我的 t 统计量的分母被夸大了,系数(虚假地)表现出统计显著性。我想逐步减少数据集中的观测数量,在每一步都重复抽样很多次,并在每次抽样后重新估计交互项系数及其标准误。
然后我想计算每一步中系数估计值和标准误的平均值,并将它们绘制在图表上,以显示从哪个样本量开始(如果存在的话)系数在统计上与零不再有显著差异。
我的代码后面有一个玩具示例。
- 我不确定这是解决问题的最有效方法
- 我无法检索并绘制置信区间
- 鉴于存在不同的群体,我不确定抽样是否具有代表性。
玩具示例(Creds Torres-Reyna - 2015)
library(foreign)
library(dplyr)
library(ggplot2)
# Resampling experiment on the diff-in-diff interaction term.
# For every subsample size (multiples of 5 strictly below nrow(mydata)) the
# model is refitted on `n_reps` random subsamples drawn without replacement;
# the estimate and standard error of `did` are then averaged per size.
# NOTE(review): `mydata` (columns y, treated, time, did) is not created in
# this snippet and is assumed to exist already -- confirm before running.
n_reps <- 10
sample_sizes <- seq(5, nrow(mydata) - 1, 5)

# Refit the regression on one random subsample of `size` rows.
# Returns a one-row data frame: c = estimate and s = standard error of the
# fourth coefficient (`did`), i = subsample size.
fit_one_draw <- function(size) {
  mydata_temp <- mydata[sample(nrow(mydata), size), ]
  didreg <- lm(y ~ treated + time + did, data = mydata_temp)
  coefs <- summary(didreg)$coefficients
  # summary() omits coefficients that could not be estimated, so small
  # subsamples may yield fewer than four rows; record NA for those draws.
  has_did <- nrow(coefs) >= 4
  data.frame(
    c = if (has_did) coefs[4, 1] else NA_real_,
    s = if (has_did) coefs[4, 2] else NA_real_,
    i = size
  )
}

# One row per draw, collected in a list and bound once instead of growing a
# data frame with rbind() inside nested loops. Draw order (all repetitions of
# the smallest size first) matches the original nested loops.
draw_sizes <- rep(sample_sizes, each = n_reps)
df_0 <- do.call(rbind, lapply(draw_sizes, fit_one_draw))

# Average estimate and standard error per subsample size. Output columns are
# named with `=`; using `<-` inside summarise() does not name them.
df_0 <- df_0 %>%
  group_by(i) %>%
  summarise(c = mean(c, na.rm = TRUE), s = mean(s, na.rm = TRUE))
View(df_0)
最后我是这样解决的: 这是最有效的方法吗?
library(foreign)
library(dplyr)
# Load the Panel101 example data and build the diff-in-diff indicators.
mydata <- read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time <- ifelse(mydata$year >= 1994, 1, 0)
mydata$treated <- ifelse(mydata$country %in% c("E", "F", "G"), 1, 0)
mydata$did <- mydata$time * mydata$treated

# Resampling design: subsample sizes are the multiples of 5 strictly below
# nrow(mydata); each size is drawn `n_reps` times without replacement.
n_reps <- 100
sample_sizes <- seq(5, nrow(mydata) - 1, 5)

# Refit the regression on one random subsample of `size` rows.
# Returns a one-row data frame: c = estimate and s = standard error of the
# fourth coefficient (`did`), i = subsample size.
fit_one_draw <- function(size) {
  mydata_temp <- mydata[sample(nrow(mydata), size), ]
  didreg <- lm(y ~ treated + time + did, data = mydata_temp)
  coefs <- summary(didreg)$coefficients
  # summary() omits coefficients that could not be estimated, so small
  # subsamples may yield fewer than four rows; record NA for those draws.
  has_did <- nrow(coefs) >= 4
  data.frame(
    c = if (has_did) coefs[4, 1] else NA_real_,
    s = if (has_did) coefs[4, 2] else NA_real_,
    i = size
  )
}

# Build every draw in a list and bind once; calling rbind() inside the loops
# copies the accumulated data frame on each iteration.
draw_sizes <- rep(sample_sizes, each = n_reps)
df_0 <- do.call(rbind, lapply(draw_sizes, fit_one_draw))

# Mean estimate and mean standard error per subsample size. The band is the
# mean estimate plus/minus one mean standard error.
df_0 <- df_0 %>%
  group_by(i) %>%
  summarise(c = mean(c, na.rm = TRUE), s = mean(s, na.rm = TRUE)) %>%
  mutate(upper = c + s, lower = c - s)
df <- df_0

# Fixed vertical padding around the range of the mean estimates.
y_pad <- 5000000000
plot(
  df$i, df$c,
  ylim = c(min(df$c, na.rm = TRUE) - y_pad, max(df$c, na.rm = TRUE) + y_pad),
  type = "l"
)
polygon(
  c(df$i, rev(df$i)), c(df$lower, rev(df$upper)),
  col = "grey75", border = FALSE
)
# Redraw the estimate line so it sits on top of the shaded band.
lines(df$i, df$c, lwd = 2)
考虑以下使用基础 R 函数重构的代码:`within`、`%in%`、嵌套的 `lapply`、`setNames`、`aggregate` 和 `do.call`。这种方法避免了在循环中反复调用 `rbind`,并且无需频繁使用 `$` 引用列,使代码更加紧凑。
library(foreign)
# Load the Panel101 example data and add the diff-in-diff indicators.
mydata <- read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata <- within(mydata, {
  time <- ifelse(year >= 1994, 1, 0)
  treated <- ifelse(country %in% c("E", "F", "G"), 1, 0)
  did <- time * treated
})

# Resampling design: subsample sizes are the multiples of 5 strictly below
# nrow(mydata); each size is drawn `n_reps` times without replacement.
n_reps <- 100
sample_sizes <- seq(5, nrow(mydata) - 1, 5)

# OUTER LIST: one data frame per subsample size
df_0_list <- lapply(sample_sizes, function(index) {
  # INNER LIST: one single-row data frame per random draw
  df_1_list <- lapply(seq_len(n_reps), function(j) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg <- lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    # Position 4 is the `did` term; indexing past the end of a shorter
    # coefficient vector yields NA, which marks draws where it was dropped.
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    setNames(data.frame(t(new_line)), c("c", "s", "i"))
  })
  # APPEND ALL INNER DFS
  do.call(rbind, df_1_list)
})

# APPEND ALL OUTER DFS
df_0 <- do.call(rbind, df_0_list)

# AGGREGATE WITH NEW COLUMNS
# Note: the formula method of aggregate() defaults to na.action = na.omit, so
# draws with a missing estimate or standard error are dropped before averaging.
df_0 <- within(
  aggregate(cbind(c, s) ~ i, df_0, function(x) mean(x, na.rm = TRUE)),
  {
    upper <- c + s
    lower <- c - s
  }
)

# RUN PLOT
# with() evaluates the plotting calls against the columns of df_0 without
# returning (and printing) the data frame, which within() would do.
with(df_0, {
  plot(i, c, ylim = c(min(c) - 5000000000, max(c) + 5000000000), type = "l",
       cex.lab = 0.75, cex.axis = 0.75, cex.main = 0.75, cex.sub = 0.75)
  polygon(c(i, rev(i)), c(lower, rev(upper)),
          col = "grey75", border = FALSE)
  # Redraw the estimate line so it sits on top of the shaded band.
  lines(i, c, lwd = 2)
})