如何只为在面板数据中第一次观察具有正值的人替换缺失值?
How to replace missing values only for people who have positive values on the first observation in panel data?
我有四个人被关注了四年。我想将 NA 替换为 0,但仅适用于在第一波面试中工作时间具有正值的人。例如,在我的数据中,这意味着 ID 为 3 和 4 的人的数据将被替换为 0,但 ID 为 2 的人将保留 his/her NA。
id wave year work_hours
1 1 2007 40
1 2 2008 39
1 3 2009 39
1 4 2010 38
2 1 2005 NA
2 2 2006 35
2 3 2007 35
2 4 2008 NA
3 1 2007 40
3 2 2008 NA
3 3 2009 40
3 4 2010 40
4 1 2009 32
4 2 2010 NA
4 3 2011 32
4 4 2012 NA
我尝试了以下代码,但它用 0 代替了第一个波浪,而不是后面的波浪:
df= df %>% group_by(id) %>%
mutate(workhours_imputed= ifelse(work_hours>0 & wave==1, replace_na(0), work_hours))
这是数据:
structure(list(id = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
4, 4, 4), wave = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
3, 4), year = c(2007, 2008, 2009, 2010, 2005, 2006, 2007, 2008,
2007, 2008, 2009, 2010, 2009, 2010, 2011, 2012), work_hours = c(40,
39, 39, 38, NA, 35, 35, NA, 40, NA, 40, 40, 32, NA, 32, NA),
workhours_imputed = c(0, 39, 39, 38, NA, 35, 35, NA, 0, NA,
40, 40, 0, NA, 32, NA)), row.names = c(NA, -16L), groups = structure(list(
id = c(1, 2, 3, 4), .rows = structure(list(1:4, 5:8, 9:12,
13:16), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
一种方法是使用 match
-
library(dplyr)
df %>%
group_by(id) %>%
mutate(workhours_imputed = {
tmp <- work_hours[match(1, wave)]
#If the 1st wave has a positive value
#replace NA with 0
if(!is.na(tmp) && tmp > 0) replace(work_hours, is.na(work_hours), 0) else work_hours
})
# id wave year work_hours workhours_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2007 40 40
# 2 1 2 2008 39 39
# 3 1 3 2009 39 39
# 4 1 4 2010 38 38
# 5 2 1 2005 NA NA
# 6 2 2 2006 35 35
# 7 2 3 2007 35 35
# 8 2 4 2008 NA NA
# 9 3 1 2007 40 40
#10 3 2 2008 NA 0
#11 3 3 2009 40 40
#12 3 4 2010 40 40
#13 4 1 2009 32 32
#14 4 2 2010 NA 0
#15 4 3 2011 32 32
#16 4 4 2012 NA 0
替代 dplyr 解决方案:
df %>%
mutate(workhours_imputed = if_else(
is.na(work_hours) & any(wave == 1 & !is.na(work_hours)),
0, work_hours)
)
# # A tibble: 16 x 5
# # Groups: id [4]
# id wave year work_hours workhours_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2007 40 40
# 2 1 2 2008 39 39
# 3 1 3 2009 39 39
# 4 1 4 2010 38 38
# 5 2 1 2005 NA NA
# 6 2 2 2006 35 35
# 7 2 3 2007 35 35
# 8 2 4 2008 NA NA
# 9 3 1 2007 40 40
# 10 3 2 2008 NA 0
# 11 3 3 2009 40 40
# 12 3 4 2010 40 40
# 13 4 1 2009 32 32
# 14 4 2 2010 NA 0
# 15 4 3 2011 32 32
# 16 4 4 2012 NA 0
如果 wave
并不总是从 1 开始,但您总是想检查 wave
的第一个值,那么您可以改用它:
df %>%
mutate(workhours_imputed = if_else(
is.na(work_hours) & !is.na(work_hours[which.min(wave)]),
0, work_hours)
)
我有四个人被关注了四年。我想将 NA 替换为 0,但仅适用于在第一波面试中工作时间具有正值的人。例如,在我的数据中,这意味着 ID 为 3 和 4 的人的数据将被替换为 0,但 ID 为 2 的人将保留 his/her NA。
id wave year work_hours
1 1 2007 40
1 2 2008 39
1 3 2009 39
1 4 2010 38
2 1 2005 NA
2 2 2006 35
2 3 2007 35
2 4 2008 NA
3 1 2007 40
3 2 2008 NA
3 3 2009 40
3 4 2010 40
4 1 2009 32
4 2 2010 NA
4 3 2011 32
4 4 2012 NA
我尝试了以下代码,但它用 0 代替了第一个波浪,而不是后面的波浪:
df= df %>% group_by(id) %>%
mutate(workhours_imputed= ifelse(work_hours>0 & wave==1, replace_na(0), work_hours))
这是数据:
structure(list(id = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
4, 4, 4), wave = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
3, 4), year = c(2007, 2008, 2009, 2010, 2005, 2006, 2007, 2008,
2007, 2008, 2009, 2010, 2009, 2010, 2011, 2012), work_hours = c(40,
39, 39, 38, NA, 35, 35, NA, 40, NA, 40, 40, 32, NA, 32, NA),
workhours_imputed = c(0, 39, 39, 38, NA, 35, 35, NA, 0, NA,
40, 40, 0, NA, 32, NA)), row.names = c(NA, -16L), groups = structure(list(
id = c(1, 2, 3, 4), .rows = structure(list(1:4, 5:8, 9:12,
13:16), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
一种方法是使用 match
-
library(dplyr)
df %>%
group_by(id) %>%
mutate(workhours_imputed = {
tmp <- work_hours[match(1, wave)]
#If the 1st wave has a positive value
#replace NA with 0
if(!is.na(tmp) && tmp > 0) replace(work_hours, is.na(work_hours), 0) else work_hours
})
# id wave year work_hours workhours_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2007 40 40
# 2 1 2 2008 39 39
# 3 1 3 2009 39 39
# 4 1 4 2010 38 38
# 5 2 1 2005 NA NA
# 6 2 2 2006 35 35
# 7 2 3 2007 35 35
# 8 2 4 2008 NA NA
# 9 3 1 2007 40 40
#10 3 2 2008 NA 0
#11 3 3 2009 40 40
#12 3 4 2010 40 40
#13 4 1 2009 32 32
#14 4 2 2010 NA 0
#15 4 3 2011 32 32
#16 4 4 2012 NA 0
替代 dplyr 解决方案:
df %>%
mutate(workhours_imputed = if_else(
is.na(work_hours) & any(wave == 1 & !is.na(work_hours)),
0, work_hours)
)
# # A tibble: 16 x 5
# # Groups: id [4]
# id wave year work_hours workhours_imputed
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2007 40 40
# 2 1 2 2008 39 39
# 3 1 3 2009 39 39
# 4 1 4 2010 38 38
# 5 2 1 2005 NA NA
# 6 2 2 2006 35 35
# 7 2 3 2007 35 35
# 8 2 4 2008 NA NA
# 9 3 1 2007 40 40
# 10 3 2 2008 NA 0
# 11 3 3 2009 40 40
# 12 3 4 2010 40 40
# 13 4 1 2009 32 32
# 14 4 2 2010 NA 0
# 15 4 3 2011 32 32
# 16 4 4 2012 NA 0
如果 wave
并不总是从 1 开始,但您总是想检查 wave
的第一个值,那么您可以改用它:
df %>%
mutate(workhours_imputed = if_else(
is.na(work_hours) & !is.na(work_hours[which.min(wave)]),
0, work_hours)
)