如何仅对面板数据中首次观测值为正的人替换缺失值?

How to replace missing values only for people who have positive values on the first observation in panel data?

我的数据中有四个人,每人被跟踪调查了四年。我想将 NA 替换为 0,但仅针对在第一波(wave 1)调查中工作时间为正值的人。例如,在我的数据中,这意味着 ID 为 3 和 4 的人的 NA 将被替换为 0,而 ID 为 2 的人的 NA 保持不变。

id wave year work_hours
1   1   2007    40
1   2   2008    39
1   3   2009    39
1   4   2010    38
2   1   2005    NA
2   2   2006    35
2   3   2007    35
2   4   2008    NA
3   1   2007    40
3   2   2008    NA
3   3   2009    40
3   4   2010    40
4   1   2009    32
4   2   2010    NA
4   3   2011    32
4   4   2012    NA

我尝试了以下代码,但它把第一波(wave 1)的值替换成了 0,而不是替换后续各波中的 NA:

# Asker's attempt (does NOT produce the desired result).
# The condition `work_hours>0 & wave==1` is evaluated row by row, so it is TRUE
# only on a wave-1 row whose hours are positive. That row itself receives the
# yes-branch value, while every other row -- including the NA rows that were
# meant to be filled -- falls through to the unchanged `work_hours`.
df= df %>% group_by(id) %>% 
  mutate(workhours_imputed= ifelse(work_hours>0 & wave==1, replace_na(0), work_hours))

这是数据:

# Data dump of the asker's data frame (looks like dput() output; it is not
# assigned here, so bind it to `df` before running the snippets that use `df`).
# 16 rows: ids 1-4, each with waves 1-4.
# Note: the object is already a grouped_df grouped by `id`, and it already
# contains a `workhours_imputed` column whose wave-1 entries are 0 wherever
# `work_hours` is non-missing.
structure(list(id = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 
4, 4, 4), wave = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 
3, 4), year = c(2007, 2008, 2009, 2010, 2005, 2006, 2007, 2008, 
2007, 2008, 2009, 2010, 2009, 2010, 2011, 2012), work_hours = c(40, 
39, 39, 38, NA, 35, 35, NA, 40, NA, 40, 40, 32, NA, 32, NA), 
    workhours_imputed = c(0, 39, 39, 38, NA, 35, 35, NA, 0, NA, 
    40, 40, 0, NA, 32, NA)), row.names = c(NA, -16L), groups = structure(list(
    id = c(1, 2, 3, 4), .rows = structure(list(1:4, 5:8, 9:12, 
        13:16), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -4L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

一种方法是使用 match -

library(dplyr)

# match()-based approach: look up each person's wave-1 hours, then decide once
# per id whether that person's missing hours are filled with 0.
df %>%
  group_by(id) %>%
  mutate(workhours_imputed = {
    wave1_hours <- work_hours[match(1, wave)]
    # Fill only when wave 1 was observed with strictly positive hours;
    # isTRUE() turns a missing wave-1 value into "do not fill".
    if (isTRUE(wave1_hours > 0)) {
      filled <- work_hours
      filled[is.na(filled)] <- 0
      filled
    } else {
      work_hours
    }
  })

#      id  wave  year work_hours workhours_imputed
#   <dbl> <dbl> <dbl>      <dbl>             <dbl>
# 1     1     1  2007         40                40
# 2     1     2  2008         39                39
# 3     1     3  2009         39                39
# 4     1     4  2010         38                38
# 5     2     1  2005         NA                NA
# 6     2     2  2006         35                35
# 7     2     3  2007         35                35
# 8     2     4  2008         NA                NA
# 9     3     1  2007         40                40
#10     3     2  2008         NA                 0
#11     3     3  2009         40                40
#12     3     4  2010         40                40
#13     4     1  2009         32                32
#14     4     2  2010         NA                 0
#15     4     3  2011         32                32
#16     4     4  2012         NA                 0

替代 dplyr 解决方案:

# Vectorised alternative using if_else().
# group_by(id) is stated explicitly so that any() is evaluated per person; on
# an ungrouped data frame it would look across all ids and fill everyone's NAs.
# The wave-1 check requires a non-missing AND strictly positive value, which is
# the rule the question asks for.
df %>%
  group_by(id) %>%
  mutate(workhours_imputed = if_else(
    is.na(work_hours) & any(wave == 1 & !is.na(work_hours) & work_hours > 0),
    0, work_hours)
  )
# # A tibble: 16 x 5
# # Groups:   id [4]
#       id  wave  year work_hours workhours_imputed
#    <dbl> <dbl> <dbl>      <dbl>             <dbl>
#  1     1     1  2007         40                40
#  2     1     2  2008         39                39
#  3     1     3  2009         39                39
#  4     1     4  2010         38                38
#  5     2     1  2005         NA                NA
#  6     2     2  2006         35                35
#  7     2     3  2007         35                35
#  8     2     4  2008         NA                NA
#  9     3     1  2007         40                40
# 10     3     2  2008         NA                 0
# 11     3     3  2009         40                40
# 12     3     4  2010         40                40
# 13     4     1  2009         32                32
# 14     4     2  2010         NA                 0
# 15     4     3  2011         32                32
# 16     4     4  2012         NA                 0

如果 wave 并不总是从 1 开始,但您总是想检查每个人最早的那一波(wave 的最小值),可以改用以下代码:

# Variant for panels whose first wave is not necessarily numbered 1: the
# reference observation is the row with the smallest `wave` within each id.
# group_by(id) is stated explicitly so that which.min() picks each person's own
# first wave rather than the first wave of the whole data frame.
# isTRUE(... > 0) is a single TRUE/FALSE per person: it is FALSE when the
# first-wave hours are missing, non-positive, or when no first wave can be
# located (which.min() returns a zero-length index if every `wave` is NA).
df %>%
  group_by(id) %>%
  mutate(workhours_imputed = if_else(
    is.na(work_hours) & isTRUE(work_hours[which.min(wave)] > 0),
    0, work_hours)
  )