如何替换第一个事件中缺失的观察值的缺失值?

How to replace missing values for observations that are missing on the first event?

我正在对面板数据中的社会阶层(class)变量进行插补。我想用每个人在第一波中出现的社会阶层替换其所有波次的社会阶层值。但是,有些人缺少第一波的数据。我也想为这些人插补社会阶层,但只从该变量首次被观测到的那一波开始。如果我使用以下命令:

# Naive attempt: copy the class recorded in each person's first row to all of
# that person's rows. When that first row is missing ("."), the "." itself is
# what gets copied, so such persons are not imputed.
df %>%
  group_by(id) %>%
  dplyr::mutate(
    class_imputed = class[1L]
  )

# Groups:   id [4]
      id  wave  year class class_imputed
   <dbl> <dbl> <dbl> <fct> <fct>        
 1     1     1  2007 3     3            
 2     1     2  2008 2     3            
 3     1     3  2009 2     3            
 4     1     4  2010 2     3            
 5     2     1  2005 .     .            
 6     2     2  2006 2     .            
 7     2     3  2007 3     .            
 8     2     4  2008 .     .            
 9     3     1  2007 .     .            
10     3     2  2008 .     .            
11     3     3  2009 1     .            
12     3     4  2010 1     .            
13     4     1  2009 2     2            
14     4     2  2010 .     2            
15     4     3  2011 3     2            
16     4     4  2012 .     2    

我们可以看到,ID 为 2 和 3 的人没有被插补。我只想从首次观测到社会阶层(class)变量的那一期开始插补。也就是说,ID 为 2 的人在 2006 年、2007 年和 2008 年应得到 class = 2,而 2005 年保持缺失。有人可以帮忙吗?

这是数据:

# Example panel: 4 persons (id), each observed in 4 waves. `class` is a factor
# in which "." marks a missing observation; `class_imputed` holds the result of
# the first(class) attempt. The object is bound to `df`, the name the snippets
# operate on.
df <- structure(
  list(
    id = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4),
    wave = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4),
    year = c(
      2007, 2008, 2009, 2010, 2005, 2006, 2007, 2008,
      2007, 2008, 2009, 2010, 2009, 2010, 2011, 2012
    ),
    class = structure(
      c(4L, 3L, 3L, 3L, 1L, 3L, 4L, 1L, 1L, 1L, 2L, 2L, 3L, 1L, 4L, 1L),
      .Label = c(".", "1", "2", "3"),
      class = "factor"
    ),
    class_imputed = structure(
      c(3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
      .Label = c(".", "2", "3"),
      class = "factor"
    )
  ),
  row.names = c(NA, -16L),
  class = "data.frame"
)
 
library(tidyverse)

# Example panel: 4 persons (id), each observed in 4 waves. `class` and
# `class_imputed` are factors in which "." marks a missing value.
data <- structure(
  list(
    id = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4),
    wave = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4),
    year = c(
      2007, 2008, 2009, 2010, 2005, 2006, 2007, 2008,
      2007, 2008, 2009, 2010, 2009, 2010, 2011, 2012
    ),
    class = structure(
      c(4L, 3L, 3L, 3L, 1L, 3L, 4L, 1L, 1L, 1L, 2L, 2L, 3L, 1L, 4L, 1L),
      .Label = c(".", "1", "2", "3"),
      class = "factor"
    ),
    class_imputed = structure(
      c(3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
      .Label = c(".", "2", "3"),
      class = "factor"
    )
  ),
  row.names = c(NA, -16L),
  class = "data.frame"
)

# Impute each person's first observed class from the wave in which it is first
# observed onward; waves before that first observation stay missing.
data %>%
  # Recode the "." placeholder as NA on the column itself: calling na_if() on a
  # whole data frame is rejected by dplyr >= 1.1.0.
  mutate(class = na_if(class, ".")) %>%
  # Work within each person so values never leak from one id into the next.
  group_by(id) %>%
  mutate(
    class_imputed = replace(
      class,
      cumsum(!is.na(class)) > 0, # TRUE from the first observed wave onward
      class[!is.na(class)][1]    # first observed class (NA if never observed)
    )
  ) %>%
  ungroup() %>%
  as.data.frame()
#>    id wave year class class_imputed
#> 1   1    1 2007     3             3
#> 2   1    2 2008     2             3
#> 3   1    3 2009     2             3
#> 4   1    4 2010     2             3
#> 5   2    1 2005  <NA>          <NA>
#> 6   2    2 2006     2             2
#> 7   2    3 2007     3             2
#> 8   2    4 2008  <NA>             2
#> 9   3    1 2007  <NA>          <NA>
#> 10  3    2 2008  <NA>          <NA>
#> 11  3    3 2009     1             1
#> 12  3    4 2010     1             1
#> 13  4    1 2009     2             2
#> 14  4    2 2010  <NA>             2
#> 15  4    3 2011     3             2
#> 16  4    4 2012  <NA>             2

由 reprex 包 (v2.0.1) 于 2021 年 10 月 1 日创建

您可以试试这个选项-

library(dplyr)

# For each person, keep `class` unchanged up to and including the first
# observed (non-".") wave, then overwrite every later wave with that first
# observed class.
df %>%
  group_by(id) %>%
  dplyr::mutate(class_imputed = replace(class,
                      row_number() > match(TRUE, class != '.'),
                      class[class != '.'][1])) %>%
  # Drop the grouping so later verbs do not silently keep operating per id.
  ungroup()

#     id  wave  year class class_imputed
#   <dbl> <dbl> <dbl> <fct> <fct>        
# 1     1     1  2007 3     3            
# 2     1     2  2008 2     3            
# 3     1     3  2009 2     3            
# 4     1     4  2010 2     3            
# 5     2     1  2005 .     .            
# 6     2     2  2006 2     2            
# 7     2     3  2007 3     2            
# 8     2     4  2008 .     2            
# 9     3     1  2007 .     .            
#10     3     2  2008 .     .            
#11     3     3  2009 1     1            
#12     3     4  2010 1     1            
#13     4     1  2009 2     2            
#14     4     2  2010 .     2            
#15     4     3  2011 3     2            
#16     4     4  2012 .     2     

match(TRUE, class != '.') 返回第一个不等于 '.' 的值所在的位置,而 class[class != '.'][1] 返回用于插补的值,即第一个观测到的社会阶层(class)。

使用 data.table 更容易:
library(data.table)
# Convert to a data.table by reference and start from the observed values, so
# the result does not depend on a `class_imputed` column left over from an
# earlier attempt.
setDT(df)[, class_imputed := class]
# Row indices, per id, from the first non-"." class onward.
i1 <- df[, .I[cumsum(class != ".") > 0], by = id]$V1
# Within the selected rows of each id, the first class is the first observed
# one; write it to all of those rows.
df[i1, class_imputed := first(class), by = id]

-输出

> df
    id wave year class class_imputed
 1:  1    1 2007     3             3
 2:  1    2 2008     2             3
 3:  1    3 2009     2             3
 4:  1    4 2010     2             3
 5:  2    1 2005     .             .
 6:  2    2 2006     2             2
 7:  2    3 2007     3             2
 8:  2    4 2008     .             2
 9:  3    1 2007     .             .
10:  3    2 2008     .             .
11:  3    3 2009     1             1
12:  3    4 2010     1             1
13:  4    1 2009     2             2
14:  4    2 2010     .             2
15:  4    3 2011     3             2
16:  4    4 2012     .             2