数据清理:在 RStudio 中将横截面数据(多个文件)整理为面板数据:merge/gather?

Data cleaning, from cross-sectional (multiple files) to panel in RStudio: merge/gather?

首先,很抱歉提出这个问题。我知道肯定有办法做到这一点,但我已经为此苦苦挣扎了好几天,现在有些束手无策了。

我有 2008 年到 2020 年间个人层面的年度观测数据,涉及多类变量:家庭(25 个变量)、收入(15 个变量)和学校教育(22 个变量)。现在,每个数据集都已经 'cleaned'(清理过),因此同一类别在各个年份的数据集都具有相同的列名。作为背景,这就是我的 R 环境现在的样子。

问题是,我想要一个大数据集,把所有个人和年份都放在同一个数据框中。我知道我应该(或者说可以)先使用 inner_join 或 merge 函数按 'Householdmember' 进行匹配合并,也可以使用 gather 函数,但我实在不清楚应该按什么顺序来做,以及应该从哪里开始。我已经尝试了很多办法,但由于数据框的数量太多,很难跟踪自己在做什么。我还按照某种方法的建议,为每个类别创建了包含各年份数据的列表,但没有成功...

我想得到一个看起来类似于这样的数据框:

Individual Year Var1 Var2
1 2008 value value
1 2009 value value
1 2010 value value
2 2008 value value
2 2009 value value
2 2010 value value

我真的希望有人能帮助我或告诉我第一步应该做什么...如果我合并数据框,我认为 R 不知道哪些值对应于哪一年...

    > head(fam08)
# A tibble: 6 x 25
  HouseholdMember RandomChild YearBirthRandom  Gender   Age FatherBirth FatherAlive MotherBirth MotherAlive Divorce SeeFather SeeMother
            <dbl>   <dbl+lbl>           <dbl> <dbl+l> <dbl>   <dbl+lbl>   <dbl+lbl>   <dbl+lbl>   <dbl+lbl> <dbl+l> <dbl+lbl> <dbl+lbl>
1          800033 16 [not ap…              NA 1 [mal…    16        1952     1 [yes]        1961     1 [yes] 1 [yes]  7 [ever…  7 [ever…
2          800042 16 [not ap…              NA 2 [fem…    32        1946     1 [yes]        1948     1 [yes] 2 [no]   4 [at l…  4 [at l…
3          800045 16 [not ap…              NA 1 [mal…    65        1913     2 [no]         1915     2 [no]  2 [no]  NA        NA       
4          800057 16 [not ap…              NA 1 [mal…    33        1939     1 [yes]        1945     1 [yes] 1 [yes]  4 [at l…  4 [at l…
5          800076 16 [not ap…              NA 2 [fem…    22        1955     1 [yes]        1955     1 [yes] 1 [yes]  5 [at l…  3 [a fe…
6          800119 16 [not ap…              NA 2 [fem…    57        1908     2 [no]         1918     2 [no]  2 [no]  NA        NA       
# … with 13 more variables: Married <dbl+lbl>, Child <dbl+lbl>, NumChild <dbl>, SchoolCH1 <dbl+lbl>, SchoolCH2 <dbl+lbl>,
#   SchoolCH3 <dbl+lbl>, SchoolCH4 <dbl+lbl>, BirthCH1 <dbl>, BirthCH2 <dbl>, BirthCH3 <dbl>, BirthCH4 <dbl>, FamSatisfaction <dbl+lbl>,
#   Year <dbl>



> head(fam09)
# A tibble: 6 x 25
  HouseholdMember RandomChild YearBirthRandom  Gender   Age FatherBirth FatherAlive MotherBirth MotherAlive Divorce SeeFather SeeMother
            <dbl>   <dbl+lbl>           <dbl> <dbl+l> <dbl>   <dbl+lbl>   <dbl+lbl>   <dbl+lbl>   <dbl+lbl> <dbl+l> <dbl+lbl> <dbl+lbl>
1          800033 16 [not ap…              NA 1 [mal…    17        1952     1 [yes]        1961     1 [yes]      NA  5 [at l…  7 [ever…
2          800042 16 [not ap…              NA 2 [fem…    33        1946     1 [yes]        1948     1 [yes]      NA  4 [at l…  4 [at l…
3          800057 16 [not ap…              NA 1 [mal…    34        1939     1 [yes]        1945     1 [yes]      NA  3 [a fe…  3 [a fe…
4          800076 16 [not ap…              NA 2 [fem…    23        1955     1 [yes]        1955     1 [yes]      NA  5 [at l…  3 [a fe…
5          800119 16 [not ap…              NA 2 [fem…    58          NA    NA                NA    NA            NA NA        NA       
6          800125 16 [not ap…              NA 2 [fem…    50          NA    NA              1928     1 [yes]      NA NA         1 [neve…
# … with 13 more variables: Married <dbl+lbl>, Child <dbl+lbl>, NumChild <dbl>, SchoolCH1 <dbl+lbl>, SchoolCH2 <dbl+lbl>,
#   SchoolCH3 <dbl+lbl>, SchoolCH4 <dbl+lbl>, BirthCH1 <dbl>, BirthCH2 <dbl>, BirthCH3 <dbl>, BirthCH4 <dbl>, FamSatisfaction <dbl+lbl>,
#   Year <dbl>




dput(head(fam09,10))
structure(list(HouseholdMember = c(800033, 800042, 800057, 800076, 
800119, 800125, 800170, 800186, 800201, 800204), RandomChild = structure(c(16, 
16, 16, 16, 16, 16, 3, 16, 16, 16), label = "Randomly chosen child", labels = c(`child 1` = 1, 
`child 2` = 2, `child 3` = 3, `child 4` = 4, `child 5` = 5, `child 6` = 6, 
`child 7` = 7, `child 8` = 8, `child 9` = 9, `child 10` = 10, 
`child 11` = 11, `child 12` = 12, `child 13` = 13, `child 14` = 14, 
`child 15` = 15, `not applicable` = 16), class = "haven_labelled"), 
    YearBirthRandom = c(NA, NA, NA, NA, NA, NA, 1999, NA, NA, 
    NA), Gender = structure(c(1, 2, 1, 2, 2, 2, 2, 2, 1, 1), label = "Gender respondent", labels = c(male = 1, 
    female = 2), class = "haven_labelled"), Age = c(17, 33, 34, 
    23, 58, 50, 50, 69, 35, 67), FatherBirth = structure(c(1952, 
    1946, 1939, 1955, NA, NA, 1926, NA, 1948, NA), label = "What is the year of birth of your father?", labels = c(`I don't know` = 99999), class = "haven_labelled"), 
    FatherAlive = structure(c(1, 1, 1, 1, NA, NA, 1, NA, 1, NA
    ), label = "Is your father still alive?", labels = c(yes = 1, 
    no = 2, `I don't know` = 99), class = "haven_labelled"), 
    MotherBirth = structure(c(1961, 1948, 1945, 1955, NA, 1928, 
    1931, NA, 1950, NA), label = "What is the year of birth of your mother?", labels = c(`I don't know` = 99999), class = "haven_labelled"), 
    MotherAlive = structure(c(1, 1, 1, 1, NA, 1, 1, NA, 1, NA
    ), label = "Is your mother still alive?", labels = c(yes = 1, 
    no = 2, `I don't know` = 99), class = "haven_labelled"), 
    Divorce = structure(c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
    ), label = "Did your own parents ever divorce?", labels = c(yes = 1, 
    no = 2, `my parents never had a relationship` = 3, `I don't know` = 99
    ), class = "haven_labelled"), SeeFather = structure(c(5, 
    4, 3, 5, NA, NA, 6, NA, 3, NA), label = "How often did you see your father over the past 12 months?", labels = c(never = 1, 
    once = 2, `a few times` = 3, `at least every month` = 4, 
    `at least every week` = 5, `a few times per week` = 6, `every day` = 7
    ), class = "haven_labelled"), SeeMother = structure(c(7, 
    4, 3, 3, NA, 1, 6, NA, 3, NA), label = "How often did you see your mother over the past 12 months?", labels = c(never = 1, 
    once = 2, `a few times` = 3, `at least every month` = 4, 
    `at least every week` = 5, `a few times per week` = 6, `every day` = 7
    ), class = "haven_labelled"), Married = structure(c(NA, 1, 
    2, 2, 1, 2, 1, 1, 1, 1), label = "Are you married to this partner?", labels = c(yes = 1, 
    no = 2), class = "haven_labelled"), Child = structure(c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_), label = "Have you had any children?", labels = c(yes = 1, 
    no = 2), class = "haven_labelled"), NumChild = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_), SchoolCH1 = structure(c(NA, 
    NA, NA, NA, NA, NA, 4, NA, NA, NA), label = "What school does child 1 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1, 
    `school for special primary education` = 2, `secondary school` = 3, 
    other = 4), class = "haven_labelled"), SchoolCH2 = structure(c(NA, 
    NA, NA, NA, NA, NA, 3, NA, NA, NA), label = "What school does child 2 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1, 
    `school for special primary education` = 2, `secondary school` = 3, 
    other = 4), class = "haven_labelled"), SchoolCH3 = structure(c(NA, 
    NA, NA, NA, NA, NA, 1, NA, NA, NA), label = "What school does child 3 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1, 
    `school for special primary education` = 2, `secondary school` = 3, 
    other = 4), class = "haven_labelled"), SchoolCH4 = structure(c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_), label = "What school does child 4 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1, 
    `school for special primary education` = 2, `secondary school` = 3, 
    other = 4), class = "haven_labelled"), BirthCH1 = c(NA, 2005, 
    2007, NA, 1983, NA, 1991, 1964, NA, 1974), BirthCH2 = c(NA, 
    2007, NA, NA, 1985, NA, 1994, 1966, NA, 1976), BirthCH3 = c(NA, 
    NA, NA, NA, NA, NA, 1999, 1970, NA, NA), BirthCH4 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_), FamSatisfaction = structure(c(NA, 
    8, 9, NA, 8, NA, 8, NA, NA, NA), label = "How satisfied are you with your family life?", labels = c(`entirely dissatisfied` = 0, 
    `entirely satisfied` = 10, `I don’t know` = 999), class = "haven_labelled"), 
    Year = c(2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 
    2009, 2009)), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame"))

我认为你可以按照下面的思路来做:

# Stack the per-year tibbles of each category into one long tibble each.
fam <- bind_rows(fam_list)
inc <- bind_rows(inc_list)
ws <- bind_rows(ws_list)

# Starting from the `fam` rows, left-join `inc` and then `ws`, matching on
# the person-year key so every value stays attached to its own year.
result <- reduce(
  list(fam, inc, ws),
  left_join,
  by = c("HouseholdMember", "Year")
)

输出:

   HouseholdMember  Year fam_v1 fam_v2 fam_v3  inc_v1  inc_v2 inc_v3   ws_v1 ws_v2  ws_v3
             <dbl> <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl>  <dbl>   <dbl> <dbl>  <dbl>
 1            8001  2008  0.609 -0.253 -1.30   0.0147  0.719  -0.765  0.120  0.974 -0.764
 2            8002  2008  0.395  1.73  -0.503  0.119  -3.33   -0.798  0.325  0.664  1.65 
 3            8003  2008  0.562  0.157  0.243 -1.18   -0.260   0.105  1.09   0.855  1.19 
 4            8004  2008  1.32   0.737 -1.18   0.725  -1.82    0.356  0.362  2.04   1.76 
 5            8005  2008 -0.497 -0.444 -0.632 -0.534   1.63    0.984  1.29   0.614  0.576
 6            8006  2008 -1.70  -0.989 -1.32   0.868   0.0979  0.468 -0.0146 1.11   0.957
 7            8007  2008 -2.19  -0.419  1.69   1.34   -0.404  -1.43  -0.156  0.648 -0.186
 8            8008  2008  1.48   0.350 -0.595  0.785  -0.609   1.28  -1.01   1.04   0.845
 9            8009  2008 -0.315 -0.530  0.419  0.390  -0.0951 -0.755  0.135  0.696 -1.97 
10            8010  2008 -0.882  1.38   2.06  -0.0757  1.53   -0.494 -1.03   1.14   1.87 

注意:

我通过创建 tibble 的列表为这个例子构造了数据;我相信 fam_list、inc_list 和 ws_list 与您图片中的列表对象类似。它们都是由数据框/tibble 组成的列表。然后我使用 bind_rows 把这些结构相同的 tibble 按行绑定在一起,这样就得到了三个大的 tibble。

然后我使用两次 left_join,把 inc 和 ws 依次连接到 fam 上。

输入数据:

library(tidyverse)
# Simulate the family data: one tibble per survey year (2008-2020), each
# holding 100 household members and three standard-normal variables.
fam_list <- lapply(8:20, function(yy) {
  tibble(
    HouseholdMember = 8000 + seq_len(100),
    Year = 2000 + yy,
    fam_v1 = rnorm(100),
    fam_v2 = rnorm(100),
    fam_v3 = rnorm(100)
  )
})
# Build the names from the full four-digit year; pasting "fam_20" onto 8 and
# 9 would give "fam_208" / "fam_209" instead of "fam_2008" / "fam_2009".
names(fam_list) <- paste0("fam_", 2000 + 8:20)

# Simulate the income data: one tibble per survey year (2008-2020), each
# holding 100 household members and three standard-normal variables.
inc_list <- lapply(8:20, function(yy) {
  tibble(
    HouseholdMember = 8000 + seq_len(100),
    Year = 2000 + yy,
    inc_v1 = rnorm(100),
    inc_v2 = rnorm(100),
    inc_v3 = rnorm(100)
  )
})
# Build the names from the full four-digit year; pasting "inc_20" onto 8 and
# 9 would give "inc_208" / "inc_209" instead of "inc_2008" / "inc_2009".
names(inc_list) <- paste0("inc_", 2000 + 8:20)
# Simulate the `ws` data: one tibble per survey year (2008-2020), each
# holding 100 household members and three standard-normal variables.
ws_list <- lapply(8:20, function(yy) {
  tibble(
    HouseholdMember = 8000 + seq_len(100),
    Year = 2000 + yy,
    ws_v1 = rnorm(100),
    ws_v2 = rnorm(100),
    ws_v3 = rnorm(100)
  )
})
# Build the names from the full four-digit year; pasting "ws_20" onto 8 and
# 9 would give "ws_208" / "ws_209" instead of "ws_2008" / "ws_2009".
names(ws_list) <- paste0("ws_", 2000 + 8:20)

输入