将数据文件和标签文件组合在一起,在 R 中得到一个单独的标签数据帧

Combine data file and label file together to have one single labelled dataframe in R

我有两个数据框,一个是调查数据(data.csv),另一个是标签数据(label.csv)。这是样本数据(我的原始数据有大约150个变量)

#sample data

df <- tibble::tribble(
  ~id, ~House_member, ~dob, ~age_quota, ~work, ~sex, ~pss,
  1L,            4L,  1983L,  2L,        2L,     1,      1,
  2L,            1L,  1940L,  7L,        2L,     1,      2,
  3L,            2L,  1951L,  5L,        6L,     1,      1,
  4L,            4L,  1965L,  2L,        2L,     1,      4,
  5L,            3L,  1965L,  2L,        3L,     1,      1,
  6L,            1L,  1951L,  3L,        1L,     1,      3,
  7L,            1L,  1955L,  1L,        1L,     1,      3,
  8L,            4L,  1982L,  2L,        2L,     2,      5,
  9L,            2L,  1990L,  2L,        4L,     2,      3,
  10L,            2L,  1953L, 3L,        2L,     2,      4
)


#sample label data
label <- tibble::tribble(
                ~variable, ~value,                           ~label,
           "House_member",     NA, "How many people live with you?",
           "House_member",     1L,                       "1 person",
           "House_member",     2L,                      "2 persons",
           "House_member",     3L,                      "3 persons",
           "House_member",     4L,                      "4 persons",
           "House_member",     5L,                      "5 persons",
           "House_member",     6L,                      "6 persons",
           "House_member",     7L,                      "7 persons",
           "House_member",     8L,                      "8 persons",
           "House_member",     9L,                      "9 persons",
           "House_member",    10L,                     "10 or more",
                    "dob",     NA,                  "date of brith",
              "age_quota",     NA,                      "age_quota",
              "age_quota",     1L,                          "10-14",
              "age_quota",     2L,                          "15-19",
              "age_quota",     3L,                          "20-29",
              "age_quota",     4L,                          "30-39",
              "age_quota",     5L,                          "40-49",
              "age_quota",     6L,                          "50-70",
              "age_quota",     7L,                           "70 +",
                   "work",     NA,        "what is your occupation?",
                   "work",     1L,                      "full time",
                   "work",     2L,                      "part time",
                   "work",     3L,                        "retired",
                   "work",     4L,                        "student",
                   "work",     5L,                      "housewife",
                   "work",     6L,                     "unemployed",
                   "work",     7L,                          "other",
                   "work",     8L,                   "kid under 15",
                    "sex",     NA,                        "gender?",
                    "sex",     1L,                            "Man",
                    "sex",     2L,                          "Woman",
                    "pss",     NA,       "How often do you use PS?",
                    "pss",     1L,                          "Daily",
                    "pss",     2L,         "several times per week",
                    "pss",     3L,                  "once per week",
                    "pss",     4L,         "several time per month",
                    "pss",     5L,                          "Rarly"
           )

我想知道有什么方法可以将这些文件组合在一起,形成一个像 SPSS 的样式格式(dbl+lbl 格式)一样的标记数据框。我知道 labelled 包可以将值标签添加到未标记的向量中,例如这个例子:

v <- labelled::labelled(c(1,2,2,2,3,9,1,3,2,NA), c(yes = 1, maybe = 2, no = 3))

我希望有一种 better/faster 的方法,而不是一个一个地为每个变量添加标签。

这是一种使用 purrr::imap_dfc

的方法
library(dplyr)
library(purrr)

# custom function for taking the column data and column name and reformat the values using factor
custom_function <- function(value, col_name) { 
  matching_vairable <- label %>%
    filter(variable == col_name & !is.na(value)) %>%
    select(label, value)
  column_data <- tibble(!!sym(col_name) := value)
  if (nrow(matching_vairable) > 0) {
    column_data[[1]] <- labelled::labelled(column_data[[1]],
      tibble::deframe(matching_vairable))
  }
  column_data
}

new_df <- imap_dfc(df, .f = custom_function)

输出

new_df
#> # A tibble: 10 x 7
#>       id  House_member   dob age_quota         work      sex                 pss
#>    <int>     <int+lbl> <int> <int+lbl>    <int+lbl> <dbl+lb>           <dbl+lbl>
#>  1     1 4 [4 persons]  1983 2 [15-19] 2 [part tim… 1 [Man]  1 [Daily]          
#>  2     2 1 [1 person]   1940 7 [70 +]  2 [part tim… 1 [Man]  2 [several times p…
#>  3     3 2 [2 persons]  1951 5 [40-49] 6 [unemploy… 1 [Man]  1 [Daily]          
#>  4     4 4 [4 persons]  1965 2 [15-19] 2 [part tim… 1 [Man]  4 [several time pe…
#>  5     5 3 [3 persons]  1965 2 [15-19] 3 [retired]  1 [Man]  1 [Daily]          
#>  6     6 1 [1 person]   1951 3 [20-29] 1 [full tim… 1 [Man]  3 [once per week]  
#>  7     7 1 [1 person]   1955 1 [10-14] 1 [full tim… 1 [Man]  3 [once per week]  
#>  8     8 4 [4 persons]  1982 2 [15-19] 2 [part tim… 2 [Woma… 5 [Rarly]          
#>  9     9 2 [2 persons]  1990 2 [15-19] 4 [student]  2 [Woma… 3 [once per week]  
#> 10    10 2 [2 persons]  1953 3 [20-29] 2 [part tim… 2 [Woma… 4 [several time pe…

new_df %>% pull(House_member)
#> <labelled<integer>[10]>
#>  [1] 4 1 2 4 3 1 1 4 2 2
#> 
#> Labels:
#>  value      label
#>      1   1 person
#>      2  2 persons
#>      3  3 persons
#>      4  4 persons
#>      5  5 persons
#>      6  6 persons
#>      7  7 persons
#>      8  8 persons
#>      9  9 persons
#>     10 10 or more

reprex package (v2.0.0)

于 2021-05-16 创建

另一个imap_dfc解决方案:

library(tidyverse)

df %>% imap_dfc(~{ 
                  label[label$variable==.y,c('label','value')] %>%
                  deframe() %>% # to named vector
                  haven::labelled(.x,.)
                 })

# A tibble: 10 x 7
          id  House_member       dob age_quota           work       sex                        pss
   <int+lbl>     <int+lbl> <int+lbl> <int+lbl>      <int+lbl> <dbl+lbl>                  <dbl+lbl>
 1         1 4 [4 persons]      1983 2 [15-19] 2 [part time]  1 [Man]   1 [Daily]                 
 2         2 1 [1 person]       1940 7 [70 +]  2 [part time]  1 [Man]   2 [several times per week]
 3         3 2 [2 persons]      1951 5 [40-49] 6 [unemployed] 1 [Man]   1 [Daily]                 
 4         4 4 [4 persons]      1965 2 [15-19] 2 [part time]  1 [Man]   4 [several time per month]
 5         5 3 [3 persons]      1965 2 [15-19] 3 [retired]    1 [Man]   1 [Daily]                 
 6         6 1 [1 person]       1951 3 [20-29] 1 [full time]  1 [Man]   3 [once per week]         
 7         7 1 [1 person]       1955 1 [10-14] 1 [full time]  1 [Man]   3 [once per week]         
 8         8 4 [4 persons]      1982 2 [15-19] 2 [part time]  2 [Woman] 5 [Rarly]                 
 9         9 2 [2 persons]      1990 2 [15-19] 4 [student]    2 [Woman] 3 [once per week]         
10        10 2 [2 persons]      1953 3 [20-29] 2 [part time]  2 [Woman] 4 [several time per month]

使用了 tibble::deframehaven::labelled,它们包含在 tidyverse

直接访问label替换filter/select后的速度对比:

Waldi <- function() {
df %>% imap_dfc(~{ 
    label[label$variable==.y,c('label','value')] %>%
      deframe() %>% # to named vector
      haven::labelled(.x,.)})}

Waldi_old <- function() {   
    df %>% imap_dfc(~{ 
      label %>% filter(variable==.y) %>%
        select(label, value) %>%
        deframe() %>% # to named vector
        haven::labelled(.x,.)
    })}

#EDIT : Included TIC33() for-loop solution

microbenchmark::microbenchmark(TIC3(),Waldi(),Anil(),TIC1(),Waldi_old(),Sinh())
Unit: microseconds
        expr     min       lq      mean   median       uq     max neval   cld
      TIC3()   688.0   871.80   982.280   920.95  1005.55  1801.6   100 a    
     Waldi()  1345.5  1543.60  1804.758  1635.45  1893.75  4306.8   100  b   
      Anil()  4006.8  4476.65  5188.519  4862.95  5439.10 10163.6   100   c  
      TIC1()  3898.2  4278.80  5009.927  4774.95  5277.05 12916.2   100   c  
 Waldi_old() 18712.3 20091.75 21756.140 20609.35 22169.75 33359.8   100    d 
      Sinh() 22730.9 24093.45 25931.412 24946.00 26614.00 38735.3   100     e

虽然不如@Waldi 提出的其他答案那么快,但由于使用了最少的外部包,这可能被视为一种选择。

交替使用 purrr::imap_*tibble::deframe,这可以在 dplyr 中仅使用 mutate(across(.. 完成,如下所示-

library(dplyr)
library(labelled)

df %>%
  mutate(across(everything(), ~labelled::labelled(as.double(.), 
                                                  setNames(label$value[label$variable == cur_column()], 
                                                           label$label[label$variable == cur_column()])
                                                  )))


# A tibble: 10 x 7
          id  House_member       dob age_quota           work       sex                        pss
   <dbl+lbl>     <dbl+lbl> <dbl+lbl> <dbl+lbl>      <dbl+lbl> <dbl+lbl>                  <dbl+lbl>
 1         1 4 [4 persons]      1983 2 [15-19] 2 [part time]  1 [Man]   1 [Daily]                 
 2         2 1 [1 person]       1940 7 [70 +]  2 [part time]  1 [Man]   2 [several times per week]
 3         3 2 [2 persons]      1951 5 [40-49] 6 [unemployed] 1 [Man]   1 [Daily]                 
 4         4 4 [4 persons]      1965 2 [15-19] 2 [part time]  1 [Man]   4 [several time per month]
 5         5 3 [3 persons]      1965 2 [15-19] 3 [retired]    1 [Man]   1 [Daily]                 
 6         6 1 [1 person]       1951 3 [20-29] 1 [full time]  1 [Man]   3 [once per week]         
 7         7 1 [1 person]       1955 1 [10-14] 1 [full time]  1 [Man]   3 [once per week]         
 8         8 4 [4 persons]      1982 2 [15-19] 2 [part time]  2 [Woman] 5 [Rarly]                 
 9         9 2 [2 persons]      1990 2 [15-19] 4 [student]    2 [Woman] 3 [once per week]         
10        10 2 [2 persons]      1953 3 [20-29] 2 [part time]  2 [Woman] 4 [several time per month]

如前所述,在注释中,您需要将输出列作为 dbl + lbl,因此第一个参数已用作 as.double(.) 而不是 .,其中输出将是 int + lbl 当输入列是 integer 类型时。

以下是 回答的一些变体。似乎 for 循环(参见 TIC3())提供了不错的速度。

  • 变体 1
TIC1 <- function() {
  df %>%
    mutate(
      across(everything(), ~ labelled(
        .,
        with(label, setNames(value, label)[variable == cur_column()])
      ))
    )
}
  • 变体 2(如果您不想 NA 被标记)
TIC2 <- function() {
  df %>%
    mutate(
      across(
        with(label, unique(variable[!is.na(value)])),
        ~ labelled(
          .,
          with(label, setNames(value, label)[variable == cur_column()])
        )
      )
    )
}
  • 变体 3(for 循环版本 TIC1()
TIC3 <- function() {
  nms <- names(df)
  for (k in nms[nms %in% label$variable]) {
    df[[k]] <- labelled(df[[k]], with(label, setNames(value, label)[variable == k]))
  }
  df
}

基准测试

TIC1 <- function() {
  df %>%
    mutate(
      across(everything(), ~ labelled(
        .,
        with(label, setNames(value, label)[variable == cur_column()])
      ))
    )
}

TIC2 <- function() {
  df %>%
    mutate(
      across(
        with(label, unique(variable[!is.na(value)])),
        ~ labelled(
          .,
          with(label, setNames(value, label)[variable == cur_column()])
        )
      )
    )
}

TIC3 <- function() {
  nms <- names(df)
  for (k in nms[nms %in% label$variable]) {
    df[[k]] <- labelled(df[[k]], with(label, setNames(value, label)[variable == k]))
  }
  df
}


Waldi1 <- function() {
  df %>% imap_dfc(~ {
    label %>%
      filter(variable == .y) %>%
      select(label, value) %>%
      deframe() %>%
      # to named vector
      haven::labelled(.x, .)
  })
}

Waldi2 <- function() {
  df %>% imap_dfc(~ {
    label[label$variable == .y, c("label", "value")] %>%
      deframe() %>% # to named vector
      haven::labelled(.x, .)
  })
}

Anil <- function() {
  df %>%
    mutate(across(everything(), ~ labelled::labelled(
      as.double(.),
      setNames(
        label$value[label$variable == cur_column()],
        label$label[label$variable == cur_column()]
      )
    )))
}

custom_function <- function(value, col_name) {
  matching_vairable <- label %>%
    filter(variable == col_name & !is.na(value)) %>%
    select(label, value)
  column_data <- tibble(!!sym(col_name) := value)
  if (nrow(matching_vairable) > 0) {
    column_data[[1]] <- labelled::labelled(
      column_data[[1]],
      tibble::deframe(matching_vairable)
    )
  }
  column_data
}

Sinh <- function(x) {
  imap_dfc(df, .f = custom_function)
}

microbenchmark(
  Waldi1(),
  Waldi2(),
  Anil(),
  Sinh(),
  TIC1(),
  TIC2(),
  TIC3(),
  unit = "relative"
)

给予

Unit: relative
     expr       min        lq      mean    median        uq        max neval
 Waldi1() 17.540613 17.359550 17.019266 17.238594 18.502584  4.7788575   100
 Waldi2()  1.355634  1.350547  1.338517  1.352509  1.342408  0.7033271   100
   Anil()  3.996836  4.011826  3.902559  4.029819  3.937232  1.2877871   100
   Sinh() 20.756122 20.595253 20.637410 20.452746 21.484992 13.0362139   100
   TIC1()  3.617278  3.617310  3.480283  3.609973  3.526703  1.0682179   100
   TIC2()  3.315545  3.384422  3.282862  3.389645  3.325616  1.0474304   100
   TIC3()  1.000000  1.000000  1.000000  1.000000  1.000000  1.0000000   100