将数据文件和标签文件组合在一起,在 R 中得到一个单独的标签数据帧
Combine data file and label file together to have one single labelled dataframe in R
我有两个数据框,一个是调查数据(data.csv),另一个是标签数据(label.csv)。这是样本数据(我的原始数据有大约150个变量)
#sample data
df <- tibble::tribble(
~id, ~House_member, ~dob, ~age_quota, ~work, ~sex, ~pss,
1L, 4L, 1983L, 2L, 2L, 1, 1,
2L, 1L, 1940L, 7L, 2L, 1, 2,
3L, 2L, 1951L, 5L, 6L, 1, 1,
4L, 4L, 1965L, 2L, 2L, 1, 4,
5L, 3L, 1965L, 2L, 3L, 1, 1,
6L, 1L, 1951L, 3L, 1L, 1, 3,
7L, 1L, 1955L, 1L, 1L, 1, 3,
8L, 4L, 1982L, 2L, 2L, 2, 5,
9L, 2L, 1990L, 2L, 4L, 2, 3,
10L, 2L, 1953L, 3L, 2L, 2, 4
)
#sample label data
label <- tibble::tribble(
~variable, ~value, ~label,
"House_member", NA, "How many people live with you?",
"House_member", 1L, "1 person",
"House_member", 2L, "2 persons",
"House_member", 3L, "3 persons",
"House_member", 4L, "4 persons",
"House_member", 5L, "5 persons",
"House_member", 6L, "6 persons",
"House_member", 7L, "7 persons",
"House_member", 8L, "8 persons",
"House_member", 9L, "9 persons",
"House_member", 10L, "10 or more",
"dob", NA, "date of brith",
"age_quota", NA, "age_quota",
"age_quota", 1L, "10-14",
"age_quota", 2L, "15-19",
"age_quota", 3L, "20-29",
"age_quota", 4L, "30-39",
"age_quota", 5L, "40-49",
"age_quota", 6L, "50-70",
"age_quota", 7L, "70 +",
"work", NA, "what is your occupation?",
"work", 1L, "full time",
"work", 2L, "part time",
"work", 3L, "retired",
"work", 4L, "student",
"work", 5L, "housewife",
"work", 6L, "unemployed",
"work", 7L, "other",
"work", 8L, "kid under 15",
"sex", NA, "gender?",
"sex", 1L, "Man",
"sex", 2L, "Woman",
"pss", NA, "How often do you use PS?",
"pss", 1L, "Daily",
"pss", 2L, "several times per week",
"pss", 3L, "once per week",
"pss", 4L, "several time per month",
"pss", 5L, "Rarly"
)
我想知道有什么方法可以将这些文件组合在一起,形成一个像 SPSS
的样式格式(dbl+lbl 格式)一样的标记数据框。我知道 labelled
包可以将值标签添加到未标记的向量中,例如这个例子:
v <- labelled::labelled(c(1,2,2,2,3,9,1,3,2,NA), c(yes = 1, maybe = 2, no = 3))
我希望有一种 better/faster 的方法,而不是一个一个地为每个变量添加标签。
这是一种使用 purrr::imap_dfc
的方法
library(dplyr)
library(purrr)
# custom function for taking the column data and column name and reformat the values using factor
custom_function <- function(value, col_name) {
matching_vairable <- label %>%
filter(variable == col_name & !is.na(value)) %>%
select(label, value)
column_data <- tibble(!!sym(col_name) := value)
if (nrow(matching_vairable) > 0) {
column_data[[1]] <- labelled::labelled(column_data[[1]],
tibble::deframe(matching_vairable))
}
column_data
}
new_df <- imap_dfc(df, .f = custom_function)
输出
new_df
#> # A tibble: 10 x 7
#> id House_member dob age_quota work sex pss
#> <int> <int+lbl> <int> <int+lbl> <int+lbl> <dbl+lb> <dbl+lbl>
#> 1 1 4 [4 persons] 1983 2 [15-19] 2 [part tim… 1 [Man] 1 [Daily]
#> 2 2 1 [1 person] 1940 7 [70 +] 2 [part tim… 1 [Man] 2 [several times p…
#> 3 3 2 [2 persons] 1951 5 [40-49] 6 [unemploy… 1 [Man] 1 [Daily]
#> 4 4 4 [4 persons] 1965 2 [15-19] 2 [part tim… 1 [Man] 4 [several time pe…
#> 5 5 3 [3 persons] 1965 2 [15-19] 3 [retired] 1 [Man] 1 [Daily]
#> 6 6 1 [1 person] 1951 3 [20-29] 1 [full tim… 1 [Man] 3 [once per week]
#> 7 7 1 [1 person] 1955 1 [10-14] 1 [full tim… 1 [Man] 3 [once per week]
#> 8 8 4 [4 persons] 1982 2 [15-19] 2 [part tim… 2 [Woma… 5 [Rarly]
#> 9 9 2 [2 persons] 1990 2 [15-19] 4 [student] 2 [Woma… 3 [once per week]
#> 10 10 2 [2 persons] 1953 3 [20-29] 2 [part tim… 2 [Woma… 4 [several time pe…
new_df %>% pull(House_member)
#> <labelled<integer>[10]>
#> [1] 4 1 2 4 3 1 1 4 2 2
#>
#> Labels:
#> value label
#> 1 1 person
#> 2 2 persons
#> 3 3 persons
#> 4 4 persons
#> 5 5 persons
#> 6 6 persons
#> 7 7 persons
#> 8 8 persons
#> 9 9 persons
#> 10 10 or more
由 reprex package (v2.0.0)
于 2021-05-16 创建
另一个imap_dfc
解决方案:
library(tidyverse)
df %>% imap_dfc(~{
label[label$variable==.y,c('label','value')] %>%
deframe() %>% # to named vector
haven::labelled(.x,.)
})
# A tibble: 10 x 7
id House_member dob age_quota work sex pss
<int+lbl> <int+lbl> <int+lbl> <int+lbl> <int+lbl> <dbl+lbl> <dbl+lbl>
1 1 4 [4 persons] 1983 2 [15-19] 2 [part time] 1 [Man] 1 [Daily]
2 2 1 [1 person] 1940 7 [70 +] 2 [part time] 1 [Man] 2 [several times per week]
3 3 2 [2 persons] 1951 5 [40-49] 6 [unemployed] 1 [Man] 1 [Daily]
4 4 4 [4 persons] 1965 2 [15-19] 2 [part time] 1 [Man] 4 [several time per month]
5 5 3 [3 persons] 1965 2 [15-19] 3 [retired] 1 [Man] 1 [Daily]
6 6 1 [1 person] 1951 3 [20-29] 1 [full time] 1 [Man] 3 [once per week]
7 7 1 [1 person] 1955 1 [10-14] 1 [full time] 1 [Man] 3 [once per week]
8 8 4 [4 persons] 1982 2 [15-19] 2 [part time] 2 [Woman] 5 [Rarly]
9 9 2 [2 persons] 1990 2 [15-19] 4 [student] 2 [Woman] 3 [once per week]
10 10 2 [2 persons] 1953 3 [20-29] 2 [part time] 2 [Woman] 4 [several time per month]
使用了 tibble::deframe
和 haven::labelled
,它们包含在 tidyverse
中
直接访问label
替换filter
/select
后的速度对比:
Waldi <- function() {
df %>% imap_dfc(~{
label[label$variable==.y,c('label','value')] %>%
deframe() %>% # to named vector
haven::labelled(.x,.)})}
Waldi_old <- function() {
df %>% imap_dfc(~{
label %>% filter(variable==.y) %>%
select(label, value) %>%
deframe() %>% # to named vector
haven::labelled(.x,.)
})}
#EDIT : Included TIC33() for-loop solution
microbenchmark::microbenchmark(TIC3(),Waldi(),Anil(),TIC1(),Waldi_old(),Sinh())
Unit: microseconds
expr min lq mean median uq max neval cld
TIC3() 688.0 871.80 982.280 920.95 1005.55 1801.6 100 a
Waldi() 1345.5 1543.60 1804.758 1635.45 1893.75 4306.8 100 b
Anil() 4006.8 4476.65 5188.519 4862.95 5439.10 10163.6 100 c
TIC1() 3898.2 4278.80 5009.927 4774.95 5277.05 12916.2 100 c
Waldi_old() 18712.3 20091.75 21756.140 20609.35 22169.75 33359.8 100 d
Sinh() 22730.9 24093.45 25931.412 24946.00 26614.00 38735.3 100 e
虽然不如@Waldi 提出的其他答案那么快,但由于使用了最少的外部包,这可能被视为一种选择。
交替使用 purrr::imap_*
和 tibble::deframe
,这可以在 dplyr
中仅使用 mutate(across(..
完成,如下所示-
library(dplyr)
library(labelled)
df %>%
mutate(across(everything(), ~labelled::labelled(as.double(.),
setNames(label$value[label$variable == cur_column()],
label$label[label$variable == cur_column()])
)))
# A tibble: 10 x 7
id House_member dob age_quota work sex pss
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
1 1 4 [4 persons] 1983 2 [15-19] 2 [part time] 1 [Man] 1 [Daily]
2 2 1 [1 person] 1940 7 [70 +] 2 [part time] 1 [Man] 2 [several times per week]
3 3 2 [2 persons] 1951 5 [40-49] 6 [unemployed] 1 [Man] 1 [Daily]
4 4 4 [4 persons] 1965 2 [15-19] 2 [part time] 1 [Man] 4 [several time per month]
5 5 3 [3 persons] 1965 2 [15-19] 3 [retired] 1 [Man] 1 [Daily]
6 6 1 [1 person] 1951 3 [20-29] 1 [full time] 1 [Man] 3 [once per week]
7 7 1 [1 person] 1955 1 [10-14] 1 [full time] 1 [Man] 3 [once per week]
8 8 4 [4 persons] 1982 2 [15-19] 2 [part time] 2 [Woman] 5 [Rarly]
9 9 2 [2 persons] 1990 2 [15-19] 4 [student] 2 [Woman] 3 [once per week]
10 10 2 [2 persons] 1953 3 [20-29] 2 [part time] 2 [Woman] 4 [several time per month]
如前所述,在注释中,您需要将输出列作为 dbl + lbl
,因此第一个参数已用作 as.double(.)
而不是 .
,其中输出将是 int + lbl
当输入列是 integer
类型时。
以下是 回答的一些变体。似乎 for
循环(参见 TIC3()
)提供了不错的速度。
- 变体 1
TIC1 <- function() {
df %>%
mutate(
across(everything(), ~ labelled(
.,
with(label, setNames(value, label)[variable == cur_column()])
))
)
}
- 变体 2(如果您不想
NA
被标记)
TIC2 <- function() {
df %>%
mutate(
across(
with(label, unique(variable[!is.na(value)])),
~ labelled(
.,
with(label, setNames(value, label)[variable == cur_column()])
)
)
)
}
- 变体 3(
for
循环版本 TIC1()
)
TIC3 <- function() {
nms <- names(df)
for (k in nms[nms %in% label$variable]) {
df[[k]] <- labelled(df[[k]], with(label, setNames(value, label)[variable == k]))
}
df
}
基准测试
TIC1 <- function() {
df %>%
mutate(
across(everything(), ~ labelled(
.,
with(label, setNames(value, label)[variable == cur_column()])
))
)
}
TIC2 <- function() {
df %>%
mutate(
across(
with(label, unique(variable[!is.na(value)])),
~ labelled(
.,
with(label, setNames(value, label)[variable == cur_column()])
)
)
)
}
TIC3 <- function() {
nms <- names(df)
for (k in nms[nms %in% label$variable]) {
df[[k]] <- labelled(df[[k]], with(label, setNames(value, label)[variable == k]))
}
df
}
Waldi1 <- function() {
df %>% imap_dfc(~ {
label %>%
filter(variable == .y) %>%
select(label, value) %>%
deframe() %>%
# to named vector
haven::labelled(.x, .)
})
}
Waldi2 <- function() {
df %>% imap_dfc(~ {
label[label$variable == .y, c("label", "value")] %>%
deframe() %>% # to named vector
haven::labelled(.x, .)
})
}
Anil <- function() {
df %>%
mutate(across(everything(), ~ labelled::labelled(
as.double(.),
setNames(
label$value[label$variable == cur_column()],
label$label[label$variable == cur_column()]
)
)))
}
custom_function <- function(value, col_name) {
matching_vairable <- label %>%
filter(variable == col_name & !is.na(value)) %>%
select(label, value)
column_data <- tibble(!!sym(col_name) := value)
if (nrow(matching_vairable) > 0) {
column_data[[1]] <- labelled::labelled(
column_data[[1]],
tibble::deframe(matching_vairable)
)
}
column_data
}
Sinh <- function(x) {
imap_dfc(df, .f = custom_function)
}
microbenchmark(
Waldi1(),
Waldi2(),
Anil(),
Sinh(),
TIC1(),
TIC2(),
TIC3(),
unit = "relative"
)
给予
Unit: relative
expr min lq mean median uq max neval
Waldi1() 17.540613 17.359550 17.019266 17.238594 18.502584 4.7788575 100
Waldi2() 1.355634 1.350547 1.338517 1.352509 1.342408 0.7033271 100
Anil() 3.996836 4.011826 3.902559 4.029819 3.937232 1.2877871 100
Sinh() 20.756122 20.595253 20.637410 20.452746 21.484992 13.0362139 100
TIC1() 3.617278 3.617310 3.480283 3.609973 3.526703 1.0682179 100
TIC2() 3.315545 3.384422 3.282862 3.389645 3.325616 1.0474304 100
TIC3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100
我有两个数据框,一个是调查数据(data.csv),另一个是标签数据(label.csv)。这是样本数据(我的原始数据有大约150个变量)
#sample data
df <- tibble::tribble(
~id, ~House_member, ~dob, ~age_quota, ~work, ~sex, ~pss,
1L, 4L, 1983L, 2L, 2L, 1, 1,
2L, 1L, 1940L, 7L, 2L, 1, 2,
3L, 2L, 1951L, 5L, 6L, 1, 1,
4L, 4L, 1965L, 2L, 2L, 1, 4,
5L, 3L, 1965L, 2L, 3L, 1, 1,
6L, 1L, 1951L, 3L, 1L, 1, 3,
7L, 1L, 1955L, 1L, 1L, 1, 3,
8L, 4L, 1982L, 2L, 2L, 2, 5,
9L, 2L, 1990L, 2L, 4L, 2, 3,
10L, 2L, 1953L, 3L, 2L, 2, 4
)
#sample label data
label <- tibble::tribble(
~variable, ~value, ~label,
"House_member", NA, "How many people live with you?",
"House_member", 1L, "1 person",
"House_member", 2L, "2 persons",
"House_member", 3L, "3 persons",
"House_member", 4L, "4 persons",
"House_member", 5L, "5 persons",
"House_member", 6L, "6 persons",
"House_member", 7L, "7 persons",
"House_member", 8L, "8 persons",
"House_member", 9L, "9 persons",
"House_member", 10L, "10 or more",
"dob", NA, "date of brith",
"age_quota", NA, "age_quota",
"age_quota", 1L, "10-14",
"age_quota", 2L, "15-19",
"age_quota", 3L, "20-29",
"age_quota", 4L, "30-39",
"age_quota", 5L, "40-49",
"age_quota", 6L, "50-70",
"age_quota", 7L, "70 +",
"work", NA, "what is your occupation?",
"work", 1L, "full time",
"work", 2L, "part time",
"work", 3L, "retired",
"work", 4L, "student",
"work", 5L, "housewife",
"work", 6L, "unemployed",
"work", 7L, "other",
"work", 8L, "kid under 15",
"sex", NA, "gender?",
"sex", 1L, "Man",
"sex", 2L, "Woman",
"pss", NA, "How often do you use PS?",
"pss", 1L, "Daily",
"pss", 2L, "several times per week",
"pss", 3L, "once per week",
"pss", 4L, "several time per month",
"pss", 5L, "Rarly"
)
我想知道有什么方法可以将这些文件组合在一起,形成一个像 SPSS
的样式格式(dbl+lbl 格式)一样的标记数据框。我知道 labelled
包可以将值标签添加到未标记的向量中,例如这个例子:
v <- labelled::labelled(c(1,2,2,2,3,9,1,3,2,NA), c(yes = 1, maybe = 2, no = 3))
我希望有一种 better/faster 的方法,而不是一个一个地为每个变量添加标签。
这是一种使用 purrr::imap_dfc
library(dplyr)
library(purrr)
# custom function for taking the column data and column name and reformat the values using factor
custom_function <- function(value, col_name) {
matching_vairable <- label %>%
filter(variable == col_name & !is.na(value)) %>%
select(label, value)
column_data <- tibble(!!sym(col_name) := value)
if (nrow(matching_vairable) > 0) {
column_data[[1]] <- labelled::labelled(column_data[[1]],
tibble::deframe(matching_vairable))
}
column_data
}
new_df <- imap_dfc(df, .f = custom_function)
输出
new_df
#> # A tibble: 10 x 7
#> id House_member dob age_quota work sex pss
#> <int> <int+lbl> <int> <int+lbl> <int+lbl> <dbl+lb> <dbl+lbl>
#> 1 1 4 [4 persons] 1983 2 [15-19] 2 [part tim… 1 [Man] 1 [Daily]
#> 2 2 1 [1 person] 1940 7 [70 +] 2 [part tim… 1 [Man] 2 [several times p…
#> 3 3 2 [2 persons] 1951 5 [40-49] 6 [unemploy… 1 [Man] 1 [Daily]
#> 4 4 4 [4 persons] 1965 2 [15-19] 2 [part tim… 1 [Man] 4 [several time pe…
#> 5 5 3 [3 persons] 1965 2 [15-19] 3 [retired] 1 [Man] 1 [Daily]
#> 6 6 1 [1 person] 1951 3 [20-29] 1 [full tim… 1 [Man] 3 [once per week]
#> 7 7 1 [1 person] 1955 1 [10-14] 1 [full tim… 1 [Man] 3 [once per week]
#> 8 8 4 [4 persons] 1982 2 [15-19] 2 [part tim… 2 [Woma… 5 [Rarly]
#> 9 9 2 [2 persons] 1990 2 [15-19] 4 [student] 2 [Woma… 3 [once per week]
#> 10 10 2 [2 persons] 1953 3 [20-29] 2 [part tim… 2 [Woma… 4 [several time pe…
new_df %>% pull(House_member)
#> <labelled<integer>[10]>
#> [1] 4 1 2 4 3 1 1 4 2 2
#>
#> Labels:
#> value label
#> 1 1 person
#> 2 2 persons
#> 3 3 persons
#> 4 4 persons
#> 5 5 persons
#> 6 6 persons
#> 7 7 persons
#> 8 8 persons
#> 9 9 persons
#> 10 10 or more
由 reprex package (v2.0.0)
于 2021-05-16 创建另一个imap_dfc
解决方案:
library(tidyverse)
df %>% imap_dfc(~{
label[label$variable==.y,c('label','value')] %>%
deframe() %>% # to named vector
haven::labelled(.x,.)
})
# A tibble: 10 x 7
id House_member dob age_quota work sex pss
<int+lbl> <int+lbl> <int+lbl> <int+lbl> <int+lbl> <dbl+lbl> <dbl+lbl>
1 1 4 [4 persons] 1983 2 [15-19] 2 [part time] 1 [Man] 1 [Daily]
2 2 1 [1 person] 1940 7 [70 +] 2 [part time] 1 [Man] 2 [several times per week]
3 3 2 [2 persons] 1951 5 [40-49] 6 [unemployed] 1 [Man] 1 [Daily]
4 4 4 [4 persons] 1965 2 [15-19] 2 [part time] 1 [Man] 4 [several time per month]
5 5 3 [3 persons] 1965 2 [15-19] 3 [retired] 1 [Man] 1 [Daily]
6 6 1 [1 person] 1951 3 [20-29] 1 [full time] 1 [Man] 3 [once per week]
7 7 1 [1 person] 1955 1 [10-14] 1 [full time] 1 [Man] 3 [once per week]
8 8 4 [4 persons] 1982 2 [15-19] 2 [part time] 2 [Woman] 5 [Rarly]
9 9 2 [2 persons] 1990 2 [15-19] 4 [student] 2 [Woman] 3 [once per week]
10 10 2 [2 persons] 1953 3 [20-29] 2 [part time] 2 [Woman] 4 [several time per month]
使用了 tibble::deframe
和 haven::labelled
,它们包含在 tidyverse
直接访问label
替换filter
/select
后的速度对比:
Waldi <- function() {
df %>% imap_dfc(~{
label[label$variable==.y,c('label','value')] %>%
deframe() %>% # to named vector
haven::labelled(.x,.)})}
Waldi_old <- function() {
df %>% imap_dfc(~{
label %>% filter(variable==.y) %>%
select(label, value) %>%
deframe() %>% # to named vector
haven::labelled(.x,.)
})}
#EDIT : Included TIC33() for-loop solution
microbenchmark::microbenchmark(TIC3(),Waldi(),Anil(),TIC1(),Waldi_old(),Sinh())
Unit: microseconds
expr min lq mean median uq max neval cld
TIC3() 688.0 871.80 982.280 920.95 1005.55 1801.6 100 a
Waldi() 1345.5 1543.60 1804.758 1635.45 1893.75 4306.8 100 b
Anil() 4006.8 4476.65 5188.519 4862.95 5439.10 10163.6 100 c
TIC1() 3898.2 4278.80 5009.927 4774.95 5277.05 12916.2 100 c
Waldi_old() 18712.3 20091.75 21756.140 20609.35 22169.75 33359.8 100 d
Sinh() 22730.9 24093.45 25931.412 24946.00 26614.00 38735.3 100 e
虽然不如@Waldi 提出的其他答案那么快,但由于使用了最少的外部包,这可能被视为一种选择。
交替使用 purrr::imap_*
和 tibble::deframe
,这可以在 dplyr
中仅使用 mutate(across(..
完成,如下所示-
library(dplyr)
library(labelled)
df %>%
mutate(across(everything(), ~labelled::labelled(as.double(.),
setNames(label$value[label$variable == cur_column()],
label$label[label$variable == cur_column()])
)))
# A tibble: 10 x 7
id House_member dob age_quota work sex pss
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
1 1 4 [4 persons] 1983 2 [15-19] 2 [part time] 1 [Man] 1 [Daily]
2 2 1 [1 person] 1940 7 [70 +] 2 [part time] 1 [Man] 2 [several times per week]
3 3 2 [2 persons] 1951 5 [40-49] 6 [unemployed] 1 [Man] 1 [Daily]
4 4 4 [4 persons] 1965 2 [15-19] 2 [part time] 1 [Man] 4 [several time per month]
5 5 3 [3 persons] 1965 2 [15-19] 3 [retired] 1 [Man] 1 [Daily]
6 6 1 [1 person] 1951 3 [20-29] 1 [full time] 1 [Man] 3 [once per week]
7 7 1 [1 person] 1955 1 [10-14] 1 [full time] 1 [Man] 3 [once per week]
8 8 4 [4 persons] 1982 2 [15-19] 2 [part time] 2 [Woman] 5 [Rarly]
9 9 2 [2 persons] 1990 2 [15-19] 4 [student] 2 [Woman] 3 [once per week]
10 10 2 [2 persons] 1953 3 [20-29] 2 [part time] 2 [Woman] 4 [several time per month]
如前所述,在注释中,您需要将输出列作为 dbl + lbl
,因此第一个参数已用作 as.double(.)
而不是 .
,其中输出将是 int + lbl
当输入列是 integer
类型时。
以下是 for
循环(参见 TIC3()
)提供了不错的速度。
- 变体 1
TIC1 <- function() {
df %>%
mutate(
across(everything(), ~ labelled(
.,
with(label, setNames(value, label)[variable == cur_column()])
))
)
}
- 变体 2(如果您不想
NA
被标记)
TIC2 <- function() {
df %>%
mutate(
across(
with(label, unique(variable[!is.na(value)])),
~ labelled(
.,
with(label, setNames(value, label)[variable == cur_column()])
)
)
)
}
- 变体 3(
for
循环版本TIC1()
)
TIC3 <- function() {
nms <- names(df)
for (k in nms[nms %in% label$variable]) {
df[[k]] <- labelled(df[[k]], with(label, setNames(value, label)[variable == k]))
}
df
}
基准测试
TIC1 <- function() {
df %>%
mutate(
across(everything(), ~ labelled(
.,
with(label, setNames(value, label)[variable == cur_column()])
))
)
}
TIC2 <- function() {
df %>%
mutate(
across(
with(label, unique(variable[!is.na(value)])),
~ labelled(
.,
with(label, setNames(value, label)[variable == cur_column()])
)
)
)
}
TIC3 <- function() {
nms <- names(df)
for (k in nms[nms %in% label$variable]) {
df[[k]] <- labelled(df[[k]], with(label, setNames(value, label)[variable == k]))
}
df
}
Waldi1 <- function() {
df %>% imap_dfc(~ {
label %>%
filter(variable == .y) %>%
select(label, value) %>%
deframe() %>%
# to named vector
haven::labelled(.x, .)
})
}
Waldi2 <- function() {
df %>% imap_dfc(~ {
label[label$variable == .y, c("label", "value")] %>%
deframe() %>% # to named vector
haven::labelled(.x, .)
})
}
Anil <- function() {
df %>%
mutate(across(everything(), ~ labelled::labelled(
as.double(.),
setNames(
label$value[label$variable == cur_column()],
label$label[label$variable == cur_column()]
)
)))
}
custom_function <- function(value, col_name) {
matching_vairable <- label %>%
filter(variable == col_name & !is.na(value)) %>%
select(label, value)
column_data <- tibble(!!sym(col_name) := value)
if (nrow(matching_vairable) > 0) {
column_data[[1]] <- labelled::labelled(
column_data[[1]],
tibble::deframe(matching_vairable)
)
}
column_data
}
Sinh <- function(x) {
imap_dfc(df, .f = custom_function)
}
microbenchmark(
Waldi1(),
Waldi2(),
Anil(),
Sinh(),
TIC1(),
TIC2(),
TIC3(),
unit = "relative"
)
给予
Unit: relative
expr min lq mean median uq max neval
Waldi1() 17.540613 17.359550 17.019266 17.238594 18.502584 4.7788575 100
Waldi2() 1.355634 1.350547 1.338517 1.352509 1.342408 0.7033271 100
Anil() 3.996836 4.011826 3.902559 4.029819 3.937232 1.2877871 100
Sinh() 20.756122 20.595253 20.637410 20.452746 21.484992 13.0362139 100
TIC1() 3.617278 3.617310 3.480283 3.609973 3.526703 1.0682179 100
TIC2() 3.315545 3.384422 3.282862 3.389645 3.325616 1.0474304 100
TIC3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100