使用重新编码以使用命名向量的命名列表跨多个列进行变异
Use recode to mutate across multiple columns using named list of named vectors
我找不到与我在这里遇到的问题类似的问题。我有一个非常大的命名向量列表,这些命名向量与数据框中的列名匹配。我想使用命名向量列表来替换与每个列表元素名称匹配的数据框列中的值。也就是说,列表中向量的名称与数据框列的名称相匹配,每个向量元素中的键值对将用于重新编码该列。
下面的代表:
library(tidyverse)
# Starting tibble
test <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
我可以使用 mutate
并手动指定列和列表项。
# Works when replacement vector is specified
test %>%
mutate(across(c("A"),
~recode(., !!!dicts$A)))
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie 1 a
#> 2 Bob delta 2 g
#> 3 Cindy bravo b 9
但是,以下方法不起作用:
# Does not work when replacement vector using column names
test %>%
mutate(across(c("A", "B", "C"),
~recode(., !!!dicts$.)))
Error: Problem with mutate()
input ..1
.
x No replacements provided.
i Input ..1
is (function (.cols = everything(), .fns = NULL, ..., .names = NULL) ...
.
此外,我发现 map2_dfr
仅在指定所有未重新编码的列时才有效:
# map2_dfr Sort of works, but requires dropping some columns
map2_dfr(test %>% select(names(dicts)),
dicts,
~recode(.x, !!!.y))
#> # A tibble: 3 x 3
#> A B C
#> <chr> <chr> <chr>
#> 1 charlie yes delta
#> 2 delta no epsilon
#> 3 bravo bad beta
我希望使用列表中的名称对列重新编码,而不删除列。
一种解决方法是使用您的 map2_dfr
代码,然后将需要的列绑定到 map2_dfr
输出。尽管您仍然必须删除名称列。
library(tidyverse)
map2_dfr(test %>% select(names(dicts)),
dicts,
~ recode(.x,!!!.y)) %>%
dplyr::bind_cols(., Names = test$Names) %>%
dplyr::select(4, 1:3)
输出
# A tibble: 3 × 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
合并两者的解决方案可以是,
library(dplyr)
library(tidyr)
test %>%
pivot_longer(-1) %>%
left_join(stack(dicts) %>%
rownames_to_column('value'),
by = c('value', 'name' = 'ind')) %>%
pivot_wider(id_cols = -value, names_from = name, values_from = values)
# A tibble: 3 x 4
# Names A B C
# <chr> <chr> <chr> <chr>
#1 Alice charlie yes delta
#2 Bob delta no epsilon
#3 Cindy bravo bad beta
以下是三种方法:
首先,我们可以使用 dplyr::cur_column()
.
使其在自定义函数中与 dplyr::across
一起使用
library(tidyverse)
myfun <- function(x) {
mycol <- cur_column()
dplyr::recode(x, !!! dicts[[mycol]])
}
test %>%
mutate(across(c("A", "B", "C"), myfun))
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
第二种选择是将 dicts
转换为表达式列表,然后使用 !!!
运算符将其拼接为 mutate
:
expr_ls <- imap(dicts, ~ quo(recode(!!sym(.y), !!!.x)))
test %>%
mutate(!!! expr_ls)
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
最后,在更大的 tidyverse 中我们可以使用 purrr::lmap_at
,但它使底层函数比需要的更复杂:
myfun2 <- function(x) {
x_nm <- names(x)
mutate(x, !! x_nm := recode(!! sym(x_nm), !!! dicts[[x_nm]]))
}
lmap_at(test,
names(dicts),
myfun2)
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
原始数据
# Starting tibble
test <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
由 reprex package (v2.0.1)
于 2021-12-15 创建
您可以尝试下面的基本 R 代码
idx <- match(names(dicts), names(test))
test[idx] <- Map(`[`, dicts, test[idx])
这给出了
> test
# A tibble: 3 x 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
使用 base R 并重新编码:
for (x in names(dicts)) { test[[x]] <- do.call(recode, c(list(test[[x]]), dicts[[x]])) }
> test
# A tibble: 3 × 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
另请注意,基于 Map()
或 str_replace_all()
的其他解决方案仅适用,因为测试示例仅使用简单的替换。如果使用 .default
或 .missing
,它们很可能会失败。
Base R(应该很容易翻译成 dplyr
)
# Helper function
look_dict <- function(col, values) dicts[[col]][values]
# lapply
test[names(dicts)] <- lapply(names(dicts), \(col) look_dict(col, test[[col]]))
# magrittr and for loop to avoid repeating code
library(magrittr)
for (col in names(dicts)) test[[col]] %<>% look_dict(col, .)
# # A tibble: 3 x 4
# Names A B C
# <chr> <chr> <chr> <chr>
# 1 Alice charlie yes delta
# 2 Bob delta no epsilon
# 3 Cindy bravo bad beta
已编辑
这是一个使用 qdap::mgsub
的管道友好解决方案。恐怕 stringr::str_replace_all
和 stringi::stri_replace_first_fixed()
似乎都不起作用。详情请看评论。
test %>%
mutate(across(
c("A", "B", "C"),
~qdap::mgsub( names(dicts[[cur_column()]]), dicts[[cur_column()]], .x)
))
不是完整的答案,但我认为(在撰写本文时)现有解决方案的基准可能会有所帮助。与每个基准 YMMV 一样:
正如我们所见,sindri_baldur
的基本 R 版本实际上是最快的
(代码如下)
bench::mark(
karl_base_r(data, dicts),
tim_across(data, dicts),
tim_lmap(data, dicts),
sotos_pivot(data, dicts),
thomas_base_r(data, dicts),
sindri_base_r(data, dicts),
check = FALSE
)
#> # A tibble: 6 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 karl_base_r(data, dicts) 825.9us 968.9us 814. 428.17KB 6.25
#> 2 tim_across(data, dicts) 5.04ms 6.44ms 147. 2.4MB 4.15
#> 3 tim_lmap(data, dicts) 7.34ms 8.49ms 108. 106.06KB 4.17
#> 4 sotos_pivot(data, dicts) 12.79ms 14.58ms 60.6 1.26MB 4.18
#> 5 thomas_base_r(data, dicts) 392us 438.6us 1891. 0B 4.07
#> 6 sindir_base_r(data, dicts) 116.8us 136.7us 5793. 0B 4.11
更大的数据集
对于更大的数据集,ThomasIsCoding
基础 R 版本比 Sindir 的解决方案快一点。
set.seed(15)
data_large <- data %>% sample_n(1e6, replace = TRUE)
bench::mark(
karl_base_r(data_large, dicts),
tim_across(data_large, dicts),
tim_lmap(data_large, dicts),
thomas_base_r(data_large, dicts),
sindir_base_r(data_large, dicts),
check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 5 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 karl_base_r(data_large, dicts) 856ms 856ms 1.17 503.9MB 9.35
#> 2 tim_across(data_large, dicts) 647ms 647ms 1.55 504.9MB 10.8
#> 3 tim_lmap(data_large, dicts) 809ms 809ms 1.24 503.6MB 11.1
#> 4 thomas_base_r(data_large, dicts) 131ms 148ms 6.53 80.1MB 3.27
#> 5 sindir_base_r(data_large, dicts) 150ms 180ms 5.08 80.1MB 5.08
代码
library(tidyverse)
library(magrittr)
# Starting tibble
data <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
# function definitions
karl_base_r <- function(data, dicts) {
for (x in names(dicts))
{data[[x]] <- do.call(recode, c(list(data[[x]]), dicts[[x]])) }
data
}
tim_across <- function(data, dicts) {
myfun <- function(x) {
mycol <- cur_column()
dplyr::recode(x, !!! dicts[[mycol]])
}
data %>%
mutate(across(c("A", "B", "C"), myfun))
}
tim_lmap <- function(data, dicts) {
myfun2 <- function(x) {
x_nm <- names(x)
mutate(x, !! x_nm := recode(!! sym(x_nm), !!! dicts[[x_nm]]))
}
lmap_at(data,
names(dicts),
myfun2)
}
sotos_pivot <- function(data, dicts) {
data %>%
pivot_longer(-1) %>%
left_join(stack(dicts) %>%
rownames_to_column('value'),
by = c('value', 'name' = 'ind')) %>%
pivot_wider(id_cols = -value, names_from = name, values_from = values)
}
thomas_base_r <- function(data, dicts) {
idx <- match(names(dicts), names(data))
data[idx] <- Map(`[`, dicts, data[idx])
data
}
sindri_base_r <- function(data, dicts) {
look_dict <- function(col, values) dicts[[col]][values]
data[names(dicts)] <- lapply(names(dicts), \(col) look_dict(col, data[[col]]))
data
}
由 reprex package (v2.0.0)
于 2021-12-15 创建
使用 purrr
的另一种选择,无需深入研究复杂的潮汐问题。
library(purrr)
library(tibble)
test %>%
lmap_at(c("A", "B", "C"),
~ as_tibble_col(dicts[[names(.x)]][unlist(.x)], names(.x)))
# # A tibble: 3 x 4
# Names A B C
# <chr> <chr> <chr> <chr>
# 1 Alice charlie yes delta
# 2 Bob delta no epsilon
# 3 Cindy bravo bad beta
如果有 modify2_at()
或 imodify_at()
函数之类的东西,这会很容易,但这里我们使用 lmap_at()
作为变通方法。
我找不到与我在这里遇到的问题类似的问题。我有一个非常大的命名向量列表,这些命名向量与数据框中的列名匹配。我想使用命名向量列表来替换与每个列表元素名称匹配的数据框列中的值。也就是说,列表中向量的名称与数据框列的名称相匹配,每个向量元素中的键值对将用于重新编码该列。
下面的代表:
library(tidyverse)
# Starting tibble
test <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
我可以使用 mutate
并手动指定列和列表项。
# Works when replacement vector is specified
test %>%
mutate(across(c("A"),
~recode(., !!!dicts$A)))
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie 1 a
#> 2 Bob delta 2 g
#> 3 Cindy bravo b 9
但是,以下方法不起作用:
# Does not work when replacement vector using column names
test %>%
mutate(across(c("A", "B", "C"),
~recode(., !!!dicts$.)))
Error: Problem with
mutate()
input..1
. x No replacements provided. i Input..1
is(function (.cols = everything(), .fns = NULL, ..., .names = NULL) ...
.
此外,我发现 map2_dfr
仅在指定所有未重新编码的列时才有效:
# map2_dfr Sort of works, but requires dropping some columns
map2_dfr(test %>% select(names(dicts)),
dicts,
~recode(.x, !!!.y))
#> # A tibble: 3 x 3
#> A B C
#> <chr> <chr> <chr>
#> 1 charlie yes delta
#> 2 delta no epsilon
#> 3 bravo bad beta
我希望使用列表中的名称对列重新编码,而不删除列。
一种解决方法是使用您的 map2_dfr
代码,然后将需要的列绑定到 map2_dfr
输出。尽管您仍然必须删除名称列。
library(tidyverse)
map2_dfr(test %>% select(names(dicts)),
dicts,
~ recode(.x,!!!.y)) %>%
dplyr::bind_cols(., Names = test$Names) %>%
dplyr::select(4, 1:3)
输出
# A tibble: 3 × 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
合并两者的解决方案可以是,
library(dplyr)
library(tidyr)
test %>%
pivot_longer(-1) %>%
left_join(stack(dicts) %>%
rownames_to_column('value'),
by = c('value', 'name' = 'ind')) %>%
pivot_wider(id_cols = -value, names_from = name, values_from = values)
# A tibble: 3 x 4
# Names A B C
# <chr> <chr> <chr> <chr>
#1 Alice charlie yes delta
#2 Bob delta no epsilon
#3 Cindy bravo bad beta
以下是三种方法:
首先,我们可以使用 dplyr::cur_column()
.
dplyr::across
一起使用
library(tidyverse)
myfun <- function(x) {
mycol <- cur_column()
dplyr::recode(x, !!! dicts[[mycol]])
}
test %>%
mutate(across(c("A", "B", "C"), myfun))
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
第二种选择是将 dicts
转换为表达式列表,然后使用 !!!
运算符将其拼接为 mutate
:
expr_ls <- imap(dicts, ~ quo(recode(!!sym(.y), !!!.x)))
test %>%
mutate(!!! expr_ls)
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
最后,在更大的 tidyverse 中我们可以使用 purrr::lmap_at
,但它使底层函数比需要的更复杂:
myfun2 <- function(x) {
x_nm <- names(x)
mutate(x, !! x_nm := recode(!! sym(x_nm), !!! dicts[[x_nm]]))
}
lmap_at(test,
names(dicts),
myfun2)
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
原始数据
# Starting tibble
test <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
由 reprex package (v2.0.1)
于 2021-12-15 创建您可以尝试下面的基本 R 代码
idx <- match(names(dicts), names(test))
test[idx] <- Map(`[`, dicts, test[idx])
这给出了
> test
# A tibble: 3 x 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
使用 base R 并重新编码:
for (x in names(dicts)) { test[[x]] <- do.call(recode, c(list(test[[x]]), dicts[[x]])) }
> test
# A tibble: 3 × 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
另请注意,基于 Map()
或 str_replace_all()
的其他解决方案仅适用,因为测试示例仅使用简单的替换。如果使用 .default
或 .missing
,它们很可能会失败。
Base R(应该很容易翻译成 dplyr
)
# Helper function
look_dict <- function(col, values) dicts[[col]][values]
# lapply
test[names(dicts)] <- lapply(names(dicts), \(col) look_dict(col, test[[col]]))
# magrittr and for loop to avoid repeating code
library(magrittr)
for (col in names(dicts)) test[[col]] %<>% look_dict(col, .)
# # A tibble: 3 x 4
# Names A B C
# <chr> <chr> <chr> <chr>
# 1 Alice charlie yes delta
# 2 Bob delta no epsilon
# 3 Cindy bravo bad beta
已编辑
这是一个使用 qdap::mgsub
的管道友好解决方案。恐怕 stringr::str_replace_all
和 stringi::stri_replace_first_fixed()
似乎都不起作用。详情请看评论。
test %>%
mutate(across(
c("A", "B", "C"),
~qdap::mgsub( names(dicts[[cur_column()]]), dicts[[cur_column()]], .x)
))
不是完整的答案,但我认为(在撰写本文时)现有解决方案的基准可能会有所帮助。与每个基准 YMMV 一样:
正如我们所见,sindri_baldur
的基本 R 版本实际上是最快的
(代码如下)
bench::mark(
karl_base_r(data, dicts),
tim_across(data, dicts),
tim_lmap(data, dicts),
sotos_pivot(data, dicts),
thomas_base_r(data, dicts),
sindri_base_r(data, dicts),
check = FALSE
)
#> # A tibble: 6 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 karl_base_r(data, dicts) 825.9us 968.9us 814. 428.17KB 6.25
#> 2 tim_across(data, dicts) 5.04ms 6.44ms 147. 2.4MB 4.15
#> 3 tim_lmap(data, dicts) 7.34ms 8.49ms 108. 106.06KB 4.17
#> 4 sotos_pivot(data, dicts) 12.79ms 14.58ms 60.6 1.26MB 4.18
#> 5 thomas_base_r(data, dicts) 392us 438.6us 1891. 0B 4.07
#> 6 sindir_base_r(data, dicts) 116.8us 136.7us 5793. 0B 4.11
更大的数据集
对于更大的数据集,ThomasIsCoding
基础 R 版本比 Sindir 的解决方案快一点。
set.seed(15)
data_large <- data %>% sample_n(1e6, replace = TRUE)
bench::mark(
karl_base_r(data_large, dicts),
tim_across(data_large, dicts),
tim_lmap(data_large, dicts),
thomas_base_r(data_large, dicts),
sindir_base_r(data_large, dicts),
check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 5 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 karl_base_r(data_large, dicts) 856ms 856ms 1.17 503.9MB 9.35
#> 2 tim_across(data_large, dicts) 647ms 647ms 1.55 504.9MB 10.8
#> 3 tim_lmap(data_large, dicts) 809ms 809ms 1.24 503.6MB 11.1
#> 4 thomas_base_r(data_large, dicts) 131ms 148ms 6.53 80.1MB 3.27
#> 5 sindir_base_r(data_large, dicts) 150ms 180ms 5.08 80.1MB 5.08
代码
library(tidyverse)
library(magrittr)
# Starting tibble
data <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
# function definitions
karl_base_r <- function(data, dicts) {
for (x in names(dicts))
{data[[x]] <- do.call(recode, c(list(data[[x]]), dicts[[x]])) }
data
}
tim_across <- function(data, dicts) {
myfun <- function(x) {
mycol <- cur_column()
dplyr::recode(x, !!! dicts[[mycol]])
}
data %>%
mutate(across(c("A", "B", "C"), myfun))
}
tim_lmap <- function(data, dicts) {
myfun2 <- function(x) {
x_nm <- names(x)
mutate(x, !! x_nm := recode(!! sym(x_nm), !!! dicts[[x_nm]]))
}
lmap_at(data,
names(dicts),
myfun2)
}
sotos_pivot <- function(data, dicts) {
data %>%
pivot_longer(-1) %>%
left_join(stack(dicts) %>%
rownames_to_column('value'),
by = c('value', 'name' = 'ind')) %>%
pivot_wider(id_cols = -value, names_from = name, values_from = values)
}
thomas_base_r <- function(data, dicts) {
idx <- match(names(dicts), names(data))
data[idx] <- Map(`[`, dicts, data[idx])
data
}
sindri_base_r <- function(data, dicts) {
look_dict <- function(col, values) dicts[[col]][values]
data[names(dicts)] <- lapply(names(dicts), \(col) look_dict(col, data[[col]]))
data
}
由 reprex package (v2.0.0)
于 2021-12-15 创建使用 purrr
的另一种选择,无需深入研究复杂的潮汐问题。
library(purrr)
library(tibble)
test %>%
lmap_at(c("A", "B", "C"),
~ as_tibble_col(dicts[[names(.x)]][unlist(.x)], names(.x)))
# # A tibble: 3 x 4
# Names A B C
# <chr> <chr> <chr> <chr>
# 1 Alice charlie yes delta
# 2 Bob delta no epsilon
# 3 Cindy bravo bad beta
如果有 modify2_at()
或 imodify_at()
函数之类的东西,这会很容易,但这里我们使用 lmap_at()
作为变通方法。