将分组的 tibble 转换为命名列表
Converting grouped tibble to named list
我觉得 tidyverse 中可能有比 for 循环更好的方法来做到这一点。从一个标准的 tibble/data frame 开始,创建一个列表:列表元素的名称是某一列的唯一值(用 group_by?),列表元素则是另一列中对应的所有值。
# Example input: one row per (name, value) pair.
my_data <- tibble(
  list_names = c("Ford", "Chevy", "Ford", "Dodge", "Dodge", "Ford"),
  list_values = c("Ranger", "Equinox", "F150", "Caravan", "Ram", "Explorer")
)
# A tibble: 6 × 2
list_names list_values
<chr> <chr>
1 Ford Ranger
2 Chevy Equinox
3 Ford F150
4 Dodge Caravan
5 Dodge Ram
6 Ford Explorer
这是期望的输出:
# Target structure: one character vector per distinct name, in order of
# first appearance.
desired_output <- list(
  Ford = c("Ranger", "F150", "Explorer"),
  Chevy = "Equinox",
  Dodge = c("Caravan", "Ram")
)
$Ford
[1] "Ranger" "F150" "Explorer"
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
这可以用 for 循环来完成,但我敢打赌 tidyverse 中有某个函数可以让它更简单、更快,等等。
# Build the named list one group at a time.
# Looping over the distinct names (rather than over every row) filters the
# table once per group instead of once per row; the result is the same, with
# names in order of first appearance.
desired_output <- list()
for (group_name in unique(my_data$list_names)) {
  desired_output[[group_name]] <- my_data %>%
    filter(list_names == group_name) %>%
    pull(list_values)
}
我们可以使用split
# Split the values by name; the explicit factor levels keep the groups in
# order of first appearance instead of alphabetical order.
split(
  my_data$list_values,
  factor(my_data$list_names, levels = unique(my_data$list_names))
)
$Ford
[1] "Ranger" "F150" "Explorer"
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
或者用unstack
unstack(my_data, list_values ~ list_names)
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
$Ford
[1] "Ranger" "F150" "Explorer"
这是另一个选项(虽然有点冗长),使用 tidyverse 中的 group_modify 和 deframe。
library(tidyverse)

# For each group, wrap its values in a one-row list-column, then turn the
# resulting two-column (name, list) tibble into a named list.
my_data |>
  group_by(list_names) |>
  group_modify(function(grp, ...) tibble(res = list(deframe(grp)))) |>
  deframe()
或者,另一种选择是先使用 summarise,然后再次使用 deframe:
# Collapse each group's values into a list-column, then convert the
# two-column (name, list) tibble into a named list.
my_data |>
  group_by(list_names) |>
  summarise(named_vec = list(list_values)) |>
  deframe()
输出
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
$Ford
[1] "Ranger" "F150" "Explorer"
基准
我很想知道这里的答案中哪个最快。显然 @akrun 的 split 是迄今为止最快的,其次是 unstack。
# Time each candidate solution on my_data; every expression is evaluated
# 1000 times (times = 1000L). The argument names become the labels in `bm`.
bm <- microbenchmark::microbenchmark(
# Base R: split() with factor levels in order of first appearance.
akrun_split = with(my_data, split(list_values,
factor(list_names, levels = unique(list_names)))),
# Base R: unstack() with a values ~ groups formula.
akrun_unstack = unstack(my_data, list_values ~ list_names),
# tidyverse: group_modify() + deframe().
andrew_deframe1 = my_data |>
group_by(list_names) |>
group_modify(\(x, ...) tibble(res = list(deframe(x)))) |>
deframe(),
# tidyverse: summarise() into a list-column + deframe().
andrew_deframe2 = my_data %>%
group_by(list_names) %>%
summarise(named_vec = list(list_values)) %>%
deframe(),
# tidyverse: summarise() into a list-column + set_names().
paulsmith = my_data %>%
group_by(list_names) %>%
summarise(list_values = list(list_values)) %>%
{set_names(.$list_values, .$list_names)},
times=1000L
)
更新 II:将 tibble 输出更改为向量输出。感谢 Jeff Parker!
更新:根据 Jeff Parker 的评论(请参阅评论),我更新了代码。问题在于之前设置名称时没有排序;使用 sort 之后就可以正确地使用 setNames。然后我添加了 map,应用 dplyr 的 select 删除每个数据框中的第一列:
library(dplyr)
library(purrr)
library(stringr) # str_split() comes from stringr; dplyr/purrr do not attach it

my_data %>%
  group_by(list_names) %>%
  # Collapse each group's values into one string and keep a single row per
  # group. NOTE: the later split assumes no value itself contains ", ".
  mutate(list_values = paste(list_values, collapse = ", ")) %>%
  slice(1) %>%
  group_split() %>%
  # Relies on group_split() returning the groups in the same order as sort().
  setNames(sort(unique(my_data$list_names))) %>%
  # Extract the value column by name rather than via a negative position.
  map(~ pull(.x, list_values)) %>%
  map(~ str_split(.x, ", ")[[1]])
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
$Ford
[1] "Ranger" "F150" "Explorer"
另一个可能的解决方案:
library(tidyverse)

# Example input: one row per (name, value) pair.
my_data <- tibble(
  list_names = c("Ford", "Chevy", "Ford", "Dodge", "Dodge", "Ford"),
  list_values = c("Ranger", "Equinox", "F150", "Caravan", "Ram", "Explorer")
)

# Gather each group's values into a list-column, then name that list by the
# group key.
my_data %>%
  group_by(list_names) %>%
  summarise(list_values = list(list_values)) %>%
  with(set_names(list_values, list_names))
#> $Chevy
#> [1] "Equinox"
#>
#> $Dodge
#> [1] "Caravan" "Ram"
#>
#> $Ford
#> [1] "Ranger" "F150" "Explorer"
只是为了好玩,用我的 for 循环和 TarJae 的回答添加了基准。
# Time every answer, including the original for-loop and TarJae's approach;
# each expression is evaluated 100 times (times = 100L). The argument names
# become the labels in `bm`.
bm <- microbenchmark::microbenchmark(
# Base R: split() with factor levels in order of first appearance.
akrun_split = with(my_data, split(list_values,
factor(list_names, levels = unique(list_names)))),
# Base R: unstack() with a values ~ groups formula.
akrun_unstack = unstack(my_data, list_values ~ list_names),
# tidyverse: group_modify() + deframe().
andrew_deframe1 = my_data |>
group_by(list_names) |>
group_modify(\(x, ...) tibble(res = list(deframe(x)))) |>
deframe(),
# tidyverse: summarise() into a list-column + deframe().
andrew_deframe2 = my_data %>%
group_by(list_names) %>%
summarise(named_vec = list(list_values)) %>%
deframe(),
# tidyverse: summarise() into a list-column + set_names().
paulsmith = my_data %>%
group_by(list_names) %>%
summarise(list_values = list(list_values)) %>%
{set_names(.$list_values, .$list_names)},
# The question's for-loop: filters the whole table once per row.
jeffs = {
desired_output <- list()
for(i in seq_along(my_data$list_names)) {
entry <- my_data %>%
filter(list_names == my_data$list_names[i]) %>%
pull(list_values)
desired_output[[my_data$list_names[i]]] <- entry
}
desired_output},
# Paste each group's values together, split the groups, then split the strings.
TarJae = my_data %>%
group_by(list_names) %>%
mutate(list_values= paste(list_values, collapse = ", ")) %>%
slice(1) %>%
group_split() %>%
setNames(sort(unique(my_data$list_names))) %>%
map(., dplyr::pull, -list_names) %>%
map(., ~str_split(.x, ", ")[[1]] ),
times=100L
)
我还使用更大的数据集,对两个最快的选项(来自 akrun)运行了基准测试。
library(nycflights13)

# Larger benchmark input: carrier becomes the grouping column and flight the
# value column.
my_data <- select(
  nycflights13::flights,
  list_names = carrier,
  list_values = flight
)
另一种方法:
library(tidyverse)

my_data %>%
  group_split(list_names) %>%
  # One single-element named list per group: the name is the group's key and
  # the element holds every value in the group. The values are not passed
  # through unique(), so repeated entries are kept.
  map(~ lst(!!unique(pull(.x, list_names)) := pull(.x, list_values))) %>%
  # Merge the per-group single-element lists into one named list.
  flatten()
#> $Chevy
#> [1] "Equinox"
#>
#> $Dodge
#> [1] "Caravan" "Ram"
#>
#> $Ford
#> [1] "Ranger" "F150" "Explorer"
由 reprex 包 (v2.0.1) 创建于 2022-01-14
数据:
# Example input: one row per (name, value) pair.
my_data <- tibble(list_names = c("Ford", "Chevy", "Ford", "Dodge", "Dodge", "Ford"),
                  list_values = c("Ranger", "Equinox", "F150", "Caravan", "Ram", "Explorer"))
我觉得 tidyverse 中可能有比 for 循环更好的方法来做到这一点。从一个标准的 tibble/data frame 开始,创建一个列表:列表元素的名称是某一列的唯一值(用 group_by?),列表元素则是另一列中对应的所有值。
# Example input: one row per (name, value) pair.
my_data <- tibble(
  list_names = c("Ford", "Chevy", "Ford", "Dodge", "Dodge", "Ford"),
  list_values = c("Ranger", "Equinox", "F150", "Caravan", "Ram", "Explorer")
)
# A tibble: 6 × 2
list_names list_values
<chr> <chr>
1 Ford Ranger
2 Chevy Equinox
3 Ford F150
4 Dodge Caravan
5 Dodge Ram
6 Ford Explorer
这是期望的输出:
# Target structure: one character vector per distinct name, in order of
# first appearance.
desired_output <- list(
  Ford = c("Ranger", "F150", "Explorer"),
  Chevy = "Equinox",
  Dodge = c("Caravan", "Ram")
)
$Ford
[1] "Ranger" "F150" "Explorer"
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
这可以用 for 循环来完成,但我敢打赌 tidyverse 中有某个函数可以让它更简单、更快,等等。
# Build the named list one group at a time.
# Looping over the distinct names (rather than over every row) filters the
# table once per group instead of once per row; the result is the same, with
# names in order of first appearance.
desired_output <- list()
for (group_name in unique(my_data$list_names)) {
  desired_output[[group_name]] <- my_data %>%
    filter(list_names == group_name) %>%
    pull(list_values)
}
我们可以使用split
# Split the values by name; the explicit factor levels keep the groups in
# order of first appearance instead of alphabetical order.
split(
  my_data$list_values,
  factor(my_data$list_names, levels = unique(my_data$list_names))
)
$Ford
[1] "Ranger" "F150" "Explorer"
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
或者用unstack
unstack(my_data, list_values ~ list_names)
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
$Ford
[1] "Ranger" "F150" "Explorer"
这是另一个选项(虽然有点冗长),使用 tidyverse 中的 group_modify 和 deframe。
library(tidyverse)

# For each group, wrap its values in a one-row list-column, then turn the
# resulting two-column (name, list) tibble into a named list.
my_data |>
  group_by(list_names) |>
  group_modify(function(grp, ...) tibble(res = list(deframe(grp)))) |>
  deframe()
或者,另一种选择是先使用 summarise,然后再次使用 deframe:
# Collapse each group's values into a list-column, then convert the
# two-column (name, list) tibble into a named list.
my_data |>
  group_by(list_names) |>
  summarise(named_vec = list(list_values)) |>
  deframe()
输出
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
$Ford
[1] "Ranger" "F150" "Explorer"
基准
我很想知道这里的答案中哪个最快。显然 @akrun 的 split 是迄今为止最快的,其次是 unstack。
# Time each candidate solution on my_data; every expression is evaluated
# 1000 times (times = 1000L). The argument names become the labels in `bm`.
bm <- microbenchmark::microbenchmark(
# Base R: split() with factor levels in order of first appearance.
akrun_split = with(my_data, split(list_values,
factor(list_names, levels = unique(list_names)))),
# Base R: unstack() with a values ~ groups formula.
akrun_unstack = unstack(my_data, list_values ~ list_names),
# tidyverse: group_modify() + deframe().
andrew_deframe1 = my_data |>
group_by(list_names) |>
group_modify(\(x, ...) tibble(res = list(deframe(x)))) |>
deframe(),
# tidyverse: summarise() into a list-column + deframe().
andrew_deframe2 = my_data %>%
group_by(list_names) %>%
summarise(named_vec = list(list_values)) %>%
deframe(),
# tidyverse: summarise() into a list-column + set_names().
paulsmith = my_data %>%
group_by(list_names) %>%
summarise(list_values = list(list_values)) %>%
{set_names(.$list_values, .$list_names)},
times=1000L
)
更新 II:将 tibble 输出更改为向量输出。感谢 Jeff Parker!
更新:根据 Jeff Parker 的评论(请参阅评论),我更新了代码。问题在于之前设置名称时没有排序;使用 sort 之后就可以正确地使用 setNames。然后我添加了 map,应用 dplyr 的 select 删除每个数据框中的第一列:
library(dplyr)
library(purrr)
library(stringr) # str_split() comes from stringr; dplyr/purrr do not attach it

my_data %>%
  group_by(list_names) %>%
  # Collapse each group's values into one string and keep a single row per
  # group. NOTE: the later split assumes no value itself contains ", ".
  mutate(list_values = paste(list_values, collapse = ", ")) %>%
  slice(1) %>%
  group_split() %>%
  # Relies on group_split() returning the groups in the same order as sort().
  setNames(sort(unique(my_data$list_names))) %>%
  # Extract the value column by name rather than via a negative position.
  map(~ pull(.x, list_values)) %>%
  map(~ str_split(.x, ", ")[[1]])
$Chevy
[1] "Equinox"
$Dodge
[1] "Caravan" "Ram"
$Ford
[1] "Ranger" "F150" "Explorer"
另一个可能的解决方案:
library(tidyverse)

# Example input: one row per (name, value) pair.
my_data <- tibble(
  list_names = c("Ford", "Chevy", "Ford", "Dodge", "Dodge", "Ford"),
  list_values = c("Ranger", "Equinox", "F150", "Caravan", "Ram", "Explorer")
)

# Gather each group's values into a list-column, then name that list by the
# group key.
my_data %>%
  group_by(list_names) %>%
  summarise(list_values = list(list_values)) %>%
  with(set_names(list_values, list_names))
#> $Chevy
#> [1] "Equinox"
#>
#> $Dodge
#> [1] "Caravan" "Ram"
#>
#> $Ford
#> [1] "Ranger" "F150" "Explorer"
只是为了好玩,用我的 for 循环和 TarJae 的回答添加了基准。
# Time every answer, including the original for-loop and TarJae's approach;
# each expression is evaluated 100 times (times = 100L). The argument names
# become the labels in `bm`.
bm <- microbenchmark::microbenchmark(
# Base R: split() with factor levels in order of first appearance.
akrun_split = with(my_data, split(list_values,
factor(list_names, levels = unique(list_names)))),
# Base R: unstack() with a values ~ groups formula.
akrun_unstack = unstack(my_data, list_values ~ list_names),
# tidyverse: group_modify() + deframe().
andrew_deframe1 = my_data |>
group_by(list_names) |>
group_modify(\(x, ...) tibble(res = list(deframe(x)))) |>
deframe(),
# tidyverse: summarise() into a list-column + deframe().
andrew_deframe2 = my_data %>%
group_by(list_names) %>%
summarise(named_vec = list(list_values)) %>%
deframe(),
# tidyverse: summarise() into a list-column + set_names().
paulsmith = my_data %>%
group_by(list_names) %>%
summarise(list_values = list(list_values)) %>%
{set_names(.$list_values, .$list_names)},
# The question's for-loop: filters the whole table once per row.
jeffs = {
desired_output <- list()
for(i in seq_along(my_data$list_names)) {
entry <- my_data %>%
filter(list_names == my_data$list_names[i]) %>%
pull(list_values)
desired_output[[my_data$list_names[i]]] <- entry
}
desired_output},
# Paste each group's values together, split the groups, then split the strings.
TarJae = my_data %>%
group_by(list_names) %>%
mutate(list_values= paste(list_values, collapse = ", ")) %>%
slice(1) %>%
group_split() %>%
setNames(sort(unique(my_data$list_names))) %>%
map(., dplyr::pull, -list_names) %>%
map(., ~str_split(.x, ", ")[[1]] ),
times=100L
)
我还使用更大的数据集,对两个最快的选项(来自 akrun)运行了基准测试。
library(nycflights13)

# Larger benchmark input: carrier becomes the grouping column and flight the
# value column.
my_data <- select(
  nycflights13::flights,
  list_names = carrier,
  list_values = flight
)
另一种方法:
library(tidyverse)

my_data %>%
  group_split(list_names) %>%
  # One single-element named list per group: the name is the group's key and
  # the element holds every value in the group. The values are not passed
  # through unique(), so repeated entries are kept.
  map(~ lst(!!unique(pull(.x, list_names)) := pull(.x, list_values))) %>%
  # Merge the per-group single-element lists into one named list.
  flatten()
#> $Chevy
#> [1] "Equinox"
#>
#> $Dodge
#> [1] "Caravan" "Ram"
#>
#> $Ford
#> [1] "Ranger" "F150" "Explorer"
由 reprex 包 (v2.0.1) 创建于 2022-01-14
数据:
# Example input: one row per (name, value) pair.
my_data <- tibble(list_names = c("Ford", "Chevy", "Ford", "Dodge", "Dodge", "Ford"),
                  list_values = c("Ranger", "Equinox", "F150", "Caravan", "Ram", "Explorer"))