使用 multidplyr 进行矢量化不会呈现正确的输出
Vectorizing with multidplyr does not render the correct output
我尝试并行化 ape::dist.topo(),这是一个计算无根树之间拓扑距离的函数。
通常函数是这样工作的(reprex:4 棵随机树,每棵树有 5 个叶子):
library(tidyverse)
# devtools::install_github("hadley/multidplyr")
library(multidplyr)
library(ape)
set.seed(3)
# Build 4 random trees with 5 tips each, combine them into a single
# multiPhylo object and unroot them.
# The generics c() and unroot() are called instead of the methods
# c.phylo() / unroot.multiPhylo(): S3 dispatch selects the same
# implementations, and the code no longer depends on the methods being
# exported by ape.
trees <-
  map(rep(5, 4), rtree) %>%
  do.call(c, .) %>% # list of phylo objects -> multiPhylo object
  unroot()
# Reference result: pairwise topological distances between the 4 trees.
dist.topo(trees)
# tree1 tree2 tree3
# tree2 4
# tree3 4 2
# tree4 4 4 2
我创建了一个函数,在 data.frame 中两两(成对)计算树之间的距离(以便之后按行拆分到集群的各个节点上):
# Pairwise topological distances, computed row by row in a data frame.
#
# multiphylo: a multiPhylo object (a list of trees).
# Returns a "dist" object with the distance between every pair of trees.
dist.topo2 <- function(multiphylo) {
  # The number of trees fixes the matrix size; no need to recover it from
  # sqrt(length(.)) afterwards.
  n_trees <- length(multiphylo)
  expand.grid(multiphylo, multiphylo) %>%
    as_tibble() %>% # as.tibble() is deprecated in tibble
    # dist.topo() on two trees yields a single number, so collect the results
    # as a double vector rather than a list column.
    mutate(dist = map2_dbl(Var1, Var2, dist.topo)) %>%
    pull(dist) %>%
    matrix(nrow = n_trees, ncol = n_trees) %>%
    as.dist()
}
dist.topo2(trees)
# 1 2 3
# 2 4
# 3 4 2
# 4 4 4 2
不出所料,结果是一样的(不管名称如何)。
然后我在管道中添加了 multidplyr::partition()
和 multidplyr::collect()
函数:
# Parallel variant of the pairwise-distance function, using multidplyr's
# partition() / collect() around the mutate() step.
#
# multiphylo: a multiPhylo object (a list of trees).
# Returns a "dist" object built from the collected distances.
#
# NOTE: this is the version that yields the wrong matrix shown below. The rows
# are spread over the workers by partition() and nothing here restores the
# original row order after collect(), so the distances fill the matrix in the
# wrong positions.
dist.topo3 <- function(multiphylo){
expand.grid(multiphylo, multiphylo) %>%
as.tibble() %>%
# Split the rows across the cluster workers (no grouping key is given).
partition() %>%
# Functions are namespace-qualified here (purrr::, ape::).
mutate(dist = purrr::map2(Var1, Var2, ape::dist.topo)) %>%
# Bring the shards back into one data frame.
collect() %>%
pull(dist) %>%
# Assumes the collected rows are still in expand.grid() order.
matrix(., nrow = sqrt(length(.))) %>%
as.dist()
}
dist.topo3(trees)
# 1 2 3
# 2 4
# 3 0 4
# 4 2 4 4
# Warning messages:
# 1: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 2: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 3: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 4: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 5: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 6: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
如您所见,距离不同,但操作没有改变。
我该如何解决?也许不可能(See here)
谢谢
注意:我知道这个解决方案可能不是最优的(特别是因为它计算每个距离两次)但这不是重点。
我宁愿使用包 {furrr} 而不是 {multidplyr}:
library(furrr)
# The `multiprocess` strategy is deprecated in the future package; use
# `multisession` (background R sessions) instead.
plan(multisession)
# Pairwise topological distances computed in parallel with furrr.
#
# multiphylo: a multiPhylo object (a list of trees).
# Returns a "dist" object with the distance between every pair of trees.
dist.topo4 <- function(multiphylo) {
  # Every ordered pair of trees; the columns are named x and y so that
  # future_pmap_dbl() passes them to ape::dist.topo() by argument name.
  tree_pairs <- setNames(expand.grid(multiphylo, multiphylo), c("x", "y"))
  pair_dists <- future_pmap_dbl(tree_pairs, ape::dist.topo)
  # Reshape the flat vector into a square matrix, one row/column per tree.
  size <- length(multiphylo)
  as.dist(matrix(pair_dists, nrow = size, ncol = size))
}
结果
> dist.topo4(trees)
1 2 3
2 4
3 4 2
4 4 4 2
问题在于 partition 会把 data.frame 的行随机分配到各个分片上,
而 collect 在合并这些分片时并不会恢复原来的行顺序。
如果先把行号添加为一列,并在 collect 之后按该列排序(arrange),就可以解决问题:
# Parallel pairwise topological distances with multidplyr, keeping row order.
#
# multiphylo: a multiPhylo object (a list of trees).
# Returns a "dist" object with the distance between every pair of trees.
dist.topo3 <- function(multiphylo) {
  # The number of trees fixes the matrix size; no need to recover it from
  # sqrt(length(.)) afterwards.
  n_trees <- length(multiphylo)
  expand.grid(multiphylo, multiphylo) %>%
    as_tibble() %>% # as.tibble() is deprecated in tibble
    # Remember the original position of every pair before the rows are
    # distributed over the workers.
    mutate(rn = row_number()) %>%
    partition(rn) %>%
    # dist.topo() on two trees yields a single number, so collect the results
    # as a double vector rather than a list column.
    mutate(dist = purrr::map2_dbl(Var1, Var2, ape::dist.topo)) %>%
    collect() %>%
    # Restore the expand.grid() order so the matrix is filled correctly.
    arrange(rn) %>%
    pull(dist) %>%
    matrix(nrow = n_trees, ncol = n_trees) %>%
    as.dist()
}
dist.topo3(trees)
# 1 2 3
# 2 4
# 3 4 2
# 4 4 4 2
我尝试并行化 ape::dist.topo(),这是一个计算无根树之间拓扑距离的函数。
通常函数是这样工作的(reprex:4 棵随机树,每棵树有 5 个叶子):
library(tidyverse)
# devtools::install_github("hadley/multidplyr")
library(multidplyr)
library(ape)
set.seed(3)
# Build 4 random trees with 5 tips each, combine them into a single
# multiPhylo object and unroot them.
# The generics c() and unroot() are called instead of the methods
# c.phylo() / unroot.multiPhylo(): S3 dispatch selects the same
# implementations, and the code no longer depends on the methods being
# exported by ape.
trees <-
  map(rep(5, 4), rtree) %>%
  do.call(c, .) %>% # list of phylo objects -> multiPhylo object
  unroot()
# Reference result: pairwise topological distances between the 4 trees.
dist.topo(trees)
# tree1 tree2 tree3
# tree2 4
# tree3 4 2
# tree4 4 4 2
我创建了一个函数,在 data.frame 中两两(成对)计算树之间的距离(以便之后按行拆分到集群的各个节点上):
# Pairwise topological distances, computed row by row in a data frame.
#
# multiphylo: a multiPhylo object (a list of trees).
# Returns a "dist" object with the distance between every pair of trees.
dist.topo2 <- function(multiphylo) {
  # The number of trees fixes the matrix size; no need to recover it from
  # sqrt(length(.)) afterwards.
  n_trees <- length(multiphylo)
  expand.grid(multiphylo, multiphylo) %>%
    as_tibble() %>% # as.tibble() is deprecated in tibble
    # dist.topo() on two trees yields a single number, so collect the results
    # as a double vector rather than a list column.
    mutate(dist = map2_dbl(Var1, Var2, dist.topo)) %>%
    pull(dist) %>%
    matrix(nrow = n_trees, ncol = n_trees) %>%
    as.dist()
}
dist.topo2(trees)
# 1 2 3
# 2 4
# 3 4 2
# 4 4 4 2
不出所料,结果是一样的(不管名称如何)。
然后我在管道中添加了 multidplyr::partition()
和 multidplyr::collect()
函数:
# Parallel variant of the pairwise-distance function, using multidplyr's
# partition() / collect() around the mutate() step.
#
# multiphylo: a multiPhylo object (a list of trees).
# Returns a "dist" object built from the collected distances.
#
# NOTE: this is the version that yields the wrong matrix shown below. The rows
# are spread over the workers by partition() and nothing here restores the
# original row order after collect(), so the distances fill the matrix in the
# wrong positions.
dist.topo3 <- function(multiphylo){
expand.grid(multiphylo, multiphylo) %>%
as.tibble() %>%
# Split the rows across the cluster workers (no grouping key is given).
partition() %>%
# Functions are namespace-qualified here (purrr::, ape::).
mutate(dist = purrr::map2(Var1, Var2, ape::dist.topo)) %>%
# Bring the shards back into one data frame.
collect() %>%
pull(dist) %>%
# Assumes the collected rows are still in expand.grid() order.
matrix(., nrow = sqrt(length(.))) %>%
as.dist()
}
dist.topo3(trees)
# 1 2 3
# 2 4
# 3 0 4
# 4 2 4 4
# Warning messages:
# 1: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 2: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 3: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 4: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 5: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
# 6: In bind_rows_(x, .id) :
# Vectorizing 'multiPhylo' elements may not preserve their attributes
如您所见,距离不同,但操作没有改变。
我该如何解决?也许不可能(See here)
谢谢
注意:我知道这个解决方案可能不是最优的(特别是因为它计算每个距离两次)但这不是重点。
我宁愿使用包 {furrr} 而不是 {multidplyr}:
library(furrr)
# The `multiprocess` strategy is deprecated in the future package; use
# `multisession` (background R sessions) instead.
plan(multisession)
# Pairwise topological distances computed in parallel with furrr.
#
# multiphylo: a multiPhylo object (a list of trees).
# Returns a "dist" object with the distance between every pair of trees.
dist.topo4 <- function(multiphylo) {
  # Every ordered pair of trees; the columns are named x and y so that
  # future_pmap_dbl() passes them to ape::dist.topo() by argument name.
  tree_pairs <- setNames(expand.grid(multiphylo, multiphylo), c("x", "y"))
  pair_dists <- future_pmap_dbl(tree_pairs, ape::dist.topo)
  # Reshape the flat vector into a square matrix, one row/column per tree.
  size <- length(multiphylo)
  as.dist(matrix(pair_dists, nrow = size, ncol = size))
}
结果
> dist.topo4(trees)
1 2 3
2 4
3 4 2
4 4 4 2
问题在于 partition 会把 data.frame 的行随机分配到各个分片上,
而 collect 在合并这些分片时并不会恢复原来的行顺序。
如果先把行号添加为一列,并在 collect 之后按该列排序(arrange),就可以解决问题:
# Parallel pairwise topological distances with multidplyr, keeping row order.
#
# multiphylo: a multiPhylo object (a list of trees).
# Returns a "dist" object with the distance between every pair of trees.
dist.topo3 <- function(multiphylo) {
  # The number of trees fixes the matrix size; no need to recover it from
  # sqrt(length(.)) afterwards.
  n_trees <- length(multiphylo)
  expand.grid(multiphylo, multiphylo) %>%
    as_tibble() %>% # as.tibble() is deprecated in tibble
    # Remember the original position of every pair before the rows are
    # distributed over the workers.
    mutate(rn = row_number()) %>%
    partition(rn) %>%
    # dist.topo() on two trees yields a single number, so collect the results
    # as a double vector rather than a list column.
    mutate(dist = purrr::map2_dbl(Var1, Var2, ape::dist.topo)) %>%
    collect() %>%
    # Restore the expand.grid() order so the matrix is filled correctly.
    arrange(rn) %>%
    pull(dist) %>%
    matrix(nrow = n_trees, ncol = n_trees) %>%
    as.dist()
}
dist.topo3(trees)
# 1 2 3
# 2 4
# 3 4 2
# 4 4 4 2