获取parent的所有后代的快速方法

Fast method of getting all the descendants of a parent

parent-child 关系数据框如下:

  parent_id child_id
1         1        2
2         2        3
3         3        4

目标是实现以下目标,即先前数据框的扩展版本,其中所有后代(children、grandchildren 等)都分配给每个 parent (包括 parent/child 本身):

   parent_id child_id
1          1        1
2          1        2
3          1        3
4          1        4
5          2        2
6          2        3
7          2        4
8          3        3
9          3        4
10         4        4

我的问题:在 R 中实现该目标的最快方法(或其中之一)是什么?

我已经尝试过各种方法 - 从 for 循环、SQL 递归到使用 igraph(如 here 所述)。它们都比较慢,其中一些在处理大量组合时也容易崩溃。

下面是 sqldfigraph 的示例,基准数据帧比上面的稍大。

library(sqldf)
library(purrr)
library(dplyr)
library(igraph)

df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L

# SQL recursion

sqlQuery <- 'with recursive
             dfDescendants (parent_id, child_id)
             as
             (select parent_id, child_id from df
             union all
             select d.parent_id, s.child_id from dfDescendants d
             join df s
             on d.child_id = s.parent_id)
             select distinct parent_id, parent_id as child_id from dfDescendants
             union
             select distinct child_id as parent_id, child_id from dfDescendants
             union
             select * from dfDescendants;'

sqldf(sqlQuery)

# igraph with purrr

df_g = graph_from_data_frame(df, directed = TRUE)

map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>% 
  map_df(~ data.frame(child_id = .x), .id = "parent_id")

基准(不包括 sqldf 中的查询创建和 igraph 中的图形转换):

set.seed(23423)

microbenchmark::microbenchmark(
  sqldf = sqldf(sqlQuery),
  tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>% 
    map_df(~ data.frame(child_id = .x), .id = "parent_id"),
  times = 5
)

#    Unit: seconds
#           expr      min       lq     mean   median       uq      max neval
#          sqldf 7.815179 8.002836 8.113392 8.084038 8.315207 8.349701     5
#     tidyigraph 5.784239 5.806539 5.883241 5.889171 5.964906 5.971350     5

我们可以像下面这样使用ego

library(igraph)
df <- data.frame(parent_id = 1:3, child_id = 2:4)
g <- graph_from_data_frame(df)

setNames(
  rev(
    stack(
      Map(
        names,
        setNames(
          ego(g,
            order = vcount(g),
            mode = "out"
          ),
          names(V(g))
        )
      )
    )
  ),
  names(df)
)

这给出了

   parent_id child_id
1          1        1
2          1        2
3          1        3
4          1        4
5          2        2
6          2        3
7          2        4
8          3        3
9          3        4
10         4        4

基准测试

set.seed(23423)

microbenchmark::microbenchmark(
  sqldf = sqldf(sqlQuery),
  tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
    map_df(~ data.frame(child_id = .x), .id = "parent_id"),
  ego = setNames(
    rev(
      stack(
        Map(
          names,
          setNames(
            ego(df_g,
              order = vcount(df_g),
              mode = "out"
            ),
            names(V(df_g))
          )
        )
      )
    ),
    names(df)
  ),
  times = 5
)

显示

Unit: milliseconds
       expr       min       lq      mean    median         uq        max neval
      sqldf 7156.2753 9072.155 9402.6904 9518.2796 10206.3683 11060.3738     5
 tidyigraph 2483.9943 2623.558 3136.7490 2689.8388  2879.5688  5006.7853     5
        ego  182.5941  219.151  307.2481  253.2171   325.8721   555.4064     5

为了可读性使用管道的代码:

g |>
  ego(order = vcount(g), mode = "out") |>
  setNames(names(V(g))) |>
  Map(f = names) |>
  stack() |>
  rev() |>
  setNames(names(df))

igraph 当然是回答图形问题的好方法(另请参阅 Bioconductor 的 graph + RBGL 包),但我认为这在 R 中有一个迭代解决方案。

seems like 一个合理的方法是执行 depth-first 图遍历(我期待一个更高级的解决方案)。这实际上很容易在 R 中有效地实现。假设向量 pidcid 描述图中父节点和子节点之间的链接(如问题中的 data.frame )。将每个节点表示为正整数。

all_nodes <- unique(c(parent_id, child_id)  # all nodes
uid <- match(all_nodes, all_nodes)
pid <- match(parent_id, all_nodes)
cid <- match(child_id, all_nodes)

并形成从每个节点到其所有子节点的边列表。

edge_list <- unname(split(cid, factor(pid, levels = uid)))
edge_lengths <- lengths(edge_list)

当前子节点的子节点数为edge_list[cid],每个原始父节点关联的2级子节点数为rep(pid, edge_lengths[cid])。所以从任何节点到任何其他可达节点的路径通过简单的迭代遍历

while (length(pid)) {
    pid <- rep(pid, edge_lengths[cid])
    cid <- unlist(edge_list[cid])
}

@jblood94 指出遍历必须跟踪已经访问过的边。我们可以通过创建访问边的逻辑向量来有效地(及时,而不是 space!)实现这一点。我们使用 'factory' 模式,我们在其中创建一个保留状态(访问节点的逻辑向量)的函数。该向量由边 pid * n + cid 的唯一 ID (key) 索引。我们对不重复且尚未访问过的键感兴趣。

visitor <- function(uid, n_max = 3000) {
    n <- length(uid)
    if (n <= n_max) {
        ## over-allocate, to support `key = pid * n + cid`
        visited <- logical((n + 1L) * n) # FALSE on construction
    } else {
        stop("length(uid) greater than n_max = ", n_max)
    }
    function(pid, cid) {
        key <- pid * n + cid
        to_visit <- !(duplicated(key) | visited[key])
        visited[key[to_visit]] <<- TRUE  # update nodes that we will now visit
        to_visit
    }
}

因此

> visit = visitor(1:10)
> visit(1:3, 2:4)
[1] TRUE TRUE TRUE
> visit(2:4, 3:5)
[1] FALSE FALSE  TRUE

这是整个解决方案的更完整的实现,另外还有 book-keeping

visitor <- function(uid, n_max = 3000) {
    n <- length(uid)
    if (n <= n_max) {
        ## over-allocate, to support `key = pid * n + cid`
        visited <- logical((n + 1L) * n) # FALSE on construction
    } else {
        stop("length(uid) greater than n_max = ", n_max)
    }
    function(pid, cid) {
        key <- pid * n + cid
        to_visit <- !(duplicated(key) | visited[key])
        visited[key[to_visit]] <<- TRUE
        to_visit
    }
}

ancestor_descendant <- function(df) {
    ## encode parent and child to unique integer values
    ids <- unique(c(df$parent_id, df$child_id))
    uid <- match(ids, ids)
    pid <- match(df$parent_id, ids)
    cid <- match(df$child_id, ids)
    n <- length(uid)

    ## edge list of parent-offspring relationships, based on unique
    ## integer values; list is ordered by id, all ids are present, ids
    ## without children have zero-length elements. Use `unname()` so
    ## that edge_list is always indexed by integer
    edge_list <- unname(split(cid, factor(pid, levels = uid), drop = FALSE))
    edge_lengths <- lengths(edge_list)

    visit <- visitor(uid)
    keep <- visit(uid, uid) # all TRUE
    aid = did = list(uid) # results -- all uid's are there own ancestor / descendant
    i = 1L
   
    while (length(pid)) {
        ## only add new edges
        keep <- visit(pid, cid)
        ## record current generation ancestors and descendants
        pid <- pid[keep]
        cid <- cid[keep]
        i <- i + 1L
        aid[[i]] <- pid
        did[[i]] <- cid

        ## calculate next generation pid and cid.
        pid <- rep(pid, edge_lengths[cid])
        cid <- unlist(edge_list[cid])
    }
    ## decode results to original ids and clean up return value
    df <- data.frame(
        ancestor_id = ids[unlist(aid)],
        descendant_id = ids[unlist(did)]
    )
    df <- df[order(df$ancestor_id, df$descendant_id),]
    rownames(df) <- NULL
    df
}

这似乎是正确且高效的,至少从表面上看是这样

## Original example
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L
df = df[sample(nrow(df)),]
system.time(result <- ancestor_descendant(df))
##  user  system elapsed 
## 0.243   0.001   0.245 
dim(result)
## [1] 501501      2

## updated example from comments
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L
df <- rbind(df, data.frame(parent_id = 1000L, child_id = 1002L))
system.time(result <- ancestor_descendant(df))
##  user  system elapsed 
## 0.195   0.001   0.195 
dim(result)
## [1] 502502      2

## problematic case from @jblood94
df <- data.frame(
    parent_id=c(1, 1, 2),
    child_id = c(2, 3, 3)
)
ancestor_descendant(df)
##   ancestor_id descendant_id
## 1           1             1
## 2           1             2
## 3           1             3
## 4           2             2
## 5           2             3
## 6           3             3

## previously failed without filtering re-visited nodes
df <- data.frame(
    parent_id = rep(1:100, each = 2),
    child_id = c(2, rep(3:101, each = 2), 102)
)
system.time(result <- ancestor_descendant(df))
##  user  system elapsed 
## 0.005   0.000   0.006 
dim(result)
## [1] 5252    2