获取parent的所有后代的快速方法
Fast method of getting all the descendants of a parent
parent-child
关系数据框如下:
parent_id child_id
1 1 2
2 2 3
3 3 4
目标是实现以下目标,即先前数据框的扩展版本,其中所有后代(children、grandchildren 等)都分配给每个 parent (包括 parent/child 本身):
parent_id child_id
1 1 1
2 1 2
3 1 3
4 1 4
5 2 2
6 2 3
7 2 4
8 3 3
9 3 4
10 4 4
我的问题:在 R 中实现该目标的最快方法(或其中之一)是什么?
我已经尝试过各种方法 - 从 for 循环、SQL 递归到使用 igraph
(如 here 所述)。它们都比较慢,其中一些在处理大量组合时也容易崩溃。
下面是 sqldf
和 igraph
的示例,基准数据帧比上面的稍大。
library(sqldf)
library(purrr)
library(dplyr)
library(igraph)
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L
# SQL recursion
sqlQuery <- 'with recursive
dfDescendants (parent_id, child_id)
as
(select parent_id, child_id from df
union all
select d.parent_id, s.child_id from dfDescendants d
join df s
on d.child_id = s.parent_id)
select distinct parent_id, parent_id as child_id from dfDescendants
union
select distinct child_id as parent_id, child_id from dfDescendants
union
select * from dfDescendants;'
sqldf(sqlQuery)
# igraph with purrr
df_g = graph_from_data_frame(df, directed = TRUE)
map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id")
基准(不包括 sqldf
中的查询创建和 igraph
中的图形转换):
set.seed(23423)
microbenchmark::microbenchmark(
sqldf = sqldf(sqlQuery),
tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id"),
times = 5
)
# Unit: seconds
# expr min lq mean median uq max neval
# sqldf 7.815179 8.002836 8.113392 8.084038 8.315207 8.349701 5
# tidyigraph 5.784239 5.806539 5.883241 5.889171 5.964906 5.971350 5
我们可以像下面这样使用ego
library(igraph)
df <- data.frame(parent_id = 1:3, child_id = 2:4)
g <- graph_from_data_frame(df)
setNames(
rev(
stack(
Map(
names,
setNames(
ego(g,
order = vcount(g),
mode = "out"
),
names(V(g))
)
)
)
),
names(df)
)
这给出了
parent_id child_id
1 1 1
2 1 2
3 1 3
4 1 4
5 2 2
6 2 3
7 2 4
8 3 3
9 3 4
10 4 4
基准测试
set.seed(23423)
microbenchmark::microbenchmark(
sqldf = sqldf(sqlQuery),
tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id"),
ego = setNames(
rev(
stack(
Map(
names,
setNames(
ego(df_g,
order = vcount(df_g),
mode = "out"
),
names(V(df_g))
)
)
)
),
names(df)
),
times = 5
)
显示
Unit: milliseconds
expr min lq mean median uq max neval
sqldf 7156.2753 9072.155 9402.6904 9518.2796 10206.3683 11060.3738 5
tidyigraph 2483.9943 2623.558 3136.7490 2689.8388 2879.5688 5006.7853 5
ego 182.5941 219.151 307.2481 253.2171 325.8721 555.4064 5
为了可读性使用管道的代码:
g |>
ego(order = vcount(g), mode = "out") |>
setNames(names(V(g))) |>
Map(f = names) |>
stack() |>
rev() |>
setNames(names(df))
igraph 当然是回答图形问题的好方法(另请参阅 Bioconductor 的 graph + RBGL 包),但我认为这在 R 中有一个迭代解决方案。
seems like 一个合理的方法是执行 depth-first 图遍历(我期待一个更高级的解决方案)。这实际上很容易在 R 中有效地实现。假设向量 pid
和 cid
描述图中父节点和子节点之间的链接(如问题中的 data.frame )。将每个节点表示为正整数。
all_nodes <- unique(c(parent_id, child_id) # all nodes
uid <- match(all_nodes, all_nodes)
pid <- match(parent_id, all_nodes)
cid <- match(child_id, all_nodes)
并形成从每个节点到其所有子节点的边列表。
edge_list <- unname(split(cid, factor(pid, levels = uid)))
edge_lengths <- lengths(edge_list)
当前子节点的子节点数为edge_list[cid]
,每个原始父节点关联的2级子节点数为rep(pid, edge_lengths[cid])
。所以从任何节点到任何其他可达节点的路径通过简单的迭代遍历
while (length(pid)) {
pid <- rep(pid, edge_lengths[cid])
cid <- unlist(edge_list[cid])
}
@jblood94 指出遍历必须跟踪已经访问过的边。我们可以通过创建访问边的逻辑向量来有效地(及时,而不是 space!)实现这一点。我们使用 'factory' 模式,我们在其中创建一个保留状态(访问节点的逻辑向量)的函数。该向量由边 pid * n + cid
的唯一 ID (key
) 索引。我们对不重复且尚未访问过的键感兴趣。
visitor <- function(uid, n_max = 3000) {
n <- length(uid)
if (n <= n_max) {
## over-allocate, to support `key = pid * n + cid`
visited <- logical((n + 1L) * n) # FALSE on construction
} else {
stop("length(uid) greater than n_max = ", n_max)
}
function(pid, cid) {
key <- pid * n + cid
to_visit <- !(duplicated(key) | visited[key])
visited[key[to_visit]] <<- TRUE # update nodes that we will now visit
to_visit
}
}
因此
> visit = visitor(1:10)
> visit(1:3, 2:4)
[1] TRUE TRUE TRUE
> visit(2:4, 3:5)
[1] FALSE FALSE TRUE
这是整个解决方案的更完整的实现,另外还有 book-keeping
visitor <- function(uid, n_max = 3000) {
n <- length(uid)
if (n <= n_max) {
## over-allocate, to support `key = pid * n + cid`
visited <- logical((n + 1L) * n) # FALSE on construction
} else {
stop("length(uid) greater than n_max = ", n_max)
}
function(pid, cid) {
key <- pid * n + cid
to_visit <- !(duplicated(key) | visited[key])
visited[key[to_visit]] <<- TRUE
to_visit
}
}
ancestor_descendant <- function(df) {
## encode parent and child to unique integer values
ids <- unique(c(df$parent_id, df$child_id))
uid <- match(ids, ids)
pid <- match(df$parent_id, ids)
cid <- match(df$child_id, ids)
n <- length(uid)
## edge list of parent-offspring relationships, based on unique
## integer values; list is ordered by id, all ids are present, ids
## without children have zero-length elements. Use `unname()` so
## that edge_list is always indexed by integer
edge_list <- unname(split(cid, factor(pid, levels = uid), drop = FALSE))
edge_lengths <- lengths(edge_list)
visit <- visitor(uid)
keep <- visit(uid, uid) # all TRUE
aid = did = list(uid) # results -- all uid's are there own ancestor / descendant
i = 1L
while (length(pid)) {
## only add new edges
keep <- visit(pid, cid)
## record current generation ancestors and descendants
pid <- pid[keep]
cid <- cid[keep]
i <- i + 1L
aid[[i]] <- pid
did[[i]] <- cid
## calculate next generation pid and cid.
pid <- rep(pid, edge_lengths[cid])
cid <- unlist(edge_list[cid])
}
## decode results to original ids and clean up return value
df <- data.frame(
ancestor_id = ids[unlist(aid)],
descendant_id = ids[unlist(did)]
)
df <- df[order(df$ancestor_id, df$descendant_id),]
rownames(df) <- NULL
df
}
这似乎是正确且高效的,至少从表面上看是这样
## Original example
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L
df = df[sample(nrow(df)),]
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.243 0.001 0.245
dim(result)
## [1] 501501 2
## updated example from comments
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L
df <- rbind(df, data.frame(parent_id = 1000L, child_id = 1002L))
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.195 0.001 0.195
dim(result)
## [1] 502502 2
## problematic case from @jblood94
df <- data.frame(
parent_id=c(1, 1, 2),
child_id = c(2, 3, 3)
)
ancestor_descendant(df)
## ancestor_id descendant_id
## 1 1 1
## 2 1 2
## 3 1 3
## 4 2 2
## 5 2 3
## 6 3 3
## previously failed without filtering re-visited nodes
df <- data.frame(
parent_id = rep(1:100, each = 2),
child_id = c(2, rep(3:101, each = 2), 102)
)
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.005 0.000 0.006
dim(result)
## [1] 5252 2
parent-child
关系数据框如下:
parent_id child_id
1 1 2
2 2 3
3 3 4
目标是实现以下目标,即先前数据框的扩展版本,其中所有后代(children、grandchildren 等)都分配给每个 parent (包括 parent/child 本身):
parent_id child_id
1 1 1
2 1 2
3 1 3
4 1 4
5 2 2
6 2 3
7 2 4
8 3 3
9 3 4
10 4 4
我的问题:在 R 中实现该目标的最快方法(或其中之一)是什么?
我已经尝试过各种方法 - 从 for 循环、SQL 递归到使用 igraph
(如 here 所述)。它们都比较慢,其中一些在处理大量组合时也容易崩溃。
下面是 sqldf
和 igraph
的示例,基准数据帧比上面的稍大。
library(sqldf)
library(purrr)
library(dplyr)
library(igraph)
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L
# SQL recursion
sqlQuery <- 'with recursive
dfDescendants (parent_id, child_id)
as
(select parent_id, child_id from df
union all
select d.parent_id, s.child_id from dfDescendants d
join df s
on d.child_id = s.parent_id)
select distinct parent_id, parent_id as child_id from dfDescendants
union
select distinct child_id as parent_id, child_id from dfDescendants
union
select * from dfDescendants;'
sqldf(sqlQuery)
# igraph with purrr
df_g = graph_from_data_frame(df, directed = TRUE)
map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id")
基准(不包括 sqldf
中的查询创建和 igraph
中的图形转换):
set.seed(23423)
microbenchmark::microbenchmark(
sqldf = sqldf(sqlQuery),
tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id"),
times = 5
)
# Unit: seconds
# expr min lq mean median uq max neval
# sqldf 7.815179 8.002836 8.113392 8.084038 8.315207 8.349701 5
# tidyigraph 5.784239 5.806539 5.883241 5.889171 5.964906 5.971350 5
我们可以像下面这样使用ego
library(igraph)
df <- data.frame(parent_id = 1:3, child_id = 2:4)
g <- graph_from_data_frame(df)
setNames(
rev(
stack(
Map(
names,
setNames(
ego(g,
order = vcount(g),
mode = "out"
),
names(V(g))
)
)
)
),
names(df)
)
这给出了
parent_id child_id
1 1 1
2 1 2
3 1 3
4 1 4
5 2 2
6 2 3
7 2 4
8 3 3
9 3 4
10 4 4
基准测试
set.seed(23423)
microbenchmark::microbenchmark(
sqldf = sqldf(sqlQuery),
tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id"),
ego = setNames(
rev(
stack(
Map(
names,
setNames(
ego(df_g,
order = vcount(df_g),
mode = "out"
),
names(V(df_g))
)
)
)
),
names(df)
),
times = 5
)
显示
Unit: milliseconds
expr min lq mean median uq max neval
sqldf 7156.2753 9072.155 9402.6904 9518.2796 10206.3683 11060.3738 5
tidyigraph 2483.9943 2623.558 3136.7490 2689.8388 2879.5688 5006.7853 5
ego 182.5941 219.151 307.2481 253.2171 325.8721 555.4064 5
为了可读性使用管道的代码:
g |>
ego(order = vcount(g), mode = "out") |>
setNames(names(V(g))) |>
Map(f = names) |>
stack() |>
rev() |>
setNames(names(df))
igraph 当然是回答图形问题的好方法(另请参阅 Bioconductor 的 graph + RBGL 包),但我认为这在 R 中有一个迭代解决方案。
seems like 一个合理的方法是执行 depth-first 图遍历(我期待一个更高级的解决方案)。这实际上很容易在 R 中有效地实现。假设向量 pid
和 cid
描述图中父节点和子节点之间的链接(如问题中的 data.frame )。将每个节点表示为正整数。
all_nodes <- unique(c(parent_id, child_id) # all nodes
uid <- match(all_nodes, all_nodes)
pid <- match(parent_id, all_nodes)
cid <- match(child_id, all_nodes)
并形成从每个节点到其所有子节点的边列表。
edge_list <- unname(split(cid, factor(pid, levels = uid)))
edge_lengths <- lengths(edge_list)
当前子节点的子节点数为edge_list[cid]
,每个原始父节点关联的2级子节点数为rep(pid, edge_lengths[cid])
。所以从任何节点到任何其他可达节点的路径通过简单的迭代遍历
while (length(pid)) {
pid <- rep(pid, edge_lengths[cid])
cid <- unlist(edge_list[cid])
}
@jblood94 指出遍历必须跟踪已经访问过的边。我们可以通过创建访问边的逻辑向量来有效地(及时,而不是 space!)实现这一点。我们使用 'factory' 模式,我们在其中创建一个保留状态(访问节点的逻辑向量)的函数。该向量由边 pid * n + cid
的唯一 ID (key
) 索引。我们对不重复且尚未访问过的键感兴趣。
visitor <- function(uid, n_max = 3000) {
n <- length(uid)
if (n <= n_max) {
## over-allocate, to support `key = pid * n + cid`
visited <- logical((n + 1L) * n) # FALSE on construction
} else {
stop("length(uid) greater than n_max = ", n_max)
}
function(pid, cid) {
key <- pid * n + cid
to_visit <- !(duplicated(key) | visited[key])
visited[key[to_visit]] <<- TRUE # update nodes that we will now visit
to_visit
}
}
因此
> visit = visitor(1:10)
> visit(1:3, 2:4)
[1] TRUE TRUE TRUE
> visit(2:4, 3:5)
[1] FALSE FALSE TRUE
这是整个解决方案的更完整的实现,另外还有 book-keeping
visitor <- function(uid, n_max = 3000) {
n <- length(uid)
if (n <= n_max) {
## over-allocate, to support `key = pid * n + cid`
visited <- logical((n + 1L) * n) # FALSE on construction
} else {
stop("length(uid) greater than n_max = ", n_max)
}
function(pid, cid) {
key <- pid * n + cid
to_visit <- !(duplicated(key) | visited[key])
visited[key[to_visit]] <<- TRUE
to_visit
}
}
ancestor_descendant <- function(df) {
## encode parent and child to unique integer values
ids <- unique(c(df$parent_id, df$child_id))
uid <- match(ids, ids)
pid <- match(df$parent_id, ids)
cid <- match(df$child_id, ids)
n <- length(uid)
## edge list of parent-offspring relationships, based on unique
## integer values; list is ordered by id, all ids are present, ids
## without children have zero-length elements. Use `unname()` so
## that edge_list is always indexed by integer
edge_list <- unname(split(cid, factor(pid, levels = uid), drop = FALSE))
edge_lengths <- lengths(edge_list)
visit <- visitor(uid)
keep <- visit(uid, uid) # all TRUE
aid = did = list(uid) # results -- all uid's are there own ancestor / descendant
i = 1L
while (length(pid)) {
## only add new edges
keep <- visit(pid, cid)
## record current generation ancestors and descendants
pid <- pid[keep]
cid <- cid[keep]
i <- i + 1L
aid[[i]] <- pid
did[[i]] <- cid
## calculate next generation pid and cid.
pid <- rep(pid, edge_lengths[cid])
cid <- unlist(edge_list[cid])
}
## decode results to original ids and clean up return value
df <- data.frame(
ancestor_id = ids[unlist(aid)],
descendant_id = ids[unlist(did)]
)
df <- df[order(df$ancestor_id, df$descendant_id),]
rownames(df) <- NULL
df
}
这似乎是正确且高效的,至少从表面上看是这样
## Original example
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L
df = df[sample(nrow(df)),]
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.243 0.001 0.245
dim(result)
## [1] 501501 2
## updated example from comments
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id + 1L
df <- rbind(df, data.frame(parent_id = 1000L, child_id = 1002L))
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.195 0.001 0.195
dim(result)
## [1] 502502 2
## problematic case from @jblood94
df <- data.frame(
parent_id=c(1, 1, 2),
child_id = c(2, 3, 3)
)
ancestor_descendant(df)
## ancestor_id descendant_id
## 1 1 1
## 2 1 2
## 3 1 3
## 4 2 2
## 5 2 3
## 6 3 3
## previously failed without filtering re-visited nodes
df <- data.frame(
parent_id = rep(1:100, each = 2),
child_id = c(2, rep(3:101, each = 2), 102)
)
system.time(result <- ancestor_descendant(df))
## user system elapsed
## 0.005 0.000 0.006
dim(result)
## [1] 5252 2