在 R 中使用 igraph 进行引文网络中的主要路径分析
Main path analysis in citation network using igraph in R
有人熟悉在 R 中使用 igraph 实现主路径分析(Hummon 和 Doreian 1989)的方法吗?
这是 Hummon 和 Doreian 原文中的一个示例。它跟踪 40 篇关于 DNA 的期刊文章的引用情况。箭头在时间上向前移动(信息来自旧文章 'flows to' 新文章)。
dna_edges <- data.frame(from=c(1,2,3,3,3,5,6,9,12,12,15,15,10,11,11,13,14,14,14,16,16,17,19,19,19,19,19,20,20,20,20,24,24,21,21,23,22,26,27,29,30,31,31,32,32,32,33,33,35,35,36,36,36),
to=c(8,18,4,5,21,12,9,12,15,29,29,22,17,13,20,20,16,20,31,17,20,34,20,24,25,21,25,31,22,30,22,28,37,22,32,27,27,27,32,32,40,32,40,36,38,33,32,35,38,39,38,39,40))
dna_g <- graph_from_data_frame(dna_edges, directed=T)
plot(dna_g,
layout=layout_with_sugiyama(dna_g,
layers = V(dna_g)$name)$layout)
Liu et al (2019) 说明在引文网络中节点可以是以下三种情况之一:
- 资料来源:被引用但未引用任何人
- Sinks:引用他人但从未被引用
- 中级:引用和被引用
所以在这个例子中,我们有十篇文章 'sources' 和另外十篇 'sinks':
dna_sources <- V(dna_g)$name[which(degree(dna_g, mode="in")==0)] # sources
[1] "8" "18" "4" "34" "25" "28" "37" "40" "38" "39"
dna_sinks <- V(dna_g)$name[which(degree(dna_g, mode="out")==0)] # sinks
[1] "1" "2" "3" "6" "10" "11" "14" "19" "23" "26"
主路径是连接源和汇的最常用路径。搜索路径计数 (SPC) 是其中一种方法。
"A citation link’s SPC is the number of times the link is traversed if
one runs through all the possible citation chains from all the sources
to all the sinks in a citation network. To find SPC for a specific
link, one needs to enumerate all the possible citation chains that
emanate from all the sources and terminate at all the sinks" (Liu et al. 2019: 381)
所以似乎为了继续需要 (i) 选择一对 source-sink,(ii) 找到连接这两个节点的所有路径并在每条边上添加 +1 权重交叉,(iii) 重复其他 source-sink 对。
关于如何执行 (i) 到 (iii) 有什么想法吗?
更新
如果你想找出哪条路径给出的总和权重最高,你可以试试下面的代码
g <- graph_from_data_frame(edgelst)
asp <- unlist(
sapply(
names(dna_sources),
function(s) {
all_simple_paths(g, s, names(dna_sinks))
}
),
recursive = FALSE
)
path_weight <- sapply(
asp,
function(p) {
v <- names(p)
sum(merge(
edgelst,
data.frame(from = head(v, -1), to = tail(v, -1))
)$weight)
}
)
max_cum <- max(path_weight)
path_max <- asp[path_weight == max_cum]
你会看到
> max_cum
[1] 30
> path_max
$`61`
+ 7/33 vertices, named, from a74a1fe:
[1] 6 9 12 29 32 36 39
$`111`
+ 6/33 vertices, named, from a74a1fe:
[1] 11 20 31 32 36 39
这是一个使用all_shortest_paths
的暴力破解方法(如果你不需要所有路径都是最短的,你可以使用all_simple_paths
代替)
dna_g <- graph_from_data_frame(dna_edges, directed = T)
dna_sources <- V(dna_g)[degree(dna_g, mode = "in") == 0]
dna_sinks <- V(dna_g)[degree(dna_g, mode = "out") == 0]
edgelst <- aggregate(weight ~ ., cbind(
do.call(
rbind,
unlist(sapply(
dna_sources,
function(s) {
# asp <- all_simple_paths(dna_g, s, dna_sinks) ## if we apply `all_simple_paths`
asp <- all_shortest_paths(dna_g, s, dna_sinks)$res
lapply(asp, function(p) {
v <- names(p)
data.frame(from = head(v, -1), to = tail(v, -1))
})
}
), recursive = FALSE)
),
weight = 1
), sum)
dna_df <- merge(dna_edges, edgelst, all = TRUE)
g <- graph_from_data_frame(dna_df, directed = TRUE)
plot(g, edge.label = dna_df$weight)
哪里
> dna_df
from to weight
1 1 8 1
2 2 18 1
3 3 4 1
4 3 5 NA
5 3 21 3
6 5 12 NA
7 6 9 3
8 9 12 3
9 10 17 1
10 11 13 NA
11 11 20 4
12 12 15 NA
13 12 29 3
14 13 20 NA
15 14 16 1
16 14 20 NA
17 14 31 3
18 15 22 NA
19 15 29 NA
20 16 17 1
21 16 20 NA
22 17 34 2
23 19 20 2
24 19 21 2
25 19 24 2
26 19 25 2
27 19 25 2
28 20 22 NA
29 20 22 NA
30 20 30 2
31 20 31 4
32 21 22 NA
33 21 32 5
34 22 27 NA
35 23 27 3
36 24 28 1
37 24 37 1
38 26 27 3
39 27 32 6
40 29 32 3
41 30 40 2
42 31 32 4
43 31 40 3
44 32 33 NA
45 32 36 11
46 32 38 7
47 33 32 NA
48 33 35 NA
49 35 38 NA
50 35 39 NA
51 36 38 NA
52 36 39 7
53 36 40 4
你会得到如下图
SPC
以下函数实现了SPC。
spc <- function(g) {
linegraph <- make_line_graph(g)
source_edges <- V(linegraph)[degree(linegraph, mode = "in") == 0]
sink_edges <- V(linegraph)[degree(linegraph, mode = "out") == 0]
tabulate(
unlist(
lapply(
source_edges,
all_simple_paths,
graph = linegraph,
to = sink_edges,
mode = "out")))
}
主路径搜索
以下函数查找主路径。请注意,如果有多个主路径具有相同的总 SPC 值,则可能还有其他主路径。此函数 returns 它找到的第一条主路径。
main_search <- function(g) {
linegraph <- make_line_graph(g)
V(linegraph)$spc <- spc(g)
source_edges <- V(linegraph)[degree(linegraph, mode = "in") == 0]
sink_edges <- V(linegraph)[degree(linegraph, mode = "out") == 0]
paths <- unlist(
lapply(
source_edges,
all_simple_paths,
graph = linegraph,
to = sink_edges,
mode = "out"),
recursive = FALSE)
path_lengths <- unlist(lapply(paths, function (x) sum(x$spc)))
vertex_attr(linegraph, "main_path") <- 0
vertex_attr(
linegraph,
"main_path",
paths[[which(path_lengths == max(path_lengths))[[1]]]]) <- 1
V(linegraph)$main_path
}
测试
The Wikipedia article 用于主路径分析有一个图形,所有边都附有 SPC 值。您可以在上面看到此图的副本。我将此图转录到 R 中,包括预期的 SPC 值和(全局)主路径。
library(tibble)
wikipedia_g <- graph_from_data_frame(
tibble::tribble(
~from, ~to, ~expected_spc, ~expected_main_path
"A", "C", 2, 0,
"B", "C", 2, 0,
"B", "D", 5, 1,
"B", "J", 1, 0,
"C", "E", 2, 0,
"C", "H", 2, 0,
"D", "F", 3, 1,
"D", "I", 2, 0,
"J", "M", 1, 0,
"E", "G", 2, 0,
"F", "H", 1, 0,
"F", "I", 2, 1,
"G", "H", 2, 0,
"I", "L", 2, 0,
"I", "M", 2, 1,
"H", "K", 5, 0,
"M", "N", 3, 1),
directed = TRUE)
期望 spc
函数输出的所有值都等于 expected_spc
值,事实就是如此。同样,expected_main_path
的值应与 main_search
的输出相匹配,情况也是如此。
all(E(wikipedia_g)$expected_spc == spc(wikipedia_g))
# TRUE
all(E(wikipedia_g)$expected_main_path == main_search(wikipedia_g))
# TRUE
有人熟悉在 R 中使用 igraph 实现主路径分析(Hummon 和 Doreian 1989)的方法吗?
这是 Hummon 和 Doreian 原文中的一个示例。它跟踪 40 篇关于 DNA 的期刊文章的引用情况。箭头在时间上向前移动(信息来自旧文章 'flows to' 新文章)。
dna_edges <- data.frame(from=c(1,2,3,3,3,5,6,9,12,12,15,15,10,11,11,13,14,14,14,16,16,17,19,19,19,19,19,20,20,20,20,24,24,21,21,23,22,26,27,29,30,31,31,32,32,32,33,33,35,35,36,36,36),
to=c(8,18,4,5,21,12,9,12,15,29,29,22,17,13,20,20,16,20,31,17,20,34,20,24,25,21,25,31,22,30,22,28,37,22,32,27,27,27,32,32,40,32,40,36,38,33,32,35,38,39,38,39,40))
dna_g <- graph_from_data_frame(dna_edges, directed=T)
plot(dna_g,
layout=layout_with_sugiyama(dna_g,
layers = V(dna_g)$name)$layout)
Liu et al (2019) 说明在引文网络中节点可以是以下三种情况之一:
- 资料来源:被引用但未引用任何人
- Sinks:引用他人但从未被引用
- 中级:引用和被引用
所以在这个例子中,我们有十篇文章 'sources' 和另外十篇 'sinks':
dna_sources <- V(dna_g)$name[which(degree(dna_g, mode="in")==0)] # sources
[1] "8" "18" "4" "34" "25" "28" "37" "40" "38" "39"
dna_sinks <- V(dna_g)$name[which(degree(dna_g, mode="out")==0)] # sinks
[1] "1" "2" "3" "6" "10" "11" "14" "19" "23" "26"
主路径是连接源和汇的最常用路径。搜索路径计数 (SPC) 是其中一种方法。
"A citation link’s SPC is the number of times the link is traversed if one runs through all the possible citation chains from all the sources to all the sinks in a citation network. To find SPC for a specific link, one needs to enumerate all the possible citation chains that emanate from all the sources and terminate at all the sinks" (Liu et al. 2019: 381)
所以似乎为了继续需要 (i) 选择一对 source-sink,(ii) 找到连接这两个节点的所有路径并在每条边上添加 +1 权重交叉,(iii) 重复其他 source-sink 对。
关于如何执行 (i) 到 (iii) 有什么想法吗?
更新
如果你想找出哪条路径给出的总和权重最高,你可以试试下面的代码
g <- graph_from_data_frame(edgelst)
asp <- unlist(
sapply(
names(dna_sources),
function(s) {
all_simple_paths(g, s, names(dna_sinks))
}
),
recursive = FALSE
)
path_weight <- sapply(
asp,
function(p) {
v <- names(p)
sum(merge(
edgelst,
data.frame(from = head(v, -1), to = tail(v, -1))
)$weight)
}
)
max_cum <- max(path_weight)
path_max <- asp[path_weight == max_cum]
你会看到
> max_cum
[1] 30
> path_max
$`61`
+ 7/33 vertices, named, from a74a1fe:
[1] 6 9 12 29 32 36 39
$`111`
+ 6/33 vertices, named, from a74a1fe:
[1] 11 20 31 32 36 39
这是一个使用all_shortest_paths
的暴力破解方法(如果你不需要所有路径都是最短的,你可以使用all_simple_paths
代替)
dna_g <- graph_from_data_frame(dna_edges, directed = T)
dna_sources <- V(dna_g)[degree(dna_g, mode = "in") == 0]
dna_sinks <- V(dna_g)[degree(dna_g, mode = "out") == 0]
edgelst <- aggregate(weight ~ ., cbind(
do.call(
rbind,
unlist(sapply(
dna_sources,
function(s) {
# asp <- all_simple_paths(dna_g, s, dna_sinks) ## if we apply `all_simple_paths`
asp <- all_shortest_paths(dna_g, s, dna_sinks)$res
lapply(asp, function(p) {
v <- names(p)
data.frame(from = head(v, -1), to = tail(v, -1))
})
}
), recursive = FALSE)
),
weight = 1
), sum)
dna_df <- merge(dna_edges, edgelst, all = TRUE)
g <- graph_from_data_frame(dna_df, directed = TRUE)
plot(g, edge.label = dna_df$weight)
哪里
> dna_df
from to weight
1 1 8 1
2 2 18 1
3 3 4 1
4 3 5 NA
5 3 21 3
6 5 12 NA
7 6 9 3
8 9 12 3
9 10 17 1
10 11 13 NA
11 11 20 4
12 12 15 NA
13 12 29 3
14 13 20 NA
15 14 16 1
16 14 20 NA
17 14 31 3
18 15 22 NA
19 15 29 NA
20 16 17 1
21 16 20 NA
22 17 34 2
23 19 20 2
24 19 21 2
25 19 24 2
26 19 25 2
27 19 25 2
28 20 22 NA
29 20 22 NA
30 20 30 2
31 20 31 4
32 21 22 NA
33 21 32 5
34 22 27 NA
35 23 27 3
36 24 28 1
37 24 37 1
38 26 27 3
39 27 32 6
40 29 32 3
41 30 40 2
42 31 32 4
43 31 40 3
44 32 33 NA
45 32 36 11
46 32 38 7
47 33 32 NA
48 33 35 NA
49 35 38 NA
50 35 39 NA
51 36 38 NA
52 36 39 7
53 36 40 4
你会得到如下图
SPC
以下函数实现了SPC。
spc <- function(g) {
linegraph <- make_line_graph(g)
source_edges <- V(linegraph)[degree(linegraph, mode = "in") == 0]
sink_edges <- V(linegraph)[degree(linegraph, mode = "out") == 0]
tabulate(
unlist(
lapply(
source_edges,
all_simple_paths,
graph = linegraph,
to = sink_edges,
mode = "out")))
}
主路径搜索
以下函数查找主路径。请注意,如果有多个主路径具有相同的总 SPC 值,则可能还有其他主路径。此函数 returns 它找到的第一条主路径。
main_search <- function(g) {
linegraph <- make_line_graph(g)
V(linegraph)$spc <- spc(g)
source_edges <- V(linegraph)[degree(linegraph, mode = "in") == 0]
sink_edges <- V(linegraph)[degree(linegraph, mode = "out") == 0]
paths <- unlist(
lapply(
source_edges,
all_simple_paths,
graph = linegraph,
to = sink_edges,
mode = "out"),
recursive = FALSE)
path_lengths <- unlist(lapply(paths, function (x) sum(x$spc)))
vertex_attr(linegraph, "main_path") <- 0
vertex_attr(
linegraph,
"main_path",
paths[[which(path_lengths == max(path_lengths))[[1]]]]) <- 1
V(linegraph)$main_path
}
测试
The Wikipedia article 用于主路径分析有一个图形,所有边都附有 SPC 值。您可以在上面看到此图的副本。我将此图转录到 R 中,包括预期的 SPC 值和(全局)主路径。
library(tibble)
wikipedia_g <- graph_from_data_frame(
tibble::tribble(
~from, ~to, ~expected_spc, ~expected_main_path
"A", "C", 2, 0,
"B", "C", 2, 0,
"B", "D", 5, 1,
"B", "J", 1, 0,
"C", "E", 2, 0,
"C", "H", 2, 0,
"D", "F", 3, 1,
"D", "I", 2, 0,
"J", "M", 1, 0,
"E", "G", 2, 0,
"F", "H", 1, 0,
"F", "I", 2, 1,
"G", "H", 2, 0,
"I", "L", 2, 0,
"I", "M", 2, 1,
"H", "K", 5, 0,
"M", "N", 3, 1),
directed = TRUE)
期望 spc
函数输出的所有值都等于 expected_spc
值,事实就是如此。同样,expected_main_path
的值应与 main_search
的输出相匹配,情况也是如此。
all(E(wikipedia_g)$expected_spc == spc(wikipedia_g))
# TRUE
all(E(wikipedia_g)$expected_main_path == main_search(wikipedia_g))
# TRUE