使用 R 中的 igraph 计算空心三角形顶点中节点的出现次数

Question

在篮球运动员之间的传球网络中，我想：

检测网络中的开放三角形
统计代理位置的唯一玩家数量（A传给B&C；B&C互不传；A在代理）
数一数这些玩家促成了开放三角形的次数

根据这个问题我们可以做以下事情：

library(igraph)

set.seed(1234)
G <- sample_gnm(10, 15)

G
IGRAPH 72f8e6a U--- 10 15 -- Erdos renyi (gnm) graph
+ attr: name (g/c), type (g/c), loops (g/l), m (g/n)
+ edges from 72f8e6a:
[1] 1-- 3 1-- 4 3-- 4 1-- 5 3-- 5 6-- 7 3-- 8 4-- 8 6-- 8 7-- 8 2-- 9 6-- 9 7-- 9 4--10 9--10
 
plot(G)

找到空心三角形：

openTriList <- unique(do.call(c, lapply(as_ids(V(G)), function(v) {
    do.call(c, lapply(as_ids(neighbors(G, v)), function(v1) {
    v2 <- as_ids(neighbors(G, v1))
    v2 <- v2[shortest.paths(G, v, v2) == 2]

    if(length(v2) != 0) {
        lapply(v2, function(vv2) { c(v, v1, vv2)[order(c(v, v1, vv2))] })
    } else { list() }
    }))
})))

结果正确：

do.call(rbind, openTriList)
  [,1] [,2] [,3]
  [1,]    1    3    8
  [2,]    1    4    8
  [3,]    1    4   10
  [4,]    2    6    9
  [5,]    2    7    9
  [6,]    2    9   10
  [7,]    3    4   10
  [8,]    3    6    8
  [9,]    3    7    8
  [10,]    1    4    5
  [11,]    3    4    5
  [12,]    4    6    8
  [13,]    4    7    8
  [14,]    4    9   10
  [15,]    3    5    8
  [16,]    6    9   10
  [17,]    7    9   10
  [18,]    4    8   10
  [19,]    6    8    9
  [20,]    7    8    9

我们如何找到作为经纪人的玩家？

玩家 2 在此列表中，因为它是开放三角形的一部分，但不是经纪人。我们忽略这个玩家。

我们如何有效地统计这些玩家促成了空心三角的次数？

玩家 9 正在代理 5 个开放三角形。

[真实数据手握数百万次传球，数千名球员。所以性能是一个重要的方面。使用 combn 会导致极长的计算时间。有更快的方法吗？也许让邻接图构建一个稀疏矩阵并将其转换为 data.table 对象以供邻居加入？看到这个 link。 ]

Answer 1

更新

如果你想加快速度，下面是使用 for 循环 + combn 定义用户函数 f 的选项，它会输出一个包含 [=18] 的列表=] 和 occurCnt（感谢@minem 的反馈以及性能改进）：

f <- function(G) {
  dmat <- as_adj(G, sparse = FALSE)
  resLst <- c()
  for (broker in 1:nrow(dmat)) {
    k <- which(dmat[broker, ] == 1)
    if (length(k) > 1) {
      inds <- t(combn(k, 2))
      resLst[[broker]] <- subset(cbind(broker, inds), dmat[inds] == 0)
    }
  }
  resLst <- do.call(rbind, resLst)
  resCnt <- table(resLst[, "broker"])
  list(openTriLst = resLst, occurCnt = resCnt)
}

你会看到他们可以达到预期的输出

> set.seed(1234)

> G <- sample_gnm(10, 15)

> f(G)
$openTriLst
      broker
 [1,]      1 4  5
 [2,]      3 1  8
 [3,]      3 4  5
 [4,]      3 5  8
 [5,]      4 1  8
 [6,]      4 1 10
 [7,]      4 3 10
 [8,]      4 8 10
 [9,]      6 8  9
[10,]      7 8  9
[11,]      8 3  6
[12,]      8 3  7
[13,]      8 4  6
[14,]      8 4  7
[15,]      9 2  6
[16,]      9 2  7
[17,]      9 2 10
[18,]      9 6 10
[19,]      9 7 10
[20,]     10 4  9

$occurCnt

 1  3  4  6  7  8  9 10
 1  3  4  1  1  4  5  1

而且速度比我之前的回答有明显提升。您还可以将其与 .

进行比较

> set.seed(1234)

> G1 <- sample_gnm(1000, 4000)

> system.time(f(G1))
   user  system elapsed 
   0.07    0.00    0.08

> G2 <- sample_gnm(10000, 40000)

> system.time(f(G2))
   user  system elapsed 
   2.46    0.16    2.62

上一个答案

您可以使用 combn + are_ajdacent 尝试下面的代码，例如，

G <- sample_gnm(10, 15) %>%
  get.data.frame() %>%
  graph_from_data_frame(directed = FALSE)

openTriList <- do.call(
  rbind,
  sapply(
    names(V(G)),
    function(v) {
      nbs <- names(neighbors(G, v))
      if (length(nbs) > 1) {
        do.call(rbind, Filter(length, combn(nbs, 2, FUN = function(x) {
          if (!are_adjacent(G, x[1], x[2])) {
            sort(as.numeric(c(v, x)))
          }
        }, simplify = FALSE)))
      }
    }
  )
)

occurCount <- na.omit(
  sapply(names(V(G)), function(v) {
    nbs <- names(neighbors(G, v))
    ifelse(length(nbs) > 1,
      sum(!combn(nbs,
        2,
        FUN = function(x) are_adjacent(G, x[1], x[2])
      )),
      NA
    )
  })
)

您将获得命名向量

> openTriList
      [,1] [,2] [,3]
 [1,]    1    4    5
 [2,]    1    3    8
 [3,]    3    4    5
 [4,]    3    5    8
 [5,]    6    8    9
 [6,]    1    4    8
 [7,]    1    4   10
 [8,]    3    4   10
 [9,]    4    8   10
[10,]    7    8    9
[11,]    2    6    9
[12,]    6    9   10
[13,]    2    7    9
[14,]    7    9   10
[15,]    2    9   10
[16,]    3    6    8
[17,]    3    7    8
[18,]    4    6    8
[19,]    4    7    8
[20,]    4    9   10

> occurCount
 1  3  6  4  7  9  5  8 10 
 1  3  1  4  1  5  0  4  1
attr(,"na.action")
2
6
attr(,"class")
[1] "omit"

Answer 2

我修改了 Thomas 的 openTriList 计算：

require(data.table)

v2 <- function(a) {
  n <- names(V(a))
  d <- as.data.table(a %>% get.data.frame())
  d <- as.data.table(lapply(d, as.numeric))
  k1 <- d[[1]]
  k2 <- d[[2]]
  ks <- k1 + k2
  # v <- n[[1]] # for testing
  xx <- sapply(n, function(v) {
    nbs <- as.numeric(names(neighbors(a, v)))
    vn <- as.numeric(v)
    
    if (length(nbs) > 1) {
      i2 <- combn(nbs, 2, simplify = F)
      
      # reduce test vectors:
      ss <- sapply(i2, sum)
      ii <- ks %in% ss
      kk1 <- k1[ii]
      kk2 <- k2[ii]
      
      # reduce test vectors 2:
      i <- (kk1 %in% nbs) & (kk2 %in% nbs)
      kk1 <- kk1[i]
      kk2 <- kk2[i]
      
      # x <- i2[[1]] # for testing
      i3 <- lapply(i2, function(x) {
        q1 <- kk1 == x[1]
        q2 <- kk2 == x[2]
        zz <- q1 & q2
        r2 <- !any(zz)
        if (is.null(r2) || r2) {
          rs <- c(vn, x)
          rs <- .Internal(sort(rs, decreasing = F)) # less overhead
          rs
        }
      })
      s <- i3[lengths(i3) > 0]
      do.call(rbind, s)
      }
    }
  )
  do.call(rbind, xx)
}

openTriList <- v2(G)

这应该会快很多。 combn 不慢。慢是are_adjacent。 Thomas 代码编写得相当复杂，很难调试并发现可能的减速... 测试：

set.seed(1)
G <- sample_gnm(1000, 4000)
G <- get.data.frame(G) %>% graph_from_data_frame(directed = FALSE)
system.time(r1 <- v1(G)) # 12.00 sec
system.time(r2 <- v2(G)) # 0.99 sec
all.equal(r1, r2) # TRUE

更新

fun_openTriList2 <- function(G) {
  dmat <- as_adj(G, sparse = FALSE)
  res <- list()
  for (i in 1:nrow(dmat)) {
    inds <- which(dmat[i, ] == 1)
    if (length(inds) > 1) {
      r <- combn(inds, 2,
                 FUN = function(x) {
                   if (dmat[x[1], x[2]] == 0) {
                     .Internal(sort(c(i, x), decreasing = F))
                   }
                 },
                 simplify = F
      )
      res[[i]] <- do.call(rbind, r)
    }
  }
  res <- do.call(rbind, res)
  res
}

使用 R 中的 igraph 计算空心三角形顶点中节点的出现次数

Count occurrence of nodes in vertex of open triangles using igraph in R

performance

networking

r

igraph

更新

上一个答案

更新