如何在 R 中同时对三个字段进行网络分析

How to do network analysis on three fields simultaneously in R

如何在 R 中同时对三个字段进行网络分析。下面是示例数据以及最后一列中的 desired output

df <- data.frame(
  stringsAsFactors = FALSE,
                    id_1 = c("ABC","ABC","BCD",
                             "CDE","DEF","EFG","GHI","HIJ","IJK","JKL",
                             "GHI","KLM","LMN","MNO","NOP"),
                    id_2 = c("1A","2A","3A",
                             "1A","4A","5A","6A","8A","9A","10A","7A",
                             "12A","13A","14A","15A"),
                    id_3 = c("Z3","Z2","Z1",
                             "Z4","Z1","Z5","Z5","Z6","Z7","Z8","Z6","Z8",
                             "Z9","Z9","Z1"),
                    Name = c("Whosebug1",
                             "Whosebug2","Whosebug3","Whosebug4",
                             "Whosebug5","Whosebug6",
                             "Whosebug7","Whosebug8","Whosebug9",
                             "Whosebug10","Whosebug11","Whosebug12",
                             "Whosebug13","Whosebug14","Whosebug15"),
          desired_output = c(1L,1L,2L,1L,2L,
                             3L,3L,3L,4L,5L,3L,5L,6L,6L,2L)
      )
df
#>    id_1 id_2 id_3            Name desired_output
#> 1   ABC   1A   Z3  Whosebug1              1
#> 2   ABC   2A   Z2  Whosebug2              1
#> 3   BCD   3A   Z1  Whosebug3              2
#> 4   CDE   1A   Z4  Whosebug4              1
#> 5   DEF   4A   Z1  Whosebug5              2
#> 6   EFG   5A   Z5  Whosebug6              3
#> 7   GHI   6A   Z5  Whosebug7              3
#> 8   HIJ   8A   Z6  Whosebug8              3
#> 9   IJK   9A   Z7  Whosebug9              4
#> 10  JKL  10A   Z8 Whosebug10              5
#> 11  GHI   7A   Z6 Whosebug11              3
#> 12  KLM  12A   Z8 Whosebug12              5
#> 13  LMN  13A   Z9 Whosebug13              6
#> 14  MNO  14A   Z9 Whosebug14              6
#> 15  NOP  15A   Z1 Whosebug15              2

实际上我可以按照我自己的回答 中的描述使用 igraph 同时对 2 个字段执行网络分析,但我无法对 2 个字段执行此操作。

请帮忙。

我目前的方法(2次迭代),我觉得可以优化。

library(igraph)
library(tidyverse)

graph.data.frame(df) %>%
  components() %>%
  pluck(membership) %>%
  stack() %>%
  set_names(c('GRP', 'id_1')) %>%
  right_join(df %>% mutate(id_1 = as.factor(id_1)), by = c('id_1')) %>%
  select(GRP, id_3) %>%
  graph.data.frame() %>% 
  components() %>%
  pluck(membership) %>%
  stack() %>%
  set_names(c('GRP', 'id_3')) %>%
  right_join(df %>% mutate(id_3 = as.factor(id_3)), by = c('id_3'))
#>    GRP id_3 id_1 id_2            Name desired_output
#> 1    1   Z3  ABC   1A  Whosebug1              1
#> 2    1   Z2  ABC   2A  Whosebug2              1
#> 3    2   Z1  BCD   3A  Whosebug3              2
#> 4    2   Z1  DEF   4A  Whosebug5              2
#> 5    2   Z1  NOP  15A Whosebug15              2
#> 6    1   Z4  CDE   1A  Whosebug4              1
#> 7    3   Z5  EFG   5A  Whosebug6              3
#> 8    3   Z5  GHI   6A  Whosebug7              3
#> 9    3   Z6  HIJ   8A  Whosebug8              3
#> 10   3   Z6  GHI   7A Whosebug11              3
#> 11   4   Z7  IJK   9A  Whosebug9              4
#> 12   5   Z8  JKL  10A Whosebug10              5
#> 13   5   Z8  KLM  12A Whosebug12              5
#> 14   6   Z9  LMN  13A Whosebug13              6
#> 15   6   Z9  MNO  14A Whosebug14              6

reprex package (v2.0.1)

于 2021-11-15 创建

创建由 id 列和行号定义的顶点之间的所有连接的列表(函数 f)。最后你只对行之间的连接感兴趣。

f <- function(vec){
  
  i <- last(vec)
  vec <- head(vec, -1)
  
  c(
    seq_len(length(vec) - 1) %>% map(~vec[.x:(.x+1)]),
    vec %>% map(~c(i, .x))
  ) 
}

df$desired_output <- df %>%
  select(matches("^id_[0-9]+$")) %>%
  mutate(row = row_number()) %>%
  pmap(~f(c(...))) %>%
  flatten() %>%
  reduce(rbind) %>%
  igraph::graph_from_edgelist() %>% 
  components() %>%
  membership() %>%
  .[as.character(seq_len(nrow(df)))]

编辑

想象一下 id 之间的联系。您对行之间的连接感兴趣。为此,您需要为每一行添加顶点。这些顶点连接到该行中的所有 ID。

第 6 行示例:

6  EFG   5A   Z5

我们对 id 之间的联系感兴趣(c 函数 f 中的第一部分:

[[1]]
[1] "EFG" "5A" 

[[2]]
[1] "5A" "Z5"

以及行和 id 之间的连接(fc 的第二部分):

[[1]]
[1] "6"   "EFG"

[[2]]
[1] "6"  "5A"

[[3]]
[1] "6"  "Z5"

当您以这种方式创建图表时,您会得到:

并且您对连接了哪些行顶点感兴趣

您可以在为此结果创建图表时使用 directed = FALSE,或者如果您对此感兴趣,可以在 components 中使用 mode = "strong"

更新解决方案 如果只得到所需的列,它可能是另一种选择:

library(dplyr)
library(purrr)
library(igraph)

df %>%
  select(starts_with("id")) %>%
  pmap_dfr(~ as.data.frame(t(combn(c(...), 2)))) %>%
  graph_from_data_frame(directed = TRUE) %>%
  components() %>% 
  groups() -> lst
  
df %>%
  rowwise() %>%
  mutate(grp = seq_len(length(lst))[map_lgl(lst, ~ id_1 %in% .x)])

# A tibble: 15 x 5
# Rowwise: 
   id_1  id_2  id_3  Name              grp
   <chr> <chr> <chr> <chr>           <int>
 1 ABC   1A    Z3    Whosebug1      1
 2 ABC   2A    Z2    Whosebug2      1
 3 BCD   3A    Z1    Whosebug3      2
 4 CDE   1A    Z4    Whosebug4      1
 5 DEF   4A    Z1    Whosebug5      2
 6 EFG   5A    Z5    Whosebug6      3
 7 GHI   6A    Z5    Whosebug7      3
 8 HIJ   8A    Z6    Whosebug8      3
 9 IJK   9A    Z7    Whosebug9      4
10 JKL   10A   Z8    Whosebug10     5
11 GHI   7A    Z6    Whosebug11     3
12 KLM   12A   Z8    Whosebug12     5
13 LMN   13A   Z9    Whosebug13     6
14 MNO   14A   Z9    Whosebug14     6
15 NOP   15A   Z1    Whosebug15     2

为了让你只是绘制它:

library(dplyr)
library(purrr)
library(igraph)

df %>%
  mutate(id = row_number()) %>%
  select(starts_with("id"), id) %>%
  pmap_dfr(~ as.data.frame(t(combn(c(...), 2)))) %>%
  graph_from_data_frame(directed = TRUE) %>%
  plot()

您可以试试下面的代码

transform(
  df,
  GRP = membership(
    components(
      graph_from_data_frame(
        reshape(
          df,
          direction = "long",
          idvar = c("id_1", "Name"),
          varying = 2:3,
          v.names = "to"
        )[c("id_1", "to")]
      )
    )
  )[id_1]
)

这给出了

   id_1 id_2 id_3            Name GRP
1   ABC   1A   Z3  Whosebug1   1
2   ABC   2A   Z2  Whosebug2   1
3   BCD   3A   Z1  Whosebug3   2
4   CDE   1A   Z4  Whosebug4   1
5   DEF   4A   Z1  Whosebug5   2
6   EFG   5A   Z5  Whosebug6   3
7   GHI   6A   Z5  Whosebug7   3
8   HIJ   8A   Z6  Whosebug8   3
9   IJK   9A   Z7  Whosebug9   4
10  JKL  10A   Z8 Whosebug10   5
11  GHI   7A   Z6 Whosebug11   3
12  KLM  12A   Z8 Whosebug12   5
13  LMN  13A   Z9 Whosebug13   6
14  MNO  14A   Z9 Whosebug14   6
15  NOP  15A   Z1 Whosebug15   2

数据

> dput(df)
structure(list(id_1 = c("ABC", "ABC", "BCD", "CDE", "DEF", "EFG", 
"GHI", "HIJ", "IJK", "JKL", "GHI", "KLM", "LMN", "MNO", "NOP"
), id_2 = c("1A", "2A", "3A", "1A", "4A", "5A", "6A", "8A", "9A",
"10A", "7A", "12A", "13A", "14A", "15A"), id_3 = c("Z3", "Z2",
"Z1", "Z4", "Z1", "Z5", "Z5", "Z6", "Z7", "Z8", "Z6", "Z8", "Z9",
"Z9", "Z1"), Name = c("Whosebug1", "Whosebug2", "Whosebug3",
"Whosebug4", "Whosebug5", "Whosebug6", "Whosebug7",
"Whosebug8", "Whosebug9", "Whosebug10", "Whosebug11",
"Whosebug12", "Whosebug13", "Whosebug14", "Whosebug15"
)), row.names = c(NA, -15L), class = "data.frame")