网络分析中如何处理空值/NA

How to handle null values /NAs in network analysis

这个问题基本上是我之前发布的问题的延伸。
在这类情况下,应该如何处理空值/NA?示例场景和数据如下:

# Example data for the question: 15 rows, three id columns that link rows
# together, a display name, and the expected group label for each row.
# id_2 is missing (NA) in rows 8 and 14.
df1 <- data.frame(
  id_1 = c(
    "ABC", "ABC", "BCD", "CDE", "DEF",
    "EFG", "GHI", "HIJ", "IJK", "JKL",
    "GHI", "KLM", "LMN", "MNO", "NOP"
  ),
  id_2 = c(
    "1A", "2A", "3A", "1A", "4A",
    "5A", "6A", NA, "9A", "10A",
    "7A", "12A", "13A", NA, "15A"
  ),
  id_3 = c(
    "Z3", "Z2", "Z1", "Z4", "Z1",
    "Z5", "Z5", "Z6", "Z7", "Z8",
    "Z6", "Z8", "Z9", "Z9", "Z1"
  ),
  # "Whosebug1" ... "Whosebug15", one per row
  Name = paste0("Whosebug", seq_len(15L)),
  desired_output = c(
    1L, 1L, 2L, 1L, 2L,
    3L, 3L, 3L, 4L, 5L,
    3L, 5L, 6L, 6L, 2L
  ),
  stringsAsFactors = FALSE
)

df1
   id_1 id_2 id_3            Name desired_output
1   ABC   1A   Z3  Whosebug1              1
2   ABC   2A   Z2  Whosebug2              1
3   BCD   3A   Z1  Whosebug3              2
4   CDE   1A   Z4  Whosebug4              1
5   DEF   4A   Z1  Whosebug5              2
6   EFG   5A   Z5  Whosebug6              3
7   GHI   6A   Z5  Whosebug7              3
8   HIJ <NA>   Z6  Whosebug8              3
9   IJK   9A   Z7  Whosebug9              4
10  JKL  10A   Z8 Whosebug10              5
11  GHI   7A   Z6 Whosebug11              3
12  KLM  12A   Z8 Whosebug12              5
13  LMN  13A   Z9 Whosebug13              6
14  MNO <NA>   Z9 Whosebug14              6
15  NOP  15A   Z1 Whosebug15              2

但是,链接帖子中建议的三种方法都不起作用,并且都会报错。

请给出建议。

更新

如果某行有多个NA,可以试试下面的代码

# Add a GRP column to `df`: the connected component that each row's id_1
# belongs to, tolerating NA in any of the reshaped id columns.
# The igraph functions graph_from_data_frame(), components() and membership()
# must be available (e.g. via library(igraph)).
# local() keeps the intermediate objects out of the calling environment.
local({
  # Stack columns 2:3 (id_2, id_3) into a single "to" column, giving one
  # (id_1, to) edge per id value.
  edges <- reshape(
    df,
    direction = "long",
    idvar = c("id_1", "Name"),
    varying = 2:3,
    v.names = "to"
  )[c("id_1", "to")]

  # A missing target is replaced by the row's own id_1, i.e. a self-loop,
  # so no NA ever reaches the graph constructor.
  edges <- transform(edges, to = ifelse(is.na(to), id_1, to))

  # Component id per vertex name.
  grp <- membership(components(graph_from_data_frame(edges)))

  # Look up each row's component by its id_1.
  transform(df, GRP = grp[id_1])
})

这给出了

   id_1 id_2 id_3            Name GRP
1   ABC   1A   Z3  Whosebug1   1
2   ABC   2A   Z2  Whosebug2   1
3   BCD   3A   Z1  Whosebug3   2
4   CDE   1A   Z4  Whosebug4   1
5   DEF   4A   Z1  Whosebug5   2
6   EFG   5A   Z5  Whosebug6   3
7   GHI   6A   Z5  Whosebug7   3
8   HIJ <NA> <NA>  Whosebug8   4
9   IJK   9A   Z7  Whosebug9   5
10  JKL  10A   Z8 Whosebug10   6
11  GHI   7A   Z6 Whosebug11   3
12  KLM  12A   Z8 Whosebug12   6
13  LMN  13A <NA> Whosebug13   7
14  MNO <NA> <NA> Whosebug14   8
15  NOP  15A   Z1 Whosebug15   2

虚拟数据

> dput(df)
structure(list(id_1 = c("ABC", "ABC", "BCD", "CDE", "DEF", "EFG", 
"GHI", "HIJ", "IJK", "JKL", "GHI", "KLM", "LMN", "MNO", "NOP"
), id_2 = c("1A", "2A", "3A", "1A", "4A", "5A", "6A", NA, "9A",
"10A", "7A", "12A", "13A", NA, "15A"), id_3 = c("Z3", "Z2", "Z1",
"Z4", "Z1", "Z5", "Z5", NA, "Z7", "Z8", "Z6", "Z8", NA, NA, "Z1"
), Name = c("Whosebug1", "Whosebug2", "Whosebug3",
"Whosebug4", "Whosebug5", "Whosebug6", "Whosebug7",
"Whosebug8", "Whosebug9", "Whosebug10", "Whosebug11",
"Whosebug12", "Whosebug13", "Whosebug14", "Whosebug15"
)), row.names = c(NA, -15L), class = "data.frame")

上一个答案

也许您可以将 id_2 列中的 NA 替换为 id_1 中的值,然后按照前面问题的答案进行操作。

你可以试试这个

# Add a GRP column to `df`: the connected component that each row's id_1
# belongs to, after filling missing id_2 values with the row's own id_1.
# Only id_2 is filled here; NA values in other id columns are not handled.
# The igraph functions graph_from_data_frame(), components() and membership()
# must be available (e.g. via library(igraph)).
# local() keeps the intermediate objects out of the calling environment.
local({
  # Replace NA in id_2 by id_1, which later becomes a self-loop edge.
  filled <- transform(df, id_2 = ifelse(is.na(id_2), id_1, id_2))

  # Stack columns 2:3 (id_2, id_3) into a single "to" column, giving one
  # (id_1, to) edge per id value.
  edges <- reshape(
    filled,
    direction = "long",
    idvar = c("id_1", "Name"),
    varying = 2:3,
    v.names = "to"
  )[c("id_1", "to")]

  # Component id per vertex name.
  grp <- membership(components(graph_from_data_frame(edges)))

  # Look up each row's component by its id_1.
  transform(df, GRP = grp[id_1])
})

这给出了

   id_1 id_2 id_3            Name GRP
1   ABC   1A   Z3  Whosebug1   1
2   ABC   2A   Z2  Whosebug2   1
3   BCD   3A   Z1  Whosebug3   2
4   CDE   1A   Z4  Whosebug4   1
5   DEF   4A   Z1  Whosebug5   2
6   EFG   5A   Z5  Whosebug6   3
7   GHI   6A   Z5  Whosebug7   3
8   HIJ <NA>   Z6  Whosebug8   3
9   IJK   9A   Z7  Whosebug9   4
10  JKL  10A   Z8 Whosebug10   5
11  GHI   7A   Z6 Whosebug11   3
12  KLM  12A   Z8 Whosebug12   5
13  LMN  13A   Z9 Whosebug13   6
14  MNO <NA>   Z9 Whosebug14   6
15  NOP  15A   Z1 Whosebug15   2

只需删除 NA:

# Store a connected-component label for every row of `df` in
# df$desired_output, dropping NA ids before the edges are built.
# NOTE(review): `f` is not defined in this snippet. It presumably comes from
# the linked earlier answer and appears to turn one row's values into a list
# of 2-element edges (the flatten() + reduce(rbind) steps below need that
# shape) -- confirm before running.
# Uses dplyr (select, mutate, row_number), purrr (pmap, map, flatten, reduce)
# and igraph (graph_from_edgelist, components, membership).
df$desired_output <- df %>%
  # keep only the columns named id_<number>
  select(matches("^id_[0-9]+$")) %>%
  # add the row number as an extra value; results are looked up by it below
  mutate(row = row_number()) %>%
  # one vector per row, with the NA entries removed
  pmap(~c(...) %>% .[!is.na(.)]) %>%
  map(f) %>%
  # merge the per-row results into one list, then row-bind them
  # (presumably into a two-column edge matrix -- depends on `f`)
  flatten() %>%
  reduce(rbind) %>%
  igraph::graph_from_edgelist() %>% 
  components() %>%
  membership() %>%
  # pick each row's component, using the row number as the vertex name
  .[as.character(seq_len(nrow(df)))]

补充说明

为了能够对这个网络进行聚类,我们需要一个由 id 构成的边列表。在这个数据框中,每一行实际上都有 3 个 id,构成一个三元组,也就是说同一行中的三个 id 彼此相连。因此:

  • 我首先选择了 3 个目标变量,然后我使用 pmap 函数创建了这 3 个顶点之间长度为 2 的每个组合,并将结果按行绑定在一起
  • 在下一步中,我们得到了一个只有 2 个变量的数据框,它构成了我们所需的边列表,其中包含原始顶点(变量)之间所有已有的边
  • 然后我使用 tidyr::drop_na 删除含有缺失值的边。这里的假设是:在像 HIJ --- NA 这样的情况下,只有当 HIJ 与其他观测中的其他顶点之间存在边时,我们才能确定 HIJ 属于哪个组,所以在第一个代码块中删除这些边是安全的
  • 最后我用 membership 函数提取了相关的连通分量,也用 groups 提取了聚类。desired_output 变量正是取自这些聚类的编号。怎么做到的呢?由于每个聚类都与其他聚类完全分开,原始数据集中的每个顶点(id)只能属于一个聚类,所以在第二个代码块中只需检查 id_1 属于哪个聚类即可。
library(tidyverse)
library(igraph)

# Step 1: build the groups.
# For every row, form all pairs of its id values (combn(..., 2)), bind the
# pairs into a two-column edge list, drop pairs that contain an NA, and split
# the resulting graph into connected components. `lst` is a list with one
# element per component, each holding the vertex names in that component.
lst <- df1 %>%
  select(starts_with("id")) %>%
  pmap_dfr(~ as.data.frame(t(combn(c(...), 2)))) %>%
  drop_na() %>%
  graph_from_data_frame(directed = TRUE) %>%
  components() %>%
  groups()

# Step 2: label each row with the index of the component containing its id_1.
# rowwise() makes `id_1` a single value inside mutate().
# NOTE(review): a row whose other ids are all NA loses every edge in
# drop_na(), so its id_1 would be in no component and this lookup would
# return a zero-length value -- not the case for df1, but confirm for new data.
df1 %>%
  rowwise() %>%
  mutate(grp = seq_along(lst)[map_lgl(lst, ~ id_1 %in% .x)])

# A tibble: 15 x 6
# Rowwise: 
   id_1  id_2  id_3  Name            desired_output   grp
   <chr> <chr> <chr> <chr>                    <int> <int>
 1 ABC   1A    Z3    Whosebug1               1     1
 2 ABC   2A    Z2    Whosebug2               1     1
 3 BCD   3A    Z1    Whosebug3               2     2
 4 CDE   1A    Z4    Whosebug4               1     1
 5 DEF   4A    Z1    Whosebug5               2     2
 6 EFG   5A    Z5    Whosebug6               3     3
 7 GHI   6A    Z5    Whosebug7               3     3
 8 HIJ   NA    Z6    Whosebug8               3     3
 9 IJK   9A    Z7    Whosebug9               4     4
10 JKL   10A   Z8    Whosebug10              5     5
11 GHI   7A    Z6    Whosebug11              3     3
12 KLM   12A   Z8    Whosebug12              5     5
13 LMN   13A   Z9    Whosebug13              6     6
14 MNO   NA    Z9    Whosebug14              6     6
15 NOP   15A   Z1    Whosebug15              2     2