网络分析中如何处理空值/NA
How to handle null values /NAs in network analysis
这个问题基本上是我之前发布的问题的延伸:
在这类情况下,应该如何处理空值(null values/NA)?
示例场景和数据
# Example data: three id columns, a label, and the cluster number the asker
# expects for each row. id_2 is missing in rows 8 and 14.
df1 <- data.frame(
  id_1 = c(
    "ABC", "ABC", "BCD", "CDE", "DEF",
    "EFG", "GHI", "HIJ", "IJK", "JKL",
    "GHI", "KLM", "LMN", "MNO", "NOP"
  ),
  id_2 = c(
    "1A", "2A", "3A", "1A", "4A",
    "5A", "6A", NA, "9A", "10A",
    "7A", "12A", "13A", NA, "15A"
  ),
  id_3 = c(
    "Z3", "Z2", "Z1", "Z4", "Z1",
    "Z5", "Z5", "Z6", "Z7", "Z8",
    "Z6", "Z8", "Z9", "Z9", "Z1"
  ),
  # Labels are "Whosebug1" ... "Whosebug15", one per row.
  Name = paste0("Whosebug", 1:15),
  desired_output = c(
    1L, 1L, 2L, 1L, 2L,
    3L, 3L, 3L, 4L, 5L,
    3L, 5L, 6L, 6L, 2L
  ),
  stringsAsFactors = FALSE
)
df1
id_1 id_2 id_3 Name desired_output
1 ABC 1A Z3 Whosebug1 1
2 ABC 2A Z2 Whosebug2 1
3 BCD 3A Z1 Whosebug3 2
4 CDE 1A Z4 Whosebug4 1
5 DEF 4A Z1 Whosebug5 2
6 EFG 5A Z5 Whosebug6 3
7 GHI 6A Z5 Whosebug7 3
8 HIJ <NA> Z6 Whosebug8 3
9 IJK 9A Z7 Whosebug9 4
10 JKL 10A Z8 Whosebug10 5
11 GHI 7A Z6 Whosebug11 3
12 KLM 12A Z8 Whosebug12 5
13 LMN 13A Z9 Whosebug13 6
14 MNO <NA> Z9 Whosebug14 6
15 NOP 15A Z1 Whosebug15 2
但是,链接帖子中建议的三种方法都不起作用,运行时都会报错。
请给出建议。
更新
如果某行有多个 NA,可以试试下面的代码:
local({
  # Stack id_2 and id_3 into a single "to" column, so that every row of the
  # long table is one id_1 -> to edge.
  long <- reshape(
    df,
    direction = "long",
    idvar = c("id_1", "Name"),
    varying = 2:3,
    v.names = "to"
  )
  edges <- long[c("id_1", "to")]
  # A missing endpoint is replaced by the row's own id_1, which turns the edge
  # into a self-loop instead of an edge to NA.
  edges$to <- ifelse(is.na(edges$to), edges$id_1, edges$to)
  comp <- components(graph_from_data_frame(edges))
  # Label every row of df with the component id of its id_1 vertex.
  transform(df, GRP = membership(comp)[id_1])
})
这给出了
id_1 id_2 id_3 Name GRP
1 ABC 1A Z3 Whosebug1 1
2 ABC 2A Z2 Whosebug2 1
3 BCD 3A Z1 Whosebug3 2
4 CDE 1A Z4 Whosebug4 1
5 DEF 4A Z1 Whosebug5 2
6 EFG 5A Z5 Whosebug6 3
7 GHI 6A Z5 Whosebug7 3
8 HIJ <NA> <NA> Whosebug8 4
9 IJK 9A Z7 Whosebug9 5
10 JKL 10A Z8 Whosebug10 6
11 GHI 7A Z6 Whosebug11 3
12 KLM 12A Z8 Whosebug12 6
13 LMN 13A <NA> Whosebug13 7
14 MNO <NA> <NA> Whosebug14 8
15 NOP 15A Z1 Whosebug15 2
虚拟数据
> dput(df)
structure(list(id_1 = c("ABC", "ABC", "BCD", "CDE", "DEF", "EFG",
"GHI", "HIJ", "IJK", "JKL", "GHI", "KLM", "LMN", "MNO", "NOP"
), id_2 = c("1A", "2A", "3A", "1A", "4A", "5A", "6A", NA, "9A",
"10A", "7A", "12A", "13A", NA, "15A"), id_3 = c("Z3", "Z2", "Z1",
"Z4", "Z1", "Z5", "Z5", NA, "Z7", "Z8", "Z6", "Z8", NA, NA, "Z1"
), Name = c("Whosebug1", "Whosebug2", "Whosebug3",
"Whosebug4", "Whosebug5", "Whosebug6", "Whosebug7",
"Whosebug8", "Whosebug9", "Whosebug10", "Whosebug11",
"Whosebug12", "Whosebug13", "Whosebug14", "Whosebug15"
)), row.names = c(NA, -15L), class = "data.frame")
上一个答案
也许您可以将 id_2 列中的 NA 替换为 id_1 中的值,然后按照前面问题的答案进行操作。
你可以试试下面的代码:
local({
  # Fill a missing id_2 with the row's own id_1 before reshaping.
  filled <- transform(df, id_2 = ifelse(is.na(id_2), id_1, id_2))
  # Stack id_2 and id_3 into a single "to" column: one id_1 -> to edge per row.
  long <- reshape(
    filled,
    direction = "long",
    idvar = c("id_1", "Name"),
    varying = 2:3,
    v.names = "to"
  )
  comp <- components(graph_from_data_frame(long[c("id_1", "to")]))
  # Label every row of the original df with the component of its id_1 vertex.
  transform(df, GRP = membership(comp)[id_1])
})
这给出了
id_1 id_2 id_3 Name GRP
1 ABC 1A Z3 Whosebug1 1
2 ABC 2A Z2 Whosebug2 1
3 BCD 3A Z1 Whosebug3 2
4 CDE 1A Z4 Whosebug4 1
5 DEF 4A Z1 Whosebug5 2
6 EFG 5A Z5 Whosebug6 3
7 GHI 6A Z5 Whosebug7 3
8 HIJ <NA> Z6 Whosebug8 3
9 IJK 9A Z7 Whosebug9 4
10 JKL 10A Z8 Whosebug10 5
11 GHI 7A Z6 Whosebug11 3
12 KLM 12A Z8 Whosebug12 5
13 LMN 13A Z9 Whosebug13 6
14 MNO <NA> Z9 Whosebug14 6
15 NOP 15A Z1 Whosebug15 2
只需删除 NA:
# Store, for every row of df, the id of the connected component the row falls in.
# NOTE(review): `f` is not defined in this snippet; it presumably comes from the
# answer to the linked earlier question -- confirm it is in scope before running.
df$desired_output <- df %>%
# Keep only the columns named id_<digits>.
select(matches("^id_[0-9]+$")) %>%
# Append the row number so each row carries its own index.
mutate(row = row_number()) %>%
# Per row: combine all values into one vector and drop the NA entries.
pmap(~c(...) %>% .[!is.na(.)]) %>%
# Apply f to each row vector (presumably producing edge pairs -- TODO confirm).
map(f) %>%
# Remove one level of list nesting, then stack the pieces row-wise.
flatten() %>%
reduce(rbind) %>%
igraph::graph_from_edgelist() %>%
components() %>%
membership() %>%
# Look up membership by row number (as a character name); this assumes the row
# numbers end up as vertex names, which depends on what f emits -- TODO confirm.
.[as.character(seq_len(nrow(df)))]
补充说明
为了能够对这个网络进行聚类,我们需要一个由 id 组成的边列表。在这个数据框中,每一行实际上都有 3 个 id,构成一种三元组结构,也就是说,每一行表示这三个 id 之间互相连接。所以:
- 我首先选择了 3 个目标变量,然后使用 pmap 函数为每一行生成这 3 个顶点的所有两两组合(长度为 2 的组合),并将结果按行绑定在一起
- 这样,我们就得到了一个只有 2 个变量的数据框,它就是我们所需的边列表,包含了原始顶点(变量)之间所有已有的边
- 然后我使用 tidyr::drop_na 删除含有缺失值的边。这里的假设是:在像 HIJ --- NA 这样的情形中,只有当 HIJ 与其他观测中的顶点之间存在边时,我们才能确定 HIJ 属于哪个分组,所以在第一个代码块中删除这些边是安全的
- 最后,我用 membership 函数提取了相关的连通分量,并用 groups 提取了聚类。desired_output 变量正是根据这些聚类的编号得到的。具体怎么做呢?
由于每个聚类都与其他聚类完全分离,我们假设原始数据集中的每个顶点(即每个 id)只能属于一个聚类;第二个代码块就是据此为每一行查找所属的聚类,并且为此只检查了 id_1。
library(tidyverse)
library(igraph)
# Build an edge list from every pair of ids within a row, drop the pairs that
# contain NA, and collect the vertex names of each connected component.
lst <- groups(
  components(
    graph_from_data_frame(
      drop_na(
        pmap_dfr(
          select(df1, starts_with("id")),
          ~ as.data.frame(t(combn(c(...), 2)))
        )
      ),
      directed = TRUE
    )
  )
)
# For each row, grp is the position in lst of the component containing id_1.
mutate(
  rowwise(df1),
  grp = seq_along(lst)[map_lgl(lst, function(members) id_1 %in% members)]
)
# A tibble: 15 x 6
# Rowwise:
id_1 id_2 id_3 Name desired_output grp
<chr> <chr> <chr> <chr> <int> <int>
1 ABC 1A Z3 Whosebug1 1 1
2 ABC 2A Z2 Whosebug2 1 1
3 BCD 3A Z1 Whosebug3 2 2
4 CDE 1A Z4 Whosebug4 1 1
5 DEF 4A Z1 Whosebug5 2 2
6 EFG 5A Z5 Whosebug6 3 3
7 GHI 6A Z5 Whosebug7 3 3
8 HIJ NA Z6 Whosebug8 3 3
9 IJK 9A Z7 Whosebug9 4 4
10 JKL 10A Z8 Whosebug10 5 5
11 GHI 7A Z6 Whosebug11 3 3
12 KLM 12A Z8 Whosebug12 5 5
13 LMN 13A Z9 Whosebug13 6 6
14 MNO NA Z9 Whosebug14 6 6
15 NOP 15A Z1 Whosebug15 2 2
这个问题基本上是我之前发布的问题的延伸:
在这类情况下,应该如何处理空值(null values/NA)?
示例场景和数据
# Example data: three id columns, a label, and the cluster number the asker
# expects for each row. id_2 is missing in rows 8 and 14.
df1 <- data.frame(
  id_1 = c(
    "ABC", "ABC", "BCD", "CDE", "DEF",
    "EFG", "GHI", "HIJ", "IJK", "JKL",
    "GHI", "KLM", "LMN", "MNO", "NOP"
  ),
  id_2 = c(
    "1A", "2A", "3A", "1A", "4A",
    "5A", "6A", NA, "9A", "10A",
    "7A", "12A", "13A", NA, "15A"
  ),
  id_3 = c(
    "Z3", "Z2", "Z1", "Z4", "Z1",
    "Z5", "Z5", "Z6", "Z7", "Z8",
    "Z6", "Z8", "Z9", "Z9", "Z1"
  ),
  # Labels are "Whosebug1" ... "Whosebug15", one per row.
  Name = paste0("Whosebug", 1:15),
  desired_output = c(
    1L, 1L, 2L, 1L, 2L,
    3L, 3L, 3L, 4L, 5L,
    3L, 5L, 6L, 6L, 2L
  ),
  stringsAsFactors = FALSE
)
df1
id_1 id_2 id_3 Name desired_output
1 ABC 1A Z3 Whosebug1 1
2 ABC 2A Z2 Whosebug2 1
3 BCD 3A Z1 Whosebug3 2
4 CDE 1A Z4 Whosebug4 1
5 DEF 4A Z1 Whosebug5 2
6 EFG 5A Z5 Whosebug6 3
7 GHI 6A Z5 Whosebug7 3
8 HIJ <NA> Z6 Whosebug8 3
9 IJK 9A Z7 Whosebug9 4
10 JKL 10A Z8 Whosebug10 5
11 GHI 7A Z6 Whosebug11 3
12 KLM 12A Z8 Whosebug12 5
13 LMN 13A Z9 Whosebug13 6
14 MNO <NA> Z9 Whosebug14 6
15 NOP 15A Z1 Whosebug15 2
但是,链接帖子中建议的三种方法都不起作用,运行时都会报错。
请给出建议。
更新
如果某行有多个 NA,可以试试下面的代码:
local({
  # Stack id_2 and id_3 into a single "to" column, so that every row of the
  # long table is one id_1 -> to edge.
  long <- reshape(
    df,
    direction = "long",
    idvar = c("id_1", "Name"),
    varying = 2:3,
    v.names = "to"
  )
  edges <- long[c("id_1", "to")]
  # A missing endpoint is replaced by the row's own id_1, which turns the edge
  # into a self-loop instead of an edge to NA.
  edges$to <- ifelse(is.na(edges$to), edges$id_1, edges$to)
  comp <- components(graph_from_data_frame(edges))
  # Label every row of df with the component id of its id_1 vertex.
  transform(df, GRP = membership(comp)[id_1])
})
这给出了
id_1 id_2 id_3 Name GRP
1 ABC 1A Z3 Whosebug1 1
2 ABC 2A Z2 Whosebug2 1
3 BCD 3A Z1 Whosebug3 2
4 CDE 1A Z4 Whosebug4 1
5 DEF 4A Z1 Whosebug5 2
6 EFG 5A Z5 Whosebug6 3
7 GHI 6A Z5 Whosebug7 3
8 HIJ <NA> <NA> Whosebug8 4
9 IJK 9A Z7 Whosebug9 5
10 JKL 10A Z8 Whosebug10 6
11 GHI 7A Z6 Whosebug11 3
12 KLM 12A Z8 Whosebug12 6
13 LMN 13A <NA> Whosebug13 7
14 MNO <NA> <NA> Whosebug14 8
15 NOP 15A Z1 Whosebug15 2
虚拟数据
> dput(df)
structure(list(id_1 = c("ABC", "ABC", "BCD", "CDE", "DEF", "EFG",
"GHI", "HIJ", "IJK", "JKL", "GHI", "KLM", "LMN", "MNO", "NOP"
), id_2 = c("1A", "2A", "3A", "1A", "4A", "5A", "6A", NA, "9A",
"10A", "7A", "12A", "13A", NA, "15A"), id_3 = c("Z3", "Z2", "Z1",
"Z4", "Z1", "Z5", "Z5", NA, "Z7", "Z8", "Z6", "Z8", NA, NA, "Z1"
), Name = c("Whosebug1", "Whosebug2", "Whosebug3",
"Whosebug4", "Whosebug5", "Whosebug6", "Whosebug7",
"Whosebug8", "Whosebug9", "Whosebug10", "Whosebug11",
"Whosebug12", "Whosebug13", "Whosebug14", "Whosebug15"
)), row.names = c(NA, -15L), class = "data.frame")
上一个答案
也许您可以将 id_2 列中的 NA 替换为 id_1 中的值,然后按照前面问题的答案进行操作。
你可以试试下面的代码:
local({
  # Fill a missing id_2 with the row's own id_1 before reshaping.
  filled <- transform(df, id_2 = ifelse(is.na(id_2), id_1, id_2))
  # Stack id_2 and id_3 into a single "to" column: one id_1 -> to edge per row.
  long <- reshape(
    filled,
    direction = "long",
    idvar = c("id_1", "Name"),
    varying = 2:3,
    v.names = "to"
  )
  comp <- components(graph_from_data_frame(long[c("id_1", "to")]))
  # Label every row of the original df with the component of its id_1 vertex.
  transform(df, GRP = membership(comp)[id_1])
})
这给出了
id_1 id_2 id_3 Name GRP
1 ABC 1A Z3 Whosebug1 1
2 ABC 2A Z2 Whosebug2 1
3 BCD 3A Z1 Whosebug3 2
4 CDE 1A Z4 Whosebug4 1
5 DEF 4A Z1 Whosebug5 2
6 EFG 5A Z5 Whosebug6 3
7 GHI 6A Z5 Whosebug7 3
8 HIJ <NA> Z6 Whosebug8 3
9 IJK 9A Z7 Whosebug9 4
10 JKL 10A Z8 Whosebug10 5
11 GHI 7A Z6 Whosebug11 3
12 KLM 12A Z8 Whosebug12 5
13 LMN 13A Z9 Whosebug13 6
14 MNO <NA> Z9 Whosebug14 6
15 NOP 15A Z1 Whosebug15 2
只需删除 NA:
# Store, for every row of df, the id of the connected component the row falls in.
# NOTE(review): `f` is not defined in this snippet; it presumably comes from the
# answer to the linked earlier question -- confirm it is in scope before running.
df$desired_output <- df %>%
# Keep only the columns named id_<digits>.
select(matches("^id_[0-9]+$")) %>%
# Append the row number so each row carries its own index.
mutate(row = row_number()) %>%
# Per row: combine all values into one vector and drop the NA entries.
pmap(~c(...) %>% .[!is.na(.)]) %>%
# Apply f to each row vector (presumably producing edge pairs -- TODO confirm).
map(f) %>%
# Remove one level of list nesting, then stack the pieces row-wise.
flatten() %>%
reduce(rbind) %>%
igraph::graph_from_edgelist() %>%
components() %>%
membership() %>%
# Look up membership by row number (as a character name); this assumes the row
# numbers end up as vertex names, which depends on what f emits -- TODO confirm.
.[as.character(seq_len(nrow(df)))]
补充说明
为了能够对这个网络进行聚类,我们需要一个由 id 组成的边列表。在这个数据框中,每一行实际上都有 3 个 id,构成一种三元组结构,也就是说,每一行表示这三个 id 之间互相连接。所以:
- 我首先选择了 3 个目标变量,然后使用 pmap 函数为每一行生成这 3 个顶点的所有两两组合(长度为 2 的组合),并将结果按行绑定在一起
- 这样,我们就得到了一个只有 2 个变量的数据框,它就是我们所需的边列表,包含了原始顶点(变量)之间所有已有的边
- 然后我使用 tidyr::drop_na 删除含有缺失值的边。这里的假设是:在像 HIJ --- NA 这样的情形中,只有当 HIJ 与其他观测中的顶点之间存在边时,我们才能确定 HIJ 属于哪个分组,所以在第一个代码块中删除这些边是安全的
- 最后,我用 membership 函数提取了相关的连通分量,并用 groups 提取了聚类。desired_output 变量正是根据这些聚类的编号得到的。具体怎么做呢?
由于每个聚类都与其他聚类完全分离,我们假设原始数据集中的每个顶点(即每个 id)只能属于一个聚类;第二个代码块就是据此为每一行查找所属的聚类,并且为此只检查了 id_1。
library(tidyverse)
library(igraph)
# Build an edge list from every pair of ids within a row, drop the pairs that
# contain NA, and collect the vertex names of each connected component.
lst <- groups(
  components(
    graph_from_data_frame(
      drop_na(
        pmap_dfr(
          select(df1, starts_with("id")),
          ~ as.data.frame(t(combn(c(...), 2)))
        )
      ),
      directed = TRUE
    )
  )
)
# For each row, grp is the position in lst of the component containing id_1.
mutate(
  rowwise(df1),
  grp = seq_along(lst)[map_lgl(lst, function(members) id_1 %in% members)]
)
# A tibble: 15 x 6
# Rowwise:
id_1 id_2 id_3 Name desired_output grp
<chr> <chr> <chr> <chr> <int> <int>
1 ABC 1A Z3 Whosebug1 1 1
2 ABC 2A Z2 Whosebug2 1 1
3 BCD 3A Z1 Whosebug3 2 2
4 CDE 1A Z4 Whosebug4 1 1
5 DEF 4A Z1 Whosebug5 2 2
6 EFG 5A Z5 Whosebug6 3 3
7 GHI 6A Z5 Whosebug7 3 3
8 HIJ NA Z6 Whosebug8 3 3
9 IJK 9A Z7 Whosebug9 4 4
10 JKL 10A Z8 Whosebug10 5 5
11 GHI 7A Z6 Whosebug11 3 3
12 KLM 12A Z8 Whosebug12 5 5
13 LMN 13A Z9 Whosebug13 6 6
14 MNO NA Z9 Whosebug14 6 6
15 NOP 15A Z1 Whosebug15 2 2