尝试使用标签传播时出现 "Expected index" 错误
Expected index while trying to use Label propagation
我正在尝试将边标签(edge label)转换为节点标签,以便预测未标记的节点。目前数据集只有 edge_labels,但我需要让每个节点 (ID) 恰好有一个 node_label:
我使用的代码如下:
import networkx as nx
import pandas as pd
# Edge list stored column-wise: each of the 8 rows is one ID -> Target edge
# with a 'Weight' and a 0/1 'Label'.
data = {'ID': {0: 1, 1: 2, 2: 4, 3: 4, 4: 12, 5: 12, 6: 13, 7: 17},
'Target': {0: 12, 1: 24, 2: 13, 3: 12, 4: 1, 5: 4, 6: 4, 7: 1},
'Weight': {0: 0.4, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.1, 5: 0.4, 6: 0.2, 7: 0.1},
'Label': {0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0}}
df = pd.DataFrame.from_dict(data)
# Build the graph from the edge list; 'Weight' and 'Label' are attached to each edge.
G = nx.from_pandas_edgelist(df, source='ID', target='Target', edge_attr=['Weight', 'Label'])
# Per-edge drawing parameters, collected in G.edges() order.
width = [d['Weight'] for (u, v, d) in G.edges(data=True)]
edge_color = [d['Label'] for (u, v, d) in G.edges(data=True)]
nx.draw_networkx(G, width=width, edge_color=edge_color)
这应该返回唯一的 node_labels:
# Keep the first row for each distinct 'ID' and reduce it to the (ID, Label) columns.
# NOTE(review): only the 'ID' column is de-duplicated, so a node that occurs
# solely in 'Target' gets no row here; the snippets shown below do not read
# df_to_use either.
df_to_use=df.drop_duplicates(['ID'])
df_to_use=df_to_use[['ID','Label']]
# Dense adjacency matrix of G.
# NOTE(review): adjacency_matrix reads the edge attribute named 'weight' by
# default, while these edges store 'Weight' -- presumably the matrix is
# therefore unweighted; confirm whether weight='Weight' was intended.
adj_matrix = nx.adjacency_matrix(G).toarray()
构建邻接矩阵:
# Convert the adjacency matrix to a float tensor for LabelPropagation.
adj_matrix_t = torch.FloatTensor(adj_matrix)
# NOTE(review): df['Label'] holds one value per edge-list row (8 rows), whereas
# the traceback below reports a 7-row one-hot matrix (one row per node); this
# length mismatch is what fit() fails on.
labels_t = torch.LongTensor(df['Label'].tolist())
# Notebook-style display of the tensor shape (no effect when run as a script).
adj_matrix_t.shape
使用标签传播
# Fit the model on the adjacency tensor and read back one class index per node.
label_propagation = LabelPropagation(adj_matrix_t)
# end="" keeps the cursor on this line so the message printed by fit() follows it.
print("Label Propagation: ", end="")
# This is the call that raises the RuntimeError shown in the traceback below.
label_propagation.fit(labels_t)
label_propagation_output_labels = label_propagation.predict_classes()
最后一步出现以下错误:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-81-cf4f88a4bb12> in <module>
2 label_propagation = LabelPropagation(adj_matrix_t)
3 print("Label Propagation: ", end="")
----> 4 label_propagation.fit(labels_t)
5 label_propagation_output_labels = label_propagation.predict_classes()
6
<ipython-input-1-54a7dbc30bd1> in fit(self, labels, max_iter, tol)
100
101 def fit(self, labels, max_iter=1000, tol=1e-3):
--> 102 super().fit(labels, max_iter, tol)
103
104 ## Label spreading
<ipython-input-1-54a7dbc30bd1> in fit(self, labels, max_iter, tol)
58 Convergence tolerance: threshold to consider the system at steady state.
59 """
---> 60 self._one_hot_encode(labels)
61
62 self.predictions = self.one_hot_labels.clone()
<ipython-input-1-54a7dbc30bd1> in _one_hot_encode(self, labels)
42 labels[unlabeled_mask] = 0
43 self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
---> 44 self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
45 self.one_hot_labels[unlabeled_mask, 0] = 0
46
RuntimeError: Expected index [8, 1] to be smaller than self [7, 2] apart from dimension 1 and to be smaller size than src [7, 2]
你知道我该如何解决吗?
您有一些节点仅出现在 Target 列中,因此在查找所有唯一节点时需要把该列也考虑进去。
我的做法是:把 ID 和 Target 两列(各自连同 Label)拼接起来,
按节点 ID 分组并对 Label 求和,
然后将总和 > 0 的标签替换为 1(否则为 0):
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Edge list stored column-wise: one row per ID -> Target edge with a weight
# and a 0/1 label.
data = {
    'ID': {0: 1, 1: 2, 2: 4, 3: 4, 4: 12, 5: 12, 6: 13, 7: 17},
    'Target': {0: 12, 1: 24, 2: 13, 3: 12, 4: 1, 5: 4, 6: 4, 7: 1},
    'Weight': {0: 0.4, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.1, 5: 0.4, 6: 0.2, 7: 0.1},
    'Label': {0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0},
}
df = pd.DataFrame.from_dict(data)
G = nx.from_pandas_edgelist(
    df,
    source='ID',
    target='Target',
    edge_attr=['Weight', 'Label'],
)

# Drawing parameters in G.edges() order; weights are multiplied by 10 to give
# visible line widths.
width = [10 * attrs['Weight'] for _, _, attrs in G.edges(data=True)]
edge_color = [attrs['Label'] for _, _, attrs in G.edges(data=True)]

# Each edge hands its label to both endpoints: stack the (ID, Label) and
# (Target, Label) pairs under a common 'node' column.
df1 = df.loc[:, ['ID', 'Label']].rename(columns={'ID': 'node'})
df2 = df.loc[:, ['Target', 'Label']].rename(columns={'Target': 'node'})
stacked = pd.concat([df1, df2])

# Sum the labels per node, then collapse the sum to 1 (any positive label)
# or 0 (none).
df_to_use = stacked.groupby('node').sum().reset_index()
df_to_use['Label'] = (df_to_use['Label'] > 0).astype('int64')
print(df_to_use)
这给出了
node Label
0 1 1
1 2 0
2 4 1
3 12 1
4 13 1
5 17 0
6 24 0
我忍不住亲自试了一下这个方案,看看效果如何:
# Align the per-node labels with G.nodes(), the same node order that is used
# for the adjacency matrix rows below. A single dict lookup per node replaces
# filtering the whole frame once per node.
label_by_node = dict(zip(df_to_use['node'], df_to_use['Label']))
node_order = list(G.nodes())
node_labels = np.array([label_by_node[node] for node in node_order])

# Hide the label of one randomly chosen node; -1 marks it as unlabeled.
idx = np.random.choice(len(node_labels))
node_labels_missing = node_labels.copy()
node_labels_missing[idx] = -1

# Build the adjacency matrix here so this snippet does not depend on a
# variable left over from an earlier cell; nodelist pins the row order to
# the label order.
# NOTE(review): adjacency_matrix reads the 'weight' edge attribute by default
# and these edges store 'Weight', so the matrix is presumably unweighted --
# pass weight='Weight' if weighted propagation is intended.
adj_matrix = nx.adjacency_matrix(G, nodelist=node_order).toarray()
adj_matrix_t = torch.FloatTensor(adj_matrix)
labels_t = torch.LongTensor(node_labels_missing)

label_propagation = LabelPropagation(adj_matrix_t)
# end="" keeps the message printed by fit() on the same line.
print("Label Propagation: ", end="")
label_propagation.fit(labels_t)
label_propagation_output_labels = label_propagation.predict_classes()

# One shared layout so the three panels are directly comparable.
pos = nx.spring_layout(G)
fig = plt.figure(1, figsize=(15, 4))
plt.clf()
fig, ax = plt.subplots(1, 3, num=1)
ax[0].set_title("Actual Labels")
ax[1].set_title("One Label Removed")
ax[2].set_title("With Predicted Label")

# Nodes labeled 0 or 1 keep that value as their colour; the hidden node gets
# the intermediate value 0.5.
missing_colors = np.where(np.isin(node_labels_missing, (0, 1)), node_labels_missing, 0.5)

# draw_networkx returns None, so its result is not kept.
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=node_labels, ax=ax[0])
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=missing_colors, ax=ax[1])
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=label_propagation_output_labels, ax=ax[2])
这给出了
注意:对于没有上下文的读者,该用户正在尝试重新实现 this code。
下面的自定义 class 定义必须在运行上面的代码之前执行。
from abc import abstractmethod
import torch
class BaseLabelPropagation:
    """Base class for label propagation models.

    Subclasses provide the adjacency normalization (``_normalize``) and a
    single propagation step (``_propagate``); this class one-hot encodes the
    labels and runs the iteration until the predictions stop changing.

    Parameters
    ----------
    adj_matrix: torch.FloatTensor
        Adjacency matrix of the graph.
    """

    def __init__(self, adj_matrix):
        self.norm_adj_matrix = self._normalize(adj_matrix)
        self.n_nodes = adj_matrix.size(0)
        # The attributes below are filled in by fit().
        self.one_hot_labels = None
        self.n_classes = None
        self.labeled_mask = None
        self.predictions = None

    @staticmethod
    @abstractmethod
    def _normalize(adj_matrix):
        """Return the normalized adjacency matrix used for propagation."""
        raise NotImplementedError("_normalize must be implemented")

    @abstractmethod
    def _propagate(self):
        """Perform one propagation step, updating ``self.predictions``."""
        raise NotImplementedError("_propagate must be implemented")

    def _one_hot_encode(self, labels):
        """One-hot encode ``labels`` and record which nodes are labeled.

        Sets ``n_classes``, ``one_hot_labels`` (all-zero rows for unlabeled
        nodes) and ``labeled_mask``.

        Raises
        ------
        ValueError
            If ``labels`` is not a 1-D tensor with one entry per node, if no
            node is labeled, or if the class ids are not exactly
            ``0 .. n_classes - 1``.
        """
        # Without this check a length mismatch surfaces as a cryptic scatter()
        # or mask-indexing error further down.
        if labels.dim() != 1 or labels.size(0) != self.n_nodes:
            raise ValueError(
                "labels must be a 1-D tensor with one entry per node: "
                f"expected {self.n_nodes} entries, got shape {tuple(labels.shape)}"
            )

        # Get the number of classes
        classes = torch.unique(labels)
        classes = classes[classes != -1]
        self.n_classes = classes.size(0)

        # Class ids are used directly as column indices, so they have to be
        # exactly 0 .. n_classes - 1.
        if self.n_classes == 0:
            if self.n_nodes > 0:
                raise ValueError(
                    "labels must contain at least one labeled node "
                    "(a value other than -1)"
                )
        elif int(classes.min()) < 0 or int(classes.max()) >= self.n_classes:
            raise ValueError(
                "class labels must be the consecutive integers "
                f"0..{self.n_classes - 1} (with -1 for unlabeled nodes), "
                f"got {classes.tolist()}"
            )

        # One-hot encode labeled data instances and zero rows corresponding to unlabeled instances
        unlabeled_mask = (labels == -1)
        labels = labels.clone()  # defensive copying
        # -1 is not a valid column index: point unlabeled rows at column 0 for
        # scatter(), then zero that entry again below.
        labels[unlabeled_mask] = 0
        self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
        self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
        self.one_hot_labels[unlabeled_mask, 0] = 0

        self.labeled_mask = ~unlabeled_mask

    def fit(self, labels, max_iter, tol):
        """Fits a semi-supervised learning label propagation model.

        Prints a message with the iteration count when the convergence
        criterion is met before ``max_iter`` iterations.

        labels: torch.LongTensor
            Tensor of size n_nodes indicating the class number of each node.
            Unlabeled nodes are denoted with -1.
        max_iter: int
            Maximum number of iterations allowed.
        tol: float
            Convergence tolerance: threshold to consider the system at steady state.
        """
        self._one_hot_encode(labels)

        self.predictions = self.one_hot_labels.clone()
        prev_predictions = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)

        for i in range(max_iter):
            # Stop iterations if the system is considered at a steady state
            variation = torch.abs(self.predictions - prev_predictions).sum().item()

            if variation < tol:
                print(f"The method stopped after {i} iterations, variation={variation:.4f}.")
                break

            # Snapshot rather than alias: a _propagate() that updates
            # self.predictions in place would otherwise make the variation 0.
            prev_predictions = self.predictions.clone()
            self._propagate()

    def predict(self):
        """Return the per-node class score matrix computed by fit()."""
        return self.predictions

    def predict_classes(self):
        """Return, for each node, the index of the class with the highest score."""
        return self.predictions.max(dim=1).indices
class LabelPropagation(BaseLabelPropagation):
    """Label propagation on the row-normalized adjacency matrix.

    After every propagation step the rows of the labeled nodes are reset to
    their one-hot labels.
    """

    def __init__(self, adj_matrix):
        super().__init__(adj_matrix)

    @staticmethod
    def _normalize(adj_matrix):
        """Computes D^-1 * W"""
        row_sums = adj_matrix.sum(dim=1)
        # Rows that sum to zero are divided by 1 instead, which avoids 0/0.
        divisors = torch.where(row_sums == 0, torch.ones_like(row_sums), row_sums)
        return adj_matrix / divisors.unsqueeze(1)

    def _propagate(self):
        """Run one propagation step, then restore the known labels."""
        known = self.labeled_mask
        self.predictions = self.norm_adj_matrix @ self.predictions
        # Put back already known labels
        self.predictions[known] = self.one_hot_labels[known]

    def fit(self, labels, max_iter=1000, tol=1e-3):
        """Fit the model; same as the base ``fit`` but with default limits."""
        super().fit(labels, max_iter=max_iter, tol=tol)
我正在尝试将边标签(edge label)转换为节点标签,以便预测未标记的节点。目前数据集只有 edge_labels,但我需要让每个节点 (ID) 恰好有一个 node_label:
我使用的代码如下:
import networkx as nx
import pandas as pd
# Edge list stored column-wise: each of the 8 rows is one ID -> Target edge
# with a 'Weight' and a 0/1 'Label'.
data = {'ID': {0: 1, 1: 2, 2: 4, 3: 4, 4: 12, 5: 12, 6: 13, 7: 17},
'Target': {0: 12, 1: 24, 2: 13, 3: 12, 4: 1, 5: 4, 6: 4, 7: 1},
'Weight': {0: 0.4, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.1, 5: 0.4, 6: 0.2, 7: 0.1},
'Label': {0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0}}
df = pd.DataFrame.from_dict(data)
# Build the graph from the edge list; 'Weight' and 'Label' are attached to each edge.
G = nx.from_pandas_edgelist(df, source='ID', target='Target', edge_attr=['Weight', 'Label'])
# Per-edge drawing parameters, collected in G.edges() order.
width = [d['Weight'] for (u, v, d) in G.edges(data=True)]
edge_color = [d['Label'] for (u, v, d) in G.edges(data=True)]
nx.draw_networkx(G, width=width, edge_color=edge_color)
这应该返回唯一的 node_labels:
# Keep the first row for each distinct 'ID' and reduce it to the (ID, Label) columns.
# NOTE(review): only the 'ID' column is de-duplicated, so a node that occurs
# solely in 'Target' gets no row here; the snippets shown below do not read
# df_to_use either.
df_to_use=df.drop_duplicates(['ID'])
df_to_use=df_to_use[['ID','Label']]
# Dense adjacency matrix of G.
# NOTE(review): adjacency_matrix reads the edge attribute named 'weight' by
# default, while these edges store 'Weight' -- presumably the matrix is
# therefore unweighted; confirm whether weight='Weight' was intended.
adj_matrix = nx.adjacency_matrix(G).toarray()
构建邻接矩阵:
# Convert the adjacency matrix to a float tensor for LabelPropagation.
adj_matrix_t = torch.FloatTensor(adj_matrix)
# NOTE(review): df['Label'] holds one value per edge-list row (8 rows), whereas
# the traceback below reports a 7-row one-hot matrix (one row per node); this
# length mismatch is what fit() fails on.
labels_t = torch.LongTensor(df['Label'].tolist())
# Notebook-style display of the tensor shape (no effect when run as a script).
adj_matrix_t.shape
使用标签传播
# Fit the model on the adjacency tensor and read back one class index per node.
label_propagation = LabelPropagation(adj_matrix_t)
# end="" keeps the cursor on this line so the message printed by fit() follows it.
print("Label Propagation: ", end="")
# This is the call that raises the RuntimeError shown in the traceback below.
label_propagation.fit(labels_t)
label_propagation_output_labels = label_propagation.predict_classes()
最后一步出现以下错误:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-81-cf4f88a4bb12> in <module>
2 label_propagation = LabelPropagation(adj_matrix_t)
3 print("Label Propagation: ", end="")
----> 4 label_propagation.fit(labels_t)
5 label_propagation_output_labels = label_propagation.predict_classes()
6
<ipython-input-1-54a7dbc30bd1> in fit(self, labels, max_iter, tol)
100
101 def fit(self, labels, max_iter=1000, tol=1e-3):
--> 102 super().fit(labels, max_iter, tol)
103
104 ## Label spreading
<ipython-input-1-54a7dbc30bd1> in fit(self, labels, max_iter, tol)
58 Convergence tolerance: threshold to consider the system at steady state.
59 """
---> 60 self._one_hot_encode(labels)
61
62 self.predictions = self.one_hot_labels.clone()
<ipython-input-1-54a7dbc30bd1> in _one_hot_encode(self, labels)
42 labels[unlabeled_mask] = 0
43 self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
---> 44 self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
45 self.one_hot_labels[unlabeled_mask, 0] = 0
46
RuntimeError: Expected index [8, 1] to be smaller than self [7, 2] apart from dimension 1 and to be smaller size than src [7, 2]
你知道我该如何解决吗?
您有一些节点仅出现在 Target 列中,因此在查找所有唯一节点时需要把该列也考虑进去。
我的做法是:把 ID 和 Target 两列(各自连同 Label)拼接起来,
按节点 ID 分组并对 Label 求和,
然后将总和 > 0 的标签替换为 1(否则为 0):
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Edge list stored column-wise: one row per ID -> Target edge with a weight
# and a 0/1 label.
data = {
    'ID': {0: 1, 1: 2, 2: 4, 3: 4, 4: 12, 5: 12, 6: 13, 7: 17},
    'Target': {0: 12, 1: 24, 2: 13, 3: 12, 4: 1, 5: 4, 6: 4, 7: 1},
    'Weight': {0: 0.4, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.1, 5: 0.4, 6: 0.2, 7: 0.1},
    'Label': {0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0},
}
df = pd.DataFrame.from_dict(data)
G = nx.from_pandas_edgelist(
    df,
    source='ID',
    target='Target',
    edge_attr=['Weight', 'Label'],
)

# Drawing parameters in G.edges() order; weights are multiplied by 10 to give
# visible line widths.
width = [10 * attrs['Weight'] for _, _, attrs in G.edges(data=True)]
edge_color = [attrs['Label'] for _, _, attrs in G.edges(data=True)]

# Each edge hands its label to both endpoints: stack the (ID, Label) and
# (Target, Label) pairs under a common 'node' column.
df1 = df.loc[:, ['ID', 'Label']].rename(columns={'ID': 'node'})
df2 = df.loc[:, ['Target', 'Label']].rename(columns={'Target': 'node'})
stacked = pd.concat([df1, df2])

# Sum the labels per node, then collapse the sum to 1 (any positive label)
# or 0 (none).
df_to_use = stacked.groupby('node').sum().reset_index()
df_to_use['Label'] = (df_to_use['Label'] > 0).astype('int64')
print(df_to_use)
这给出了
node Label
0 1 1
1 2 0
2 4 1
3 12 1
4 13 1
5 17 0
6 24 0
我忍不住亲自试了一下这个方案,看看效果如何:
# Align the per-node labels with G.nodes(), the same node order that is used
# for the adjacency matrix rows below. A single dict lookup per node replaces
# filtering the whole frame once per node.
label_by_node = dict(zip(df_to_use['node'], df_to_use['Label']))
node_order = list(G.nodes())
node_labels = np.array([label_by_node[node] for node in node_order])

# Hide the label of one randomly chosen node; -1 marks it as unlabeled.
idx = np.random.choice(len(node_labels))
node_labels_missing = node_labels.copy()
node_labels_missing[idx] = -1

# Build the adjacency matrix here so this snippet does not depend on a
# variable left over from an earlier cell; nodelist pins the row order to
# the label order.
# NOTE(review): adjacency_matrix reads the 'weight' edge attribute by default
# and these edges store 'Weight', so the matrix is presumably unweighted --
# pass weight='Weight' if weighted propagation is intended.
adj_matrix = nx.adjacency_matrix(G, nodelist=node_order).toarray()
adj_matrix_t = torch.FloatTensor(adj_matrix)
labels_t = torch.LongTensor(node_labels_missing)

label_propagation = LabelPropagation(adj_matrix_t)
# end="" keeps the message printed by fit() on the same line.
print("Label Propagation: ", end="")
label_propagation.fit(labels_t)
label_propagation_output_labels = label_propagation.predict_classes()

# One shared layout so the three panels are directly comparable.
pos = nx.spring_layout(G)
fig = plt.figure(1, figsize=(15, 4))
plt.clf()
fig, ax = plt.subplots(1, 3, num=1)
ax[0].set_title("Actual Labels")
ax[1].set_title("One Label Removed")
ax[2].set_title("With Predicted Label")

# Nodes labeled 0 or 1 keep that value as their colour; the hidden node gets
# the intermediate value 0.5.
missing_colors = np.where(np.isin(node_labels_missing, (0, 1)), node_labels_missing, 0.5)

# draw_networkx returns None, so its result is not kept.
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=node_labels, ax=ax[0])
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=missing_colors, ax=ax[1])
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=label_propagation_output_labels, ax=ax[2])
这给出了
注意:对于没有上下文的读者,该用户正在尝试重新实现 this code。
下面的自定义 class 定义必须在运行上面的代码之前执行。
from abc import abstractmethod
import torch
class BaseLabelPropagation:
    """Base class for label propagation models.

    Subclasses provide the adjacency normalization (``_normalize``) and a
    single propagation step (``_propagate``); this class one-hot encodes the
    labels and runs the iteration until the predictions stop changing.

    Parameters
    ----------
    adj_matrix: torch.FloatTensor
        Adjacency matrix of the graph.
    """

    def __init__(self, adj_matrix):
        self.norm_adj_matrix = self._normalize(adj_matrix)
        self.n_nodes = adj_matrix.size(0)
        # The attributes below are filled in by fit().
        self.one_hot_labels = None
        self.n_classes = None
        self.labeled_mask = None
        self.predictions = None

    @staticmethod
    @abstractmethod
    def _normalize(adj_matrix):
        """Return the normalized adjacency matrix used for propagation."""
        raise NotImplementedError("_normalize must be implemented")

    @abstractmethod
    def _propagate(self):
        """Perform one propagation step, updating ``self.predictions``."""
        raise NotImplementedError("_propagate must be implemented")

    def _one_hot_encode(self, labels):
        """One-hot encode ``labels`` and record which nodes are labeled.

        Sets ``n_classes``, ``one_hot_labels`` (all-zero rows for unlabeled
        nodes) and ``labeled_mask``.

        Raises
        ------
        ValueError
            If ``labels`` is not a 1-D tensor with one entry per node, if no
            node is labeled, or if the class ids are not exactly
            ``0 .. n_classes - 1``.
        """
        # Without this check a length mismatch surfaces as a cryptic scatter()
        # or mask-indexing error further down.
        if labels.dim() != 1 or labels.size(0) != self.n_nodes:
            raise ValueError(
                "labels must be a 1-D tensor with one entry per node: "
                f"expected {self.n_nodes} entries, got shape {tuple(labels.shape)}"
            )

        # Get the number of classes
        classes = torch.unique(labels)
        classes = classes[classes != -1]
        self.n_classes = classes.size(0)

        # Class ids are used directly as column indices, so they have to be
        # exactly 0 .. n_classes - 1.
        if self.n_classes == 0:
            if self.n_nodes > 0:
                raise ValueError(
                    "labels must contain at least one labeled node "
                    "(a value other than -1)"
                )
        elif int(classes.min()) < 0 or int(classes.max()) >= self.n_classes:
            raise ValueError(
                "class labels must be the consecutive integers "
                f"0..{self.n_classes - 1} (with -1 for unlabeled nodes), "
                f"got {classes.tolist()}"
            )

        # One-hot encode labeled data instances and zero rows corresponding to unlabeled instances
        unlabeled_mask = (labels == -1)
        labels = labels.clone()  # defensive copying
        # -1 is not a valid column index: point unlabeled rows at column 0 for
        # scatter(), then zero that entry again below.
        labels[unlabeled_mask] = 0
        self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
        self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
        self.one_hot_labels[unlabeled_mask, 0] = 0

        self.labeled_mask = ~unlabeled_mask

    def fit(self, labels, max_iter, tol):
        """Fits a semi-supervised learning label propagation model.

        Prints a message with the iteration count when the convergence
        criterion is met before ``max_iter`` iterations.

        labels: torch.LongTensor
            Tensor of size n_nodes indicating the class number of each node.
            Unlabeled nodes are denoted with -1.
        max_iter: int
            Maximum number of iterations allowed.
        tol: float
            Convergence tolerance: threshold to consider the system at steady state.
        """
        self._one_hot_encode(labels)

        self.predictions = self.one_hot_labels.clone()
        prev_predictions = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)

        for i in range(max_iter):
            # Stop iterations if the system is considered at a steady state
            variation = torch.abs(self.predictions - prev_predictions).sum().item()

            if variation < tol:
                print(f"The method stopped after {i} iterations, variation={variation:.4f}.")
                break

            # Snapshot rather than alias: a _propagate() that updates
            # self.predictions in place would otherwise make the variation 0.
            prev_predictions = self.predictions.clone()
            self._propagate()

    def predict(self):
        """Return the per-node class score matrix computed by fit()."""
        return self.predictions

    def predict_classes(self):
        """Return, for each node, the index of the class with the highest score."""
        return self.predictions.max(dim=1).indices
class LabelPropagation(BaseLabelPropagation):
    """Label propagation on the row-normalized adjacency matrix.

    After every propagation step the rows of the labeled nodes are reset to
    their one-hot labels.
    """

    def __init__(self, adj_matrix):
        super().__init__(adj_matrix)

    @staticmethod
    def _normalize(adj_matrix):
        """Computes D^-1 * W"""
        row_sums = adj_matrix.sum(dim=1)
        # Rows that sum to zero are divided by 1 instead, which avoids 0/0.
        divisors = torch.where(row_sums == 0, torch.ones_like(row_sums), row_sums)
        return adj_matrix / divisors.unsqueeze(1)

    def _propagate(self):
        """Run one propagation step, then restore the known labels."""
        known = self.labeled_mask
        self.predictions = self.norm_adj_matrix @ self.predictions
        # Put back already known labels
        self.predictions[known] = self.one_hot_labels[known]

    def fit(self, labels, max_iter=1000, tol=1e-3):
        """Fit the model; same as the base ``fit`` but with default limits."""
        super().fit(labels, max_iter=max_iter, tol=tol)