使用标签传播时出现 "Expected index" 错误

Question

我正在尝试把边标签（edge label）转换为节点标签，以便预测未标记的节点。目前数据集中只有 edge_labels，但我需要让每个节点 (ID) 恰好有一个 node_label：

我使用的代码如下：

import networkx as nx
import pandas as pd

# Edge list: each index 0..7 describes one edge (ID -> Target) together with
# its weight and its binary edge label.
data = {
    'ID': {0: 1, 1: 2, 2: 4, 3: 4, 4: 12, 5: 12, 6: 13, 7: 17},
    'Target': {0: 12, 1: 24, 2: 13, 3: 12, 4: 1, 5: 4, 6: 4, 7: 1},
    'Weight': {0: 0.4, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.1, 5: 0.4, 6: 0.2, 7: 0.1},
    'Label': {0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0},
}
df = pd.DataFrame(data)

# Build the graph from the edge list, keeping Weight and Label as edge
# attributes.
G = nx.from_pandas_edgelist(
    df,
    source='ID',
    target='Target',
    edge_attr=['Weight', 'Label'],
)

# Collect the attribute dict of every edge once, then derive the drawing
# parameters (line width from Weight, line colour from Label).
edge_attrs = [attrs for _, _, attrs in G.edges(data=True)]
width = [attrs['Weight'] for attrs in edge_attrs]
edge_color = [attrs['Label'] for attrs in edge_attrs]
nx.draw_networkx(G, width=width, edge_color=edge_color)

下面这段代码应该为每个节点返回唯一的 node_label：

# Keep one row per distinct 'ID' and only the ID/Label columns.
# NOTE(review): only the 'ID' column is de-duplicated, so a node that occurs
# solely in the 'Target' column gets no row in df_to_use.
df_to_use = df.drop_duplicates(subset=['ID'])[['ID', 'Label']]

# Dense adjacency matrix of G as a numpy array.
adj_matrix = nx.adjacency_matrix(G).toarray()

构建邻接矩阵

# Convert the adjacency matrix and the labels to torch tensors
# (float32 for the matrix, int64 for the labels).
# NOTE(review): the labels are taken from df['Label'], i.e. one entry per row
# (edge) of df, not one entry per node of G.
adj_matrix_t = torch.from_numpy(adj_matrix).float()
labels_t = torch.tensor(df['Label'].tolist(), dtype=torch.long)
adj_matrix_t.shape

使用标签传播

# Build the propagation model from the adjacency tensor.
label_propagation = LabelPropagation(adj_matrix_t)
# end="" keeps the cursor on this line so the message printed by fit()
# follows the prefix directly.
print("Label Propagation: ", end="")
# Run the iterations; labels_t is expected to hold one class id per node,
# with -1 marking unlabeled nodes.
label_propagation.fit(labels_t)
# Index of the highest-scoring class for every node.
label_propagation_output_labels = label_propagation.predict_classes()

最后一步出现以下错误：

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-81-cf4f88a4bb12> in <module>
      2 label_propagation = LabelPropagation(adj_matrix_t)
      3 print("Label Propagation: ", end="")
----> 4 label_propagation.fit(labels_t)
      5 label_propagation_output_labels = label_propagation.predict_classes()
      6 

<ipython-input-1-54a7dbc30bd1> in fit(self, labels, max_iter, tol)
    100 
    101     def fit(self, labels, max_iter=1000, tol=1e-3):
--> 102         super().fit(labels, max_iter, tol)
    103 
    104 ## Label spreading

<ipython-input-1-54a7dbc30bd1> in fit(self, labels, max_iter, tol)
     58             Convergence tolerance: threshold to consider the system at steady state.
     59         """
---> 60         self._one_hot_encode(labels)
     61 
     62         self.predictions = self.one_hot_labels.clone()

<ipython-input-1-54a7dbc30bd1> in _one_hot_encode(self, labels)
     42         labels[unlabeled_mask] = 0
     43         self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
---> 44         self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
     45         self.one_hot_labels[unlabeled_mask, 0] = 0
     46 

RuntimeError: Expected index [8, 1] to be smaller than self [7, 2] apart from dimension 1 and to be smaller size than src [7, 2]

你知道我该如何解决吗？

Answer 1

您有一些节点只出现在 Target 列中（例如节点 24），因此在查找所有唯一节点时需要把该列也包括进来。我的做法是：把这两列（连同 Label）拼接起来，按节点 ID 分组并对 Label 求和，然后把大于 0 的和替换为 1：

import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Edge list: each index 0..7 describes one edge (ID -> Target) together with
# its weight and its binary edge label.
data = {
    'ID': {0: 1, 1: 2, 2: 4, 3: 4, 4: 12, 5: 12, 6: 13, 7: 17},
    'Target': {0: 12, 1: 24, 2: 13, 3: 12, 4: 1, 5: 4, 6: 4, 7: 1},
    'Weight': {0: 0.4, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.1, 5: 0.4, 6: 0.2, 7: 0.1},
    'Label': {0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0},
}
df = pd.DataFrame(data)

# Build the graph from the edge list, keeping Weight and Label as edge
# attributes.
G = nx.from_pandas_edgelist(
    df,
    source='ID',
    target='Target',
    edge_attr=['Weight', 'Label'],
)

# Collect the attribute dict of every edge once. Weights are scaled by 10 so
# the differences in line width are visible when drawing.
edge_attrs = [attrs for _, _, attrs in G.edges(data=True)]
width = [10 * attrs['Weight'] for attrs in edge_attrs]
edge_color = [attrs['Label'] for attrs in edge_attrs]

# View every edge from both of its endpoints: once keyed by the source column
# and once keyed by the target column, both renamed to 'node'.
df1 = df[['ID', 'Label']].rename(columns={'ID': 'node'})
df2 = df[['Target', 'Label']].rename(columns={'Target': 'node'})

# Sum the edge labels touching each node ...
both_endpoints = pd.concat([df1, df2])
df_to_use = both_endpoints.groupby('node').sum().reset_index()
# ... and collapse the sum to a binary node label: 1 if any incident edge is
# labeled 1, otherwise 0.
df_to_use['Label'] = (df_to_use['Label'] > 0).astype('int64')

print(df_to_use)

这给出了

   node  Label
0     1      1
1     2      0
2     4      1
3    12      1
4    13      1
5    17      0
6    24      0

我忍不住亲自试了一下这个方案，看看效果如何：

# Look up each node's label in the order networkx iterates the nodes, which is
# also the default row order of nx.adjacency_matrix(G). A dict lookup replaces
# a full DataFrame scan per node and raises KeyError if a node has no label.
label_by_node = dict(zip(df_to_use['node'], df_to_use['Label']))
node_labels = np.array([label_by_node[node] for node in G.nodes()])

# Hide one randomly chosen label (-1 marks "unlabeled") to check whether
# propagation can recover it.
idx = np.random.randint(len(node_labels))
node_labels_missing = node_labels.copy()
node_labels_missing[idx] = -1

# Build the adjacency matrix from this G so the snippet does not depend on a
# matrix computed in an earlier cell.
adj_matrix = nx.adjacency_matrix(G).toarray()
adj_matrix_t = torch.FloatTensor(adj_matrix)
labels_t = torch.LongTensor(node_labels_missing)

label_propagation = LabelPropagation(adj_matrix_t)
# end="" so the message printed by fit() continues on the same line.
print("Label Propagation: ", end="")
label_propagation.fit(labels_t)
label_propagation_output_labels = label_propagation.predict_classes()

# One shared layout so the three panels are directly comparable.
pos = nx.spring_layout(G)

fig = plt.figure(1, figsize=(15, 4))
plt.clf()
fig, ax = plt.subplots(1, 3, num=1)

ax[0].set_title("Actual Labels")
ax[1].set_title("One Label Removed")
ax[2].set_title("With Predicted Label")

# Node colours as plain numeric arrays: known labels keep their value (0 or 1),
# anything else (the hidden node) is drawn with the intermediate value 0.5.
missing_colors = np.where(np.isin(node_labels_missing, (0, 1)), node_labels_missing, 0.5)
predicted_colors = label_propagation_output_labels.numpy()

# draw_networkx draws onto the given axes and has no useful return value, so
# nothing is assigned here.
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=node_labels, ax=ax[0])
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=missing_colors, ax=ax[1])
nx.draw_networkx(G, pos, width=width, edge_color=edge_color, node_color=predicted_colors, ax=ax[2])

这给出了

注意：为方便不了解上下文的读者，说明一下：提问者正在尝试重新实现 this code。下面的自定义 class 定义必须在上面的代码运行之前执行。

from abc import abstractmethod
import torch

class BaseLabelPropagation:
    """Base class for label propagation models.

    Subclasses supply ``_normalize`` (how the adjacency matrix is normalized)
    and ``_propagate`` (one update step of ``self.predictions``).

    Parameters
    ----------
    adj_matrix: torch.FloatTensor
        Adjacency matrix of the graph.
    """
    def __init__(self, adj_matrix):
        self.norm_adj_matrix = self._normalize(adj_matrix)
        self.n_nodes = adj_matrix.size(0)
        # The attributes below are filled in by fit() / _one_hot_encode().
        self.one_hot_labels = None
        self.n_classes = None
        self.labeled_mask = None
        self.predictions = None

    @staticmethod
    @abstractmethod
    def _normalize(adj_matrix):
        """Return the normalized adjacency matrix used for propagation."""
        raise NotImplementedError("_normalize must be implemented")

    @abstractmethod
    def _propagate(self):
        """Perform one propagation step, updating ``self.predictions``."""
        raise NotImplementedError("_propagate must be implemented")

    def _one_hot_encode(self, labels):
        """One-hot encode ``labels`` and record which nodes are labeled.

        Sets ``n_classes``, ``one_hot_labels`` (an ``n_nodes x n_classes``
        float tensor whose rows are all zero for unlabeled nodes) and
        ``labeled_mask``. The input tensor is not modified.

        Parameters
        ----------
        labels: torch.LongTensor
            Tensor of size n_nodes with the class number of each node;
            unlabeled nodes are denoted with -1.

        Raises
        ------
        ValueError
            If ``labels`` does not have exactly one entry per node, if no node
            is labeled, or if the class numbers are not within
            ``0 .. n_classes - 1``.
        """
        # Without this check a wrong-sized tensor only fails later inside
        # scatter() with a hard-to-read index/shape error.
        if labels.dim() != 1 or labels.size(0) != self.n_nodes:
            raise ValueError(
                f"labels must be a 1-D tensor with one entry per node "
                f"(n_nodes={self.n_nodes}), got shape {tuple(labels.shape)}"
            )

        # Get the number of classes
        classes = torch.unique(labels)
        classes = classes[classes != -1]
        self.n_classes = classes.size(0)

        if self.n_classes == 0:
            raise ValueError("labels must contain at least one labeled node (value other than -1)")

        # Class numbers are used directly as column indices of the one-hot
        # matrix, so they have to lie in [0, n_classes).
        lowest, highest = classes.min().item(), classes.max().item()
        if lowest < 0 or highest >= self.n_classes:
            raise ValueError(
                f"class numbers must be within 0..{self.n_classes - 1} "
                f"(-1 for unlabeled nodes), got values from {lowest} to {highest}"
            )

        # One-hot encode labeled data instances and zero rows corresponding to unlabeled instances
        unlabeled_mask = (labels == -1)
        labels = labels.clone()  # defensive copying
        # Temporarily point unlabeled nodes at column 0 so scatter() gets a
        # valid index; that column is zeroed again right after.
        labels[unlabeled_mask] = 0
        self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
        self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
        self.one_hot_labels[unlabeled_mask, 0] = 0

        self.labeled_mask = ~unlabeled_mask

    def fit(self, labels, max_iter, tol):
        """Fits a semi-supervised learning label propagation model.

        labels: torch.LongTensor
            Tensor of size n_nodes indicating the class number of each node.
            Unlabeled nodes are denoted with -1.
        max_iter: int
            Maximum number of iterations allowed.
        tol: float
            Convergence tolerance: threshold to consider the system at steady state.
        """
        self._one_hot_encode(labels)

        self.predictions = self.one_hot_labels.clone()
        prev_predictions = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)

        for i in range(max_iter):
            # Stop iterations if the system is considered at a steady state
            variation = torch.abs(self.predictions - prev_predictions).sum().item()

            if variation < tol:
                print(f"The method stopped after {i} iterations, variation={variation:.4f}.")
                break

            # This keeps a reference, not a copy: _propagate() is expected to
            # rebind self.predictions to a new tensor rather than update the
            # current one in place, otherwise the variation would always be 0.
            prev_predictions = self.predictions
            self._propagate()

    def predict(self):
        """Return the current prediction matrix (one row per node, one column per class)."""
        return self.predictions

    def predict_classes(self):
        """Return, for every node, the index of the class with the highest score."""
        return self.predictions.max(dim=1).indices


class LabelPropagation(BaseLabelPropagation):
    """Label propagation using the row-normalized adjacency matrix D^-1 * W.

    After every propagation step the rows of already labeled nodes are reset
    to their one-hot labels.
    """

    def __init__(self, adj_matrix):
        super().__init__(adj_matrix)

    @staticmethod
    def _normalize(adj_matrix):
        """Computes D^-1 * W"""
        row_sums = adj_matrix.sum(dim=1)
        # Rows that sum to 0 are divided by 1 instead, avoiding division by 0.
        divisors = torch.where(row_sums == 0, torch.ones_like(row_sums), row_sums)
        return adj_matrix / divisors.unsqueeze(1)

    def _propagate(self):
        """Run one propagation step and restore the known labels."""
        known = self.labeled_mask
        spread = self.norm_adj_matrix @ self.predictions
        # Put back already known labels
        spread[known] = self.one_hot_labels[known]
        self.predictions = spread

    def fit(self, labels, max_iter=1000, tol=1e-3):
        """Fit the model; same as the base ``fit`` but with default ``max_iter`` and ``tol``."""
        super().fit(labels, max_iter=max_iter, tol=tol)

使用标签传播时出现 "Expected index" 错误

Expected index while trying to use Label propagation

python

networkx

pandas

torch