使用 graph-tool 读取 Pajek .net 文件
Read Pajek .net files using Graph-tool
我有一个Pajek network file(带加权边的无向网络),这里提供了一个例子:
*Vertices 5
1 apple
2 cat
3 tree
4 nature
5 fire
*Edges
1 3 14
2 4 1
节点标签没有加引号。每条边按“节点 1、节点 2、边权重”的格式给出。
我需要在 graph-tool 中把这个文件读取为无向图,并带有节点标签和名为“weight”的边属性。该函数还应保留孤立节点。
在 Python 中有没有有效的方法来做到这一点?
到目前为止,我一直先用 NetworkX 读取 .net 文件,然后再用类似这样的转换函数把图转换过来。我正在寻找一种方法来加快这个过程。
这是我今天开发的解决方案:
import graph_tool.all as gt
import graph_tool.stats as gts
def pajTOgt(filepath, directed=False, removeloops=True):
    """Load a Pajek .net file (pair-per-line edge format) into a graph-tool graph.

    The file must start with a ``*Vertices N`` header, optionally followed
    by ``<id> <label>`` lines and then one or more ``*Edges`` / ``*Arcs``
    sections holding ``<v1> <v2> [<weight>]`` lines. Vertex ids in the file
    are 1-based and are shifted to graph-tool's 0-based indices. All ``N``
    vertices are created up front, so isolated vertices are preserved.

    Args:
        filepath: Path to the UTF-8 encoded Pajek file.
        directed: Truthy to build a directed graph, falsy for undirected.
        removeloops: If truthy, self-loops are removed before returning.

    Returns:
        The graph, with a "weight" edge property (double) and an "id"
        vertex property (string) holding the vertex labels.

    Raises:
        ValueError: If the ``*Vertices`` header is missing or malformed, if
            an edge line has fewer than two fields, if a numeric field
            cannot be parsed, or if the file uses the neighbour-list
            (``*Arcslist`` / ``*Edgeslist``) format.
    """
    list_sections = ("*arcslist", "*edgeslist")

    g = gt.Graph(directed=bool(directed))
    # define edge and vertex properties
    weight = g.new_edge_property("double")
    label = g.new_vertex_property("string")
    g.edge_properties["weight"] = weight
    g.vertex_properties["id"] = label

    with open(filepath, encoding="utf-8") as input_data:
        # create vertices: the first meaningful line is the "*Vertices N" header
        header = []
        for line in input_data:
            header = line.split()
            if header and not header[0].startswith("%"):
                break
        if len(header) < 2 or header[0].lower() != "*vertices":
            raise ValueError(
                "expected a '*Vertices N' header, got: %r" % " ".join(header)
            )
        g.add_vertex(int(header[1]))

        # label vertices until the next section header ("*Edges", "*Arcs", ...)
        for line in input_data:
            fields = line.split()
            if not fields or fields[0].startswith("%"):
                continue  # blank line or comment
            if fields[0].startswith("*"):
                if fields[0].lower() in list_sections:
                    raise ValueError("neighbour-list sections are not supported")
                break
            # Keep the spaces inside multi-word labels instead of gluing
            # the words together.
            label[g.vertex(int(fields[0]) - 1)] = " ".join(fields[1:])

        # create weighted edges
        for line in input_data:
            fields = line.split()
            if not fields or fields[0].startswith("%"):
                continue  # blank line or comment
            if fields[0].startswith("*"):
                # A further section header (e.g. "*Edges" after "*Arcs").
                if fields[0].lower() in list_sections:
                    raise ValueError("neighbour-list sections are not supported")
                continue
            if len(fields) < 2:
                raise ValueError("malformed edge line: %r" % line)
            # An edge line without a weight column gets weight 1.
            value = float(fields[2]) if len(fields) > 2 else 1.0
            # As before, edges with a non-positive weight are dropped.
            if value > 0:
                source = g.vertex(int(fields[0]) - 1)
                target = g.vertex(int(fields[1]) - 1)
                weight[g.add_edge(source, target)] = value

    if removeloops:
        gts.remove_self_loops(g)
    return g
不过,如果您发现更有效的东西,我很想知道。
Pajek 文件的每个部分(Vertices/Edges)似乎都可以看作以空格分隔的 CSV 文件,这意味着您可以使用 pandas.read_csv()
对其进行解析。该函数比您在纯 Python 答案中采用的逐行解析更快。
此外,一次性初始化边列表和属性列表(作为 numpy 数组)比在 Python 循环中逐个设置每个元素更快。
我认为以下实现应该在某种程度上接近最佳,但我还没有对其进行基准测试。
import re
from io import StringIO
import numpy as np
import pandas as pd
import graph_tool as gt
def pajek_to_gt(path, directed=False, remove_loops=False):
    """
    Load a Pajek .NET file[1] as a graph_tool.Graph.

    Supports files which specify their edges via node pairs
    ('*Edges' / '*Arcs' sections, optionally with a weight column).
    Does not support files which specify their edges via the
    'edgeslist' / 'arcslist' scheme (i.e. the neighbors-list scheme).

    Section keywords are matched case-insensitively. Comment lines
    (starting with '%'), blank lines, and leading/trailing/repeated
    whitespace are tolerated. If the file has several '*Edges'/'*Arcs'
    sections, their rows are concatenated. Sections with any other
    keyword are ignored.

    Note:
        Vertices are renumbered to start with 0, per graph-tool
        conventions (not Pajek conventions, which start with 1).

    Args:
        path: Path of the Pajek file.
        directed: Passed to graph_tool.Graph(directed=...).
        remove_loops: If true, self-loops are removed before returning.

    Returns:
        The graph. It has a "weight" edge property (double) when the
        edge rows have a third column, and a "label" vertex property
        (string) when the vertex section lists the vertices.

    Raises:
        RuntimeError: If the neighbor-list format is used, if no usable
            '*Vertices N' header is found, or if the edge rows have
            neither 2 nor 3 columns.
        AssertionError: If the vertex section does not list exactly the
            ids 1..N in order.

    Author: Stuart Berg (github.com/stuarteberg)
    License: MIT
    [1]: https://gephi.org/users/supported-graph-formats/pajek-net-format/
    """
    # Load into RAM
    with open(path, 'r') as f:
        full_text = f.read()

    if re.search(r'\*(edges|arcs)list', full_text, flags=re.IGNORECASE):
        raise RuntimeError("Neighbor list format not supported.")

    # Erase comment lines
    full_text = re.sub(r'^\s*%.*$', '', full_text, flags=re.MULTILINE)
    # Ensure delimiter is a single space
    full_text = re.sub(r'[ \t]+', ' ', full_text)
    # Drop the space left at either end of a line; it would otherwise
    # show up as an extra empty CSV column.
    full_text = re.sub(r'^ | $', '', full_text, flags=re.MULTILINE)
    # Erase blank lines (including those created by erasing comments),
    # and make sure the text ends with exactly one newline.
    full_text = re.sub(r'\n+', '\n', full_text).strip('\n') + '\n'

    # Split into sections. With a capturing group, re.split() yields
    # [preamble, header1, body1, header2, body2, ...].
    parts = re.split(r'^(\*[^\n]*)\n', full_text, flags=re.MULTILINE)

    num_vertices = None
    vertex_text = ''
    edge_chunks = []
    for section_header, section_body in zip(parts[1::2], parts[2::2]):
        tokens = section_header.split()
        keyword = tokens[0].lower()
        if keyword == '*vertices':
            if len(tokens) < 2:
                raise RuntimeError("'*Vertices' header does not give a vertex count.")
            try:
                num_vertices = int(tokens[1])
            except ValueError as err:
                raise RuntimeError("'*Vertices' header does not give a vertex count.") from err
            # (Vertex section might be empty)
            vertex_text = section_body
        elif keyword in ('*edges', '*arcs'):
            edge_chunks.append(section_body)
    if num_vertices is None:
        raise RuntimeError("No '*Vertices' section found.")
    edges_text = ''.join(edge_chunks)

    # Parse vertices (if present)
    v_df = None
    if vertex_text:
        # dtype=str keeps numeric-looking labels (e.g. "007") as text.
        v_df = pd.read_csv(StringIO(vertex_text), delimiter=' ', engine='c',
                           names=['id', 'label'], header=None,
                           dtype={'label': str})
        ids = v_df['id'].to_numpy()
        # Raised explicitly (rather than via `assert`) so the check also
        # runs under `python -O`; the length test comes first because
        # comparing arrays of different lengths does not give a clean
        # element-wise result.
        if len(ids) != num_vertices or (ids != np.arange(1, 1 + num_vertices)).any():
            raise AssertionError(
                "File does not list all vertices, or lists them out of order.")

    # Parse edges (a file may legitimately contain none)
    e_df = None
    if edges_text:
        e_df = pd.read_csv(StringIO(edges_text), delimiter=' ', engine='c', header=None)
        if len(e_df.columns) == 2:
            e_df.columns = ['v1', 'v2']
        elif len(e_df.columns) == 3:
            e_df.columns = ['v1', 'v2', 'weight']
        else:
            raise RuntimeError("Can't understand edge list")
        e_df[['v1', 'v2']] -= 1

    # Free up some RAM
    del full_text, parts, vertex_text, edge_chunks, edges_text

    # Create graph
    g = gt.Graph(directed=directed)
    g.add_vertex(num_vertices)
    if e_df is not None:
        g.add_edge_list(e_df[['v1', 'v2']].values)

    # Add properties
    if e_df is not None and 'weight' in e_df.columns:
        g.edge_properties["weight"] = g.new_edge_property("double", e_df['weight'].values)
    if v_df is not None:
        g.vertex_properties["label"] = g.new_vertex_property("string", v_df['label'].values)

    if remove_loops:
        # Imported explicitly so this does not rely on the `stats`
        # submodule already being loaded as an attribute of the package.
        from graph_tool.stats import remove_self_loops
        remove_self_loops(g)
    return g
以下是该函数对您的示例文件返回的内容:
In [1]: from pajek_to_gt import pajek_to_gt
In [2]: g = pajek_to_gt('pajek-example.NET')
In [3]: g.get_vertices()
Out[3]: array([0, 1, 2, 3, 4])
In [4]: g.vertex_properties['label'].get_2d_array([0])
Out[4]: array([['apple', 'cat', 'tree', 'nature', 'fire']], dtype='<U6')
In [5]: g.get_edges()
Out[5]:
array([[0, 2],
[1, 3]])
In [6]: g.edge_properties['weight'].get_array()
Out[6]: PropertyArray([14., 1.])
注意:此函数会做一些预处理,把连续的两个空格转换成单个空格,因为上面的示例在各字段之间使用了两个空格。那是故意的吗?您链接到的 Pajek 文件规范使用的是单个空格。
编辑:
重新阅读您链接到的 Pajek 文件规范后,我注意到边部分有两种可能的格式。第二种格式用可变长度的列表列出每个节点的邻居:
*edgeslist
4941 386 395 451
1 3553 3586 3587 3637
2 3583
3 4930
4 88
5 13 120
显然,我上面的实现与该格式不兼容。我已经修改了该函数:如果文件使用了该格式,就会引发异常。
我有一个Pajek network file(带加权边的无向网络),这里提供了一个例子:
*Vertices 5
1 apple
2 cat
3 tree
4 nature
5 fire
*Edges
1 3 14
2 4 1
节点标签没有加引号。每条边按“节点 1、节点 2、边权重”的格式给出。
我需要在 graph-tool 中把这个文件读取为无向图,并带有节点标签和名为“weight”的边属性。该函数还应保留孤立节点。
在 Python 中有没有高效的方法来做到这一点? 到目前为止,我一直先用 NetworkX 读取 .net 文件,然后再用类似这样的转换函数把图转换过来。我正在寻找一种方法来加快这个过程。
这是我今天开发的解决方案:
import graph_tool.all as gt
import graph_tool.stats as gts
def pajTOgt(filepath, directed=False, removeloops=True):
    """Load a Pajek .net file (pair-per-line edge format) into a graph-tool graph.

    The file must start with a ``*Vertices N`` header, optionally followed
    by ``<id> <label>`` lines and then one or more ``*Edges`` / ``*Arcs``
    sections holding ``<v1> <v2> [<weight>]`` lines. Vertex ids in the file
    are 1-based and are shifted to graph-tool's 0-based indices. All ``N``
    vertices are created up front, so isolated vertices are preserved.

    Args:
        filepath: Path to the UTF-8 encoded Pajek file.
        directed: Truthy to build a directed graph, falsy for undirected.
        removeloops: If truthy, self-loops are removed before returning.

    Returns:
        The graph, with a "weight" edge property (double) and an "id"
        vertex property (string) holding the vertex labels.

    Raises:
        ValueError: If the ``*Vertices`` header is missing or malformed, if
            an edge line has fewer than two fields, if a numeric field
            cannot be parsed, or if the file uses the neighbour-list
            (``*Arcslist`` / ``*Edgeslist``) format.
    """
    list_sections = ("*arcslist", "*edgeslist")

    g = gt.Graph(directed=bool(directed))
    # define edge and vertex properties
    weight = g.new_edge_property("double")
    label = g.new_vertex_property("string")
    g.edge_properties["weight"] = weight
    g.vertex_properties["id"] = label

    with open(filepath, encoding="utf-8") as input_data:
        # create vertices: the first meaningful line is the "*Vertices N" header
        header = []
        for line in input_data:
            header = line.split()
            if header and not header[0].startswith("%"):
                break
        if len(header) < 2 or header[0].lower() != "*vertices":
            raise ValueError(
                "expected a '*Vertices N' header, got: %r" % " ".join(header)
            )
        g.add_vertex(int(header[1]))

        # label vertices until the next section header ("*Edges", "*Arcs", ...)
        for line in input_data:
            fields = line.split()
            if not fields or fields[0].startswith("%"):
                continue  # blank line or comment
            if fields[0].startswith("*"):
                if fields[0].lower() in list_sections:
                    raise ValueError("neighbour-list sections are not supported")
                break
            # Keep the spaces inside multi-word labels instead of gluing
            # the words together.
            label[g.vertex(int(fields[0]) - 1)] = " ".join(fields[1:])

        # create weighted edges
        for line in input_data:
            fields = line.split()
            if not fields or fields[0].startswith("%"):
                continue  # blank line or comment
            if fields[0].startswith("*"):
                # A further section header (e.g. "*Edges" after "*Arcs").
                if fields[0].lower() in list_sections:
                    raise ValueError("neighbour-list sections are not supported")
                continue
            if len(fields) < 2:
                raise ValueError("malformed edge line: %r" % line)
            # An edge line without a weight column gets weight 1.
            value = float(fields[2]) if len(fields) > 2 else 1.0
            # As before, edges with a non-positive weight are dropped.
            if value > 0:
                source = g.vertex(int(fields[0]) - 1)
                target = g.vertex(int(fields[1]) - 1)
                weight[g.add_edge(source, target)] = value

    if removeloops:
        gts.remove_self_loops(g)
    return g
不过,如果您发现更有效的东西,我很想知道。
Pajek 文件的每个部分(Vertices/Edges)似乎都可以看作以空格分隔的 CSV 文件,这意味着您可以使用 pandas.read_csv()
对其进行解析。该函数比您在纯 Python 答案中采用的逐行解析更快。
此外,一次性初始化边列表和属性列表(作为 numpy 数组)比在 Python 循环中逐个设置每个元素更快。
我认为以下实现应该在某种程度上接近最佳,但我还没有对其进行基准测试。
import re
from io import StringIO
import numpy as np
import pandas as pd
import graph_tool as gt
def pajek_to_gt(path, directed=False, remove_loops=False):
    """
    Load a Pajek .NET file[1] as a graph_tool.Graph.

    Supports files which specify their edges via node pairs
    ('*Edges' / '*Arcs' sections, optionally with a weight column).
    Does not support files which specify their edges via the
    'edgeslist' / 'arcslist' scheme (i.e. the neighbors-list scheme).

    Section keywords are matched case-insensitively. Comment lines
    (starting with '%'), blank lines, and leading/trailing/repeated
    whitespace are tolerated. If the file has several '*Edges'/'*Arcs'
    sections, their rows are concatenated. Sections with any other
    keyword are ignored.

    Note:
        Vertices are renumbered to start with 0, per graph-tool
        conventions (not Pajek conventions, which start with 1).

    Args:
        path: Path of the Pajek file.
        directed: Passed to graph_tool.Graph(directed=...).
        remove_loops: If true, self-loops are removed before returning.

    Returns:
        The graph. It has a "weight" edge property (double) when the
        edge rows have a third column, and a "label" vertex property
        (string) when the vertex section lists the vertices.

    Raises:
        RuntimeError: If the neighbor-list format is used, if no usable
            '*Vertices N' header is found, or if the edge rows have
            neither 2 nor 3 columns.
        AssertionError: If the vertex section does not list exactly the
            ids 1..N in order.

    Author: Stuart Berg (github.com/stuarteberg)
    License: MIT
    [1]: https://gephi.org/users/supported-graph-formats/pajek-net-format/
    """
    # Load into RAM
    with open(path, 'r') as f:
        full_text = f.read()

    if re.search(r'\*(edges|arcs)list', full_text, flags=re.IGNORECASE):
        raise RuntimeError("Neighbor list format not supported.")

    # Erase comment lines
    full_text = re.sub(r'^\s*%.*$', '', full_text, flags=re.MULTILINE)
    # Ensure delimiter is a single space
    full_text = re.sub(r'[ \t]+', ' ', full_text)
    # Drop the space left at either end of a line; it would otherwise
    # show up as an extra empty CSV column.
    full_text = re.sub(r'^ | $', '', full_text, flags=re.MULTILINE)
    # Erase blank lines (including those created by erasing comments),
    # and make sure the text ends with exactly one newline.
    full_text = re.sub(r'\n+', '\n', full_text).strip('\n') + '\n'

    # Split into sections. With a capturing group, re.split() yields
    # [preamble, header1, body1, header2, body2, ...].
    parts = re.split(r'^(\*[^\n]*)\n', full_text, flags=re.MULTILINE)

    num_vertices = None
    vertex_text = ''
    edge_chunks = []
    for section_header, section_body in zip(parts[1::2], parts[2::2]):
        tokens = section_header.split()
        keyword = tokens[0].lower()
        if keyword == '*vertices':
            if len(tokens) < 2:
                raise RuntimeError("'*Vertices' header does not give a vertex count.")
            try:
                num_vertices = int(tokens[1])
            except ValueError as err:
                raise RuntimeError("'*Vertices' header does not give a vertex count.") from err
            # (Vertex section might be empty)
            vertex_text = section_body
        elif keyword in ('*edges', '*arcs'):
            edge_chunks.append(section_body)
    if num_vertices is None:
        raise RuntimeError("No '*Vertices' section found.")
    edges_text = ''.join(edge_chunks)

    # Parse vertices (if present)
    v_df = None
    if vertex_text:
        # dtype=str keeps numeric-looking labels (e.g. "007") as text.
        v_df = pd.read_csv(StringIO(vertex_text), delimiter=' ', engine='c',
                           names=['id', 'label'], header=None,
                           dtype={'label': str})
        ids = v_df['id'].to_numpy()
        # Raised explicitly (rather than via `assert`) so the check also
        # runs under `python -O`; the length test comes first because
        # comparing arrays of different lengths does not give a clean
        # element-wise result.
        if len(ids) != num_vertices or (ids != np.arange(1, 1 + num_vertices)).any():
            raise AssertionError(
                "File does not list all vertices, or lists them out of order.")

    # Parse edges (a file may legitimately contain none)
    e_df = None
    if edges_text:
        e_df = pd.read_csv(StringIO(edges_text), delimiter=' ', engine='c', header=None)
        if len(e_df.columns) == 2:
            e_df.columns = ['v1', 'v2']
        elif len(e_df.columns) == 3:
            e_df.columns = ['v1', 'v2', 'weight']
        else:
            raise RuntimeError("Can't understand edge list")
        e_df[['v1', 'v2']] -= 1

    # Free up some RAM
    del full_text, parts, vertex_text, edge_chunks, edges_text

    # Create graph
    g = gt.Graph(directed=directed)
    g.add_vertex(num_vertices)
    if e_df is not None:
        g.add_edge_list(e_df[['v1', 'v2']].values)

    # Add properties
    if e_df is not None and 'weight' in e_df.columns:
        g.edge_properties["weight"] = g.new_edge_property("double", e_df['weight'].values)
    if v_df is not None:
        g.vertex_properties["label"] = g.new_vertex_property("string", v_df['label'].values)

    if remove_loops:
        # Imported explicitly so this does not rely on the `stats`
        # submodule already being loaded as an attribute of the package.
        from graph_tool.stats import remove_self_loops
        remove_self_loops(g)
    return g
以下是该函数对您的示例文件返回的内容:
In [1]: from pajek_to_gt import pajek_to_gt
In [2]: g = pajek_to_gt('pajek-example.NET')
In [3]: g.get_vertices()
Out[3]: array([0, 1, 2, 3, 4])
In [4]: g.vertex_properties['label'].get_2d_array([0])
Out[4]: array([['apple', 'cat', 'tree', 'nature', 'fire']], dtype='<U6')
In [5]: g.get_edges()
Out[5]:
array([[0, 2],
[1, 3]])
In [6]: g.edge_properties['weight'].get_array()
Out[6]: PropertyArray([14., 1.])
注意:此函数会做一些预处理,把连续的两个空格转换成单个空格,因为上面的示例在各字段之间使用了两个空格。那是故意的吗?您链接到的 Pajek 文件规范使用的是单个空格。
编辑:
重新阅读您链接到的 Pajek 文件规范后,我注意到边部分有两种可能的格式。第二种格式用可变长度的列表列出每个节点的邻居:
*edgeslist
4941 386 395 451
1 3553 3586 3587 3637
2 3583
3 4930
4 88
5 13 120
显然,我上面的实现与该格式不兼容。我已经修改了该函数:如果文件使用了该格式,就会引发异常。