获取 numpy 二维数组中的所有行索引,其中每行中的元素在整个数组中存在超过 2 次
Getting all row indices in numpy 2d array where elements in each row exists more than 2 times in entire array
我正在处理定义为二维边数组的图形数据。
即
[[1, 0],
[2, 5],
[1, 5],
[3, 4],
[1, 4]]
定义一个图,所有元素定义一个节点id,没有自环,是有向的,一列中的值在另一列中不存在。
现在回答这个问题,
我需要 select 所有 'nodes' 在列表中出现不止一次的边缘。
我如何快速做到这一点。目前我正在遍历每条边并单独查看节点。感觉这是一个非常糟糕的方法。
当前dumb/slow解决方案
edges = []
for edge in graph:
src, dst = edge[0], edge[1]
# Check src for existance in col 1 & 2
src_fan = np.count_nonzero(graph == src, axis=1).sum()
dst_fan = np.count_nonzero(graph == dst, axis=1).sum()
if(src_fan >= 2 and dst_fan >= 2):
# Add to edges
edges.append(edge)
我也不完全确定这种方式是否正确...
# Obtain the unique nodes and their counts
from_nodes, from_counts = np.unique(a[:, 0], return_counts = True)
to_nodes, to_counts = np.unique(a[:, 1], return_counts = True)
# Obtain the duplicated nodes
dup_from_nodes = from_nodes[from_counts > 1]
dup_to_nodes = to_nodes[to_counts > 1]
# Obtain the edge whose nodes are duplicated
graph[np.in1d(a[:, 0], dup_from_nodes) & np.in1d(a[:, 1], dup_to_nodes)]
Out[297]: array([[1, 4]])
使用 networkx 的解决方案:
import networkx as nx
edges = [[1, 0],
[2, 5],
[1, 5],
[3, 4],
[1, 4]]
G = nx.DiGraph()
G.add_edges_from(edges)
print([node for node in G.nodes if G.degree[node]>1])
编辑:
print([edge for edge in G.edges if (G.degree[edge[0]]>1) & (G.degree[edge[1]]>1)])
import numpy as np
graph = np.array([[1, 0],
[2, 5],
[1, 5],
[3, 4],
[1, 4]])
# get a 1d array of all nodes
array = graph.reshape(-1)
# get occurances of each element
occurances = np.sum(np.equal(array, array[:,np.newaxis]), axis=0)
# reshape back to graph shape
occurances = occurances.reshape(graph.shape)
# check if both edges occur more than once
mask = np.all(occurances > 1, axis=1)
# select the masked elements
edges = graph[mask]
根据我的测试,这种方法比接受的答案快将近 2 倍。
测试:
import timeit
import numpy as np
graph = np.array([[1, 0],
[2, 5],
[1, 5],
[3, 4],
[1, 4]])
# accepted answer
def method1(a):
# Obtain the unique nodes and their counts
from_nodes, from_counts = np.unique(a[:, 0], return_counts = True)
to_nodes, to_counts = np.unique(a[:, 1], return_counts = True)
# Obtain the duplicated nodes
dup_from_nodes = from_nodes[from_counts > 1]
dup_to_nodes = to_nodes[to_counts > 1]
# Obtain the edge whose nodes are duplicated
return graph[np.in1d(a[:, 0], dup_from_nodes) & np.in1d(a[:, 1], dup_to_nodes)]
# this answer
def method2(graph):
# get a 1d array of all nodes
array = graph.reshape(-1)
# get occurances of each element then reshape back to graph shape
occurances = np.sum(np.equal(array, array[:,np.newaxis]), axis=0).reshape(graph.shape)
# check if both edges occur more than once
mask = np.all(occurances > 1, axis=1)
# select the masked elements
edges = graph[mask]
return edges
print('method1 (accepted answer): ', timeit.timeit(lambda: method1(graph), number=10000))
print('method2 (this answer): ', timeit.timeit(lambda: method2(graph), number=10000))
输出:
method1 (accepted answer): 0.20238440000000013
method2 (this answer): 0.06534320000000005
我正在处理定义为二维边数组的图形数据。 即
[[1, 0],
[2, 5],
[1, 5],
[3, 4],
[1, 4]]
定义一个图,所有元素定义一个节点id,没有自环,是有向的,一列中的值在另一列中不存在。
现在回答这个问题, 我需要 select 所有 'nodes' 在列表中出现不止一次的边缘。 我如何快速做到这一点。目前我正在遍历每条边并单独查看节点。感觉这是一个非常糟糕的方法。
当前dumb/slow解决方案
edges = []
for edge in graph:
src, dst = edge[0], edge[1]
# Check src for existance in col 1 & 2
src_fan = np.count_nonzero(graph == src, axis=1).sum()
dst_fan = np.count_nonzero(graph == dst, axis=1).sum()
if(src_fan >= 2 and dst_fan >= 2):
# Add to edges
edges.append(edge)
我也不完全确定这种方式是否正确...
# Obtain the unique nodes and their counts
from_nodes, from_counts = np.unique(a[:, 0], return_counts = True)
to_nodes, to_counts = np.unique(a[:, 1], return_counts = True)
# Obtain the duplicated nodes
dup_from_nodes = from_nodes[from_counts > 1]
dup_to_nodes = to_nodes[to_counts > 1]
# Obtain the edge whose nodes are duplicated
graph[np.in1d(a[:, 0], dup_from_nodes) & np.in1d(a[:, 1], dup_to_nodes)]
Out[297]: array([[1, 4]])
使用 networkx 的解决方案:
import networkx as nx
edges = [[1, 0],
[2, 5],
[1, 5],
[3, 4],
[1, 4]]
G = nx.DiGraph()
G.add_edges_from(edges)
print([node for node in G.nodes if G.degree[node]>1])
编辑:
print([edge for edge in G.edges if (G.degree[edge[0]]>1) & (G.degree[edge[1]]>1)])
import numpy as np
graph = np.array([[1, 0],
[2, 5],
[1, 5],
[3, 4],
[1, 4]])
# get a 1d array of all nodes
array = graph.reshape(-1)
# get occurances of each element
occurances = np.sum(np.equal(array, array[:,np.newaxis]), axis=0)
# reshape back to graph shape
occurances = occurances.reshape(graph.shape)
# check if both edges occur more than once
mask = np.all(occurances > 1, axis=1)
# select the masked elements
edges = graph[mask]
根据我的测试,这种方法比接受的答案快将近 2 倍。
测试:
import timeit
import numpy as np
graph = np.array([[1, 0],
[2, 5],
[1, 5],
[3, 4],
[1, 4]])
# accepted answer
def method1(a):
# Obtain the unique nodes and their counts
from_nodes, from_counts = np.unique(a[:, 0], return_counts = True)
to_nodes, to_counts = np.unique(a[:, 1], return_counts = True)
# Obtain the duplicated nodes
dup_from_nodes = from_nodes[from_counts > 1]
dup_to_nodes = to_nodes[to_counts > 1]
# Obtain the edge whose nodes are duplicated
return graph[np.in1d(a[:, 0], dup_from_nodes) & np.in1d(a[:, 1], dup_to_nodes)]
# this answer
def method2(graph):
# get a 1d array of all nodes
array = graph.reshape(-1)
# get occurances of each element then reshape back to graph shape
occurances = np.sum(np.equal(array, array[:,np.newaxis]), axis=0).reshape(graph.shape)
# check if both edges occur more than once
mask = np.all(occurances > 1, axis=1)
# select the masked elements
edges = graph[mask]
return edges
print('method1 (accepted answer): ', timeit.timeit(lambda: method1(graph), number=10000))
print('method2 (this answer): ', timeit.timeit(lambda: method2(graph), number=10000))
输出:
method1 (accepted answer): 0.20238440000000013
method2 (this answer): 0.06534320000000005