在 numpy 数组中找到仅符号不同的行对
find pairs of rows in numpy array that differ only by sign
我需要在 numpy 数组中找到仅符号不同的所有行的行索引。例如,如果我有数组:
>>> A
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 0, -1, -2],
[ 9, 5, 6],
[-3, -4, -5]])
我希望输出为 [(0,2),(1,4)]
我知道如何找到唯一的行,numpy.unique,所以我的直觉是将数组附加到自身的否定,即 numpy.concatenate(A,-1*A),然后找到非唯一行,但我对如何从中提取我需要的信息感到困惑。此外,数组可能非常大,因此将其附加到自身可能不是一个好主意。
通过遍历数组并检查行索引是否等于另一个行索引的否定,我得到了正确的答案,但这需要很长时间。我想要和 numpy.unique.
一样快的东西
我已经删除了 A 中的所有重复行,如果这对流程有任何影响的话。
您可以在 one-liner
:
[(i, j) for i in range(len(a)) for j in range(i+1, len(a)) if all(abs(a[i]) == abs(a[j]))]
你的 a
给出:
[(0, 2), (1, 4)]
所以我们基本上是使用嵌套 for-loops
来遍历每一对 rows
- i
和 j
。然后我们检查第一个 row
中的每个元素(使用 all
)是否等于(==
)另一个 row
中的每个元素。但是,为了引入绝对方面,我们只是在比较之前先取每个row
中的abs()
。
哦,还有一个确切的 negation
:
[(i, j) for i in range(len(a)) for j in range(i+1, len(a)) if all(a[i] == -a[j])]
对于此示例给出相同的输出,但对于其他 arrays
.
显然会发生变化
尝试:
A = [[0,1,2],[3,4,5],[0,-1,-2],[9,5,6],[-3,-4,-5]]
outlist = []
c = 1
while len(A) > 1:
b = list(map(lambda x: -x, A[0]))
A = A[1:]
for i in range(len(A)):
if A[i] == b:
outlist.append((c-1, c+i))
c += 1
print(outlist)
输出:
[(0, 2), (1, 4)]
这是一个主要基于 NumPy 的 -
def group_dup_rowids(a):
sidx = np.lexsort(a.T)
b = a[sidx]
m = np.concatenate(([False], (b[1:] == b[:-1]).all(1), [False] ))
idx = np.flatnonzero(m[1:] != m[:-1])
C = sidx.tolist()
return [C[i:j] for i,j in zip(idx[::2],idx[1::2]+1)]
out = group_dup_rowids(np.abs(a))
样本运行-
In [175]: a
Out[175]:
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 0, -1, -2],
[ 9, 5, 6],
[-3, -4, -5]])
In [176]: group_dup_rowids(np.abs(a))
Out[176]: [[0, 2], [1, 4]]
精确否定案例
对于寻找完全否定配对匹配的情况,我们只需要稍作修改-
def group_dup_rowids_negation(ar):
a = np.abs(ar)
sidx = np.lexsort(a.T)
b = ar[sidx]
m = np.concatenate(([False], (b[1:] == -b[:-1]).all(1), [False] ))
idx = np.flatnonzero(m[1:] != m[:-1])
C = sidx.tolist()
return [(C[i:j]) for i,j in zip(idx[::2],idx[1::2]+1)]
样本运行-
In [354]: a
Out[354]:
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 0, -1, -2],
[ 9, 5, 6],
[-3, -4, -5]])
In [355]: group_dup_rowids_negation(a)
Out[355]: [[0, 2], [1, 4]]
In [356]: a[-1] = [-3,4,-5]
In [357]: group_dup_rowids_negation(a)
Out[357]: [[0, 2]]
运行时测试
其他工作方法-
# @Joe Iddon's soln
def for_for_if_listcompr(a):
return [(i, j) for i in range(len(a)) for j in range(i+1, len(a))
if all(a[i] == -a[j])]
# @dkato's soln
def find_pairs(A):
res = []
for r1 in range(len(A)):
for r2 in range(r1+1, len(A)):
if all(A[r1] == -A[r2]):
res.append((r1, r2))
return res
计时 -
In [492]: # Setup bigger input case
...: import pandas as pd
...: np.random.seed(0)
...: N = 2000 # datasize decider
...: a0 = np.random.randint(0,9,(N,10))
...: a = a0[np.random.choice(len(a0),4*N)]
...: a[np.random.choice(len(a),2*N, replace=0)] *= -1
...: a = pd.DataFrame(a).drop_duplicates().values
In [493]: %timeit for_for_if_listcompr(a)
...: %timeit find_pairs(a)
1 loop, best of 3: 6.1 s per loop
1 loop, best of 3: 6.05 s per loop
In [494]: %timeit group_dup_rowids_negation(a)
100 loops, best of 3: 2.05 ms per loop
进一步改进
def group_dup_rowids_negation_mod1(ar):
a = np.abs(ar)
sidx = np.lexsort(a.T)
b = ar[sidx]
dp = view1D(b)
dn = view1D(-b)
m = np.concatenate(([False], dp[1:] == dn[:-1], [False] ))
return zip(sidx[m[1:]], sidx[m[:-1]])
def group_dup_rowids_negation_mod2(ar):
a = np.abs(ar)
sidx = lexsort_cols_posnum(a)
b = ar[sidx]
dp = view1D(b)
dn = view1D(-b)
m = np.concatenate(([False], dp[1:] == dn[:-1], [False] ))
return zip(sidx[m[1:]], sidx[m[:-1]])
辅助函数:
# @Divakar
def view1D(a): # a is array
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel()
# Used to convert each row as a scalar by considering each of them as
# an indexing tuple and getting argsort indices
def lexsort_cols_posnum(ar):
shp = ar.max(0)+1
s = np.concatenate((np.asarray(shp[1:])[::-1].cumprod()[::-1],[1]))
return ar.dot(s).argsort()
运行时测试(借自@Paul Panzer 的基准测试)-
In [628]: N = 50000 # datasize decider
...: a0 = np.random.randint(0,99,(N,3))
...: a = a0[np.random.choice(len(a0),4*N)]
...: a[np.random.choice(len(a),2*N, replace=0)] *= -1
...: # OP says no dups
...: a = np.unique(a, axis=0)
...: np.random.shuffle(a)
In [629]: %timeit use_unique(a) # @Paul Panzer's soln
10 loops, best of 3: 33.9 ms per loop
In [630]: %timeit group_dup_rowids_negation(a)
10 loops, best of 3: 54.1 ms per loop
In [631]: %timeit group_dup_rowids_negation_mod1(a)
10 loops, best of 3: 37.4 ms per loop
In [632]: %timeit group_dup_rowids_negation_mod2(a)
100 loops, best of 3: 17.3 ms per loop
这是 Joe Iddon 发布的功能版本。主要区别在于 if 语句:如果一对 [1, 2, 3] 和 [-1, 2, 3] 是正确的,那么我认为 Joe 的 if 语句是正确的。
def find_pairs(A):
res = []
for r1 in range(len(A)):
for r2 in range(r1+1, len(A)):
if all(A[r1] == -A[r2]):
res.append((r1, r2))
return res
这是一个基于 np.unique
的快速解决方案。 numpy1.13
是必需的。
import numpy as np
# Divakar's method for reference
def group_dup_rowids_negation(ar):
a = np.abs(ar)
sidx = np.lexsort(a.T)
b = ar[sidx]
m = np.concatenate(([False], (b[1:] == -b[:-1]).all(1), [False] ))
idx = np.flatnonzero(m[1:] != m[:-1])
C = sidx.tolist()
return [(C[i:j]) for i,j in zip(idx[::2],idx[1::2]+1)]
def use_unique(a):
sign = np.sign(a)
nz = np.flatnonzero(sign)
firstnz = np.searchsorted(nz, np.arange(0, a.size, a.shape[1]))
a_nrm = np.where(sign.ravel()[nz[firstnz], None]==-1, -a, a)
uniq, idx, inv, cnt = np.unique(a_nrm, True, True, True, axis=0)
dup = np.flatnonzero(cnt==2)
out = np.empty((len(dup), 2), dtype=int)
out[:, 0] = idx[dup]
idx[inv] = np.arange(len(inv))
out[:, 1] = idx[dup]
return out
N = 50000 # datasize decider
a0 = np.random.randint(0,99,(N,3))
a = a0[np.random.choice(len(a0),4*N)]
a[np.random.choice(len(a),2*N, replace=0)] *= -1
# OP says no dups
a = np.unique(a, axis=0)
np.random.shuffle(a)
idxd = np.array(group_dup_rowids_negation(a))
idxp = use_unique(a)
assert len(idxd) == len(idxp)
assert not np.any(np.sum(a[idxd, :], axis=1))
assert not np.any(np.sum(a[idxp, :], axis=1))
assert {frozenset(i) for i in idxd} == {frozenset(i) for i in idxp}
from timeit import timeit
gl = {'a': a}
for fun, tag in [(group_dup_rowids_negation, 'D '), (use_unique, 'pp')]:
gl['f'] = fun
print(tag, timeit('f(a)', number=10, globals=gl))
示例输出:
D 0.5263204739894718
pp 0.3610327399801463
既然你说你的数组A
是独一无二的,那么这个呢?
import itertools as it
In [3]: idxs_comb = list(it.combinations(range(A.shape[0]), 2))
In [4]: rows_comb = it.combinations(A, 2)
In [5]: [idxs_comb[idx] for idx, pair in enumerate(rows_comb) if np.sum(pair) == 0]
Out[6]: [(0, 2), (1, 4)]
我需要在 numpy 数组中找到仅符号不同的所有行的行索引。例如,如果我有数组:
>>> A
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 0, -1, -2],
[ 9, 5, 6],
[-3, -4, -5]])
我希望输出为 [(0,2),(1,4)]
我知道如何找到唯一的行,numpy.unique,所以我的直觉是将数组附加到自身的否定,即 numpy.concatenate(A,-1*A),然后找到非唯一行,但我对如何从中提取我需要的信息感到困惑。此外,数组可能非常大,因此将其附加到自身可能不是一个好主意。
通过遍历数组并检查行索引是否等于另一个行索引的否定,我得到了正确的答案,但这需要很长时间。我想要和 numpy.unique.
一样快的东西我已经删除了 A 中的所有重复行,如果这对流程有任何影响的话。
您可以在 one-liner
:
[(i, j) for i in range(len(a)) for j in range(i+1, len(a)) if all(abs(a[i]) == abs(a[j]))]
你的 a
给出:
[(0, 2), (1, 4)]
所以我们基本上是使用嵌套 for-loops
来遍历每一对 rows
- i
和 j
。然后我们检查第一个 row
中的每个元素(使用 all
)是否等于(==
)另一个 row
中的每个元素。但是,为了引入绝对方面,我们只是在比较之前先取每个row
中的abs()
。
哦,还有一个确切的 negation
:
[(i, j) for i in range(len(a)) for j in range(i+1, len(a)) if all(a[i] == -a[j])]
对于此示例给出相同的输出,但对于其他 arrays
.
尝试:
A = [[0,1,2],[3,4,5],[0,-1,-2],[9,5,6],[-3,-4,-5]]
outlist = []
c = 1
while len(A) > 1:
b = list(map(lambda x: -x, A[0]))
A = A[1:]
for i in range(len(A)):
if A[i] == b:
outlist.append((c-1, c+i))
c += 1
print(outlist)
输出:
[(0, 2), (1, 4)]
这是一个主要基于 NumPy 的 -
def group_dup_rowids(a):
sidx = np.lexsort(a.T)
b = a[sidx]
m = np.concatenate(([False], (b[1:] == b[:-1]).all(1), [False] ))
idx = np.flatnonzero(m[1:] != m[:-1])
C = sidx.tolist()
return [C[i:j] for i,j in zip(idx[::2],idx[1::2]+1)]
out = group_dup_rowids(np.abs(a))
样本运行-
In [175]: a
Out[175]:
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 0, -1, -2],
[ 9, 5, 6],
[-3, -4, -5]])
In [176]: group_dup_rowids(np.abs(a))
Out[176]: [[0, 2], [1, 4]]
精确否定案例
对于寻找完全否定配对匹配的情况,我们只需要稍作修改-
def group_dup_rowids_negation(ar):
a = np.abs(ar)
sidx = np.lexsort(a.T)
b = ar[sidx]
m = np.concatenate(([False], (b[1:] == -b[:-1]).all(1), [False] ))
idx = np.flatnonzero(m[1:] != m[:-1])
C = sidx.tolist()
return [(C[i:j]) for i,j in zip(idx[::2],idx[1::2]+1)]
样本运行-
In [354]: a
Out[354]:
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 0, -1, -2],
[ 9, 5, 6],
[-3, -4, -5]])
In [355]: group_dup_rowids_negation(a)
Out[355]: [[0, 2], [1, 4]]
In [356]: a[-1] = [-3,4,-5]
In [357]: group_dup_rowids_negation(a)
Out[357]: [[0, 2]]
运行时测试
其他工作方法-
# @Joe Iddon's soln
def for_for_if_listcompr(a):
return [(i, j) for i in range(len(a)) for j in range(i+1, len(a))
if all(a[i] == -a[j])]
# @dkato's soln
def find_pairs(A):
res = []
for r1 in range(len(A)):
for r2 in range(r1+1, len(A)):
if all(A[r1] == -A[r2]):
res.append((r1, r2))
return res
计时 -
In [492]: # Setup bigger input case
...: import pandas as pd
...: np.random.seed(0)
...: N = 2000 # datasize decider
...: a0 = np.random.randint(0,9,(N,10))
...: a = a0[np.random.choice(len(a0),4*N)]
...: a[np.random.choice(len(a),2*N, replace=0)] *= -1
...: a = pd.DataFrame(a).drop_duplicates().values
In [493]: %timeit for_for_if_listcompr(a)
...: %timeit find_pairs(a)
1 loop, best of 3: 6.1 s per loop
1 loop, best of 3: 6.05 s per loop
In [494]: %timeit group_dup_rowids_negation(a)
100 loops, best of 3: 2.05 ms per loop
进一步改进
def group_dup_rowids_negation_mod1(ar):
a = np.abs(ar)
sidx = np.lexsort(a.T)
b = ar[sidx]
dp = view1D(b)
dn = view1D(-b)
m = np.concatenate(([False], dp[1:] == dn[:-1], [False] ))
return zip(sidx[m[1:]], sidx[m[:-1]])
def group_dup_rowids_negation_mod2(ar):
a = np.abs(ar)
sidx = lexsort_cols_posnum(a)
b = ar[sidx]
dp = view1D(b)
dn = view1D(-b)
m = np.concatenate(([False], dp[1:] == dn[:-1], [False] ))
return zip(sidx[m[1:]], sidx[m[:-1]])
辅助函数:
# @Divakar
def view1D(a): # a is array
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel()
# Used to convert each row as a scalar by considering each of them as
# an indexing tuple and getting argsort indices
def lexsort_cols_posnum(ar):
shp = ar.max(0)+1
s = np.concatenate((np.asarray(shp[1:])[::-1].cumprod()[::-1],[1]))
return ar.dot(s).argsort()
运行时测试(借自@Paul Panzer 的基准测试)-
In [628]: N = 50000 # datasize decider
...: a0 = np.random.randint(0,99,(N,3))
...: a = a0[np.random.choice(len(a0),4*N)]
...: a[np.random.choice(len(a),2*N, replace=0)] *= -1
...: # OP says no dups
...: a = np.unique(a, axis=0)
...: np.random.shuffle(a)
In [629]: %timeit use_unique(a) # @Paul Panzer's soln
10 loops, best of 3: 33.9 ms per loop
In [630]: %timeit group_dup_rowids_negation(a)
10 loops, best of 3: 54.1 ms per loop
In [631]: %timeit group_dup_rowids_negation_mod1(a)
10 loops, best of 3: 37.4 ms per loop
In [632]: %timeit group_dup_rowids_negation_mod2(a)
100 loops, best of 3: 17.3 ms per loop
这是 Joe Iddon 发布的功能版本。主要区别在于 if 语句:如果一对 [1, 2, 3] 和 [-1, 2, 3] 是正确的,那么我认为 Joe 的 if 语句是正确的。
def find_pairs(A):
res = []
for r1 in range(len(A)):
for r2 in range(r1+1, len(A)):
if all(A[r1] == -A[r2]):
res.append((r1, r2))
return res
这是一个基于 np.unique
的快速解决方案。 numpy1.13
是必需的。
import numpy as np
# Divakar's method for reference
def group_dup_rowids_negation(ar):
a = np.abs(ar)
sidx = np.lexsort(a.T)
b = ar[sidx]
m = np.concatenate(([False], (b[1:] == -b[:-1]).all(1), [False] ))
idx = np.flatnonzero(m[1:] != m[:-1])
C = sidx.tolist()
return [(C[i:j]) for i,j in zip(idx[::2],idx[1::2]+1)]
def use_unique(a):
sign = np.sign(a)
nz = np.flatnonzero(sign)
firstnz = np.searchsorted(nz, np.arange(0, a.size, a.shape[1]))
a_nrm = np.where(sign.ravel()[nz[firstnz], None]==-1, -a, a)
uniq, idx, inv, cnt = np.unique(a_nrm, True, True, True, axis=0)
dup = np.flatnonzero(cnt==2)
out = np.empty((len(dup), 2), dtype=int)
out[:, 0] = idx[dup]
idx[inv] = np.arange(len(inv))
out[:, 1] = idx[dup]
return out
N = 50000 # datasize decider
a0 = np.random.randint(0,99,(N,3))
a = a0[np.random.choice(len(a0),4*N)]
a[np.random.choice(len(a),2*N, replace=0)] *= -1
# OP says no dups
a = np.unique(a, axis=0)
np.random.shuffle(a)
idxd = np.array(group_dup_rowids_negation(a))
idxp = use_unique(a)
assert len(idxd) == len(idxp)
assert not np.any(np.sum(a[idxd, :], axis=1))
assert not np.any(np.sum(a[idxp, :], axis=1))
assert {frozenset(i) for i in idxd} == {frozenset(i) for i in idxp}
from timeit import timeit
gl = {'a': a}
for fun, tag in [(group_dup_rowids_negation, 'D '), (use_unique, 'pp')]:
gl['f'] = fun
print(tag, timeit('f(a)', number=10, globals=gl))
示例输出:
D 0.5263204739894718
pp 0.3610327399801463
既然你说你的数组A
是独一无二的,那么这个呢?
import itertools as it
In [3]: idxs_comb = list(it.combinations(range(A.shape[0]), 2))
In [4]: rows_comb = it.combinations(A, 2)
In [5]: [idxs_comb[idx] for idx, pair in enumerate(rows_comb) if np.sum(pair) == 0]
Out[6]: [(0, 2), (1, 4)]