计算二维 Numpy 数组中数字对频率的最有效方法
Most efficient way to calculate frequency of pairs of numbers in a 2D Numpy array
假设我有以下二维数组:
import numpy as np
np.random.seed(123)
a = np.random.randint(1, 6, size=(5, 3))
产生:
In [371]: a
Out[371]:
array([[3, 5, 3],
[2, 4, 3],
[4, 2, 2],
[1, 2, 2],
[1, 1, 2]])
是否有比 the following solution 更有效的(Numpy、Pandas 等)方法来计算所有数字对的频率?
from collections import Counter
from itertools import combinations
def pair_freq(a, sort=False, sort_axis=-1):
a = np.asarray(a)
if sort:
a = np.sort(a, axis=sort_axis)
res = Counter()
for row in a:
res.update(combinations(row, 2))
return res
res = pair_freq(a)
制作类似的东西:
In [38]: res
Out[38]:
Counter({(3, 5): 1,
(3, 3): 1,
(5, 3): 1,
(2, 4): 1,
(2, 3): 1,
(4, 3): 1,
(4, 2): 2,
(2, 2): 2,
(1, 2): 4,
(1, 1): 1})
或:
In [39]: res.most_common()
Out[39]:
[((1, 2), 4),
((4, 2), 2),
((2, 2), 2),
((3, 5), 1),
((3, 3), 1),
((5, 3), 1),
((2, 4), 1),
((2, 3), 1),
((4, 3), 1),
((1, 1), 1)]
PS 生成的数据集可能看起来不同 - 例如像多索引 Pandas DataFrame 或其他东西。
我试图增加 a
数组的维度并将 np.isin()
与所有对组合的列表一起使用,但我仍然无法摆脱循环。
更新:
(a) Are you interested only in the frequency of combinations of 2 numbers (and not interested in frequency of combinations of 3
numbers)?
是的,我只对成对组合(2 个数字)感兴趣
(b) Do you want to consider (3,5) as distinct from (5,3) or do you want to consider them as two occurrences of the same thing?
实际上这两种方法都很好——如果需要,我总是可以预先对我的数组进行排序:
a = np.sort(a, axis=1)
更新2:
Do you want the distinction between (a,b) and (b,a) to happen only due
to the source column of a and b, or even otherwise? Do understand this
question, please consider three rows [[1,2,1], [3,1,2], [1,2,5]]
.
What do you think should be the output here? What should be the
distinct 2-tuples and what should be their frequencies?
In [40]: a = np.array([[1,2,1],[3,1,2],[1,2,5]])
In [41]: a
Out[41]:
array([[1, 2, 1],
[3, 1, 2],
[1, 2, 5]])
我希望得到以下结果:
In [42]: pair_freq(a).most_common()
Out[42]:
[((1, 2), 3),
((1, 1), 1),
((2, 1), 1),
((3, 1), 1),
((3, 2), 1),
((1, 5), 1),
((2, 5), 1)]
因为它更灵活,所以我想将 (a, b) 和 (b, a) 算作同一对元素我可以这样做:
In [43]: pair_freq(a, sort=True).most_common()
Out[43]: [((1, 2), 4), ((1, 1), 1), ((1, 3), 1), ((2, 3), 1), ((1, 5), 1), ((2, 5), 1)]
如果你的元素不是太大的非负整数bincount
很快:
from collections import Counter
from itertools import combinations
import numpy as np
def pairs(a):
M = a.max() + 1
a = a.T
return sum(np.bincount((M * a[j] + a[j+1:]).ravel(), None, M*M)
for j in range(len(a) - 1)).reshape(M, M)
def pairs_F_3(a):
M = a.max() + 1
return (np.bincount(a[1:].ravel() + M*a[:2].ravel(), None, M*M) +
np.bincount(a[2].ravel() + M*a[0].ravel(), None, M*M))
def pairs_F(a):
M = a.max() + 1
a = np.ascontiguousarray(a.T) # contiguous columns (rows after .T)
# appear to be typically perform better
# thanks @ning chen
return sum(np.bincount((M * a[j] + a[j+1:]).ravel(), None, M*M)
for j in range(len(a) - 1)).reshape(M, M)
def pairs_dict(a):
p = pairs_F(a)
# p is a 2D table with the frequency of (y, x) at position y, x
y, x = np.where(p)
c = p[y, x]
return {(yi, xi): ci for yi, xi, ci in zip(y, x, c)}
def pair_freq(a, sort=False, sort_axis=-1):
a = np.asarray(a)
if sort:
a = np.sort(a, axis=sort_axis)
res = Counter()
for row in a:
res.update(combinations(row, 2))
return res
from timeit import timeit
A = [np.random.randint(0, 1000, (1000, 120)),
np.random.randint(0, 100, (100000, 12))]
for a in A:
print('shape:', a.shape, 'range:', a.max() + 1)
res2 = pairs_dict(a)
res = pair_freq(a)
print(f'results equal: {res==res2}')
print('bincount', timeit(lambda:pairs(a), number=10)*100, 'ms')
print('bc(F) ', timeit(lambda:pairs_F(a), number=10)*100, 'ms')
print('bc->dict', timeit(lambda:pairs_dict(a), number=10)*100, 'ms')
print('Counter ', timeit(lambda:pair_freq(a), number=4)*250,'ms')
样本运行:
shape: (1000, 120) range: 1000
results equal: True
bincount 461.14772390574217 ms
bc(F) 435.3669326752424 ms
bc->dict 932.1215840056539 ms
Counter 3473.3258984051645 ms
shape: (100000, 12) range: 100
results equal: True
bincount 89.80463854968548 ms
bc(F) 43.449611216783524 ms
bc->dict 46.470773220062256 ms
Counter 1987.6734036952257 ms
我有一个想法,代码如下。
我的代码最大的缺点是它 运行 随着列的增加非常缓慢,并且比@Paul Panzer 的代码慢。我向 Paul Panzer 道歉。
如果你想更快,就忽略num_to_items的功能。因为 (1, 1)
等于 1*2**20 + 1
.
import numpy as np
from random import choice
from itertools import izip
from scipy.sparse import csr_matrix, csc_matrix
from scipy import sparse as sp
c_10 = np.array([[choice(range(1, 10)) for _ in range(3)] for _ in range(1000)])
c_1000 = np.array([[choice(range(1, 1000)) for _ in range(3)] for _ in range(1000)])
def _bit_to_items(num):
return (num >> 20, num & 0b1111111111111111111)
def unique_bit_shit(c):
cc = c << 20 # suppose that: 2**20 > max(c)
dialog_mtx_1 = np.array([[1, 0, 0],
[1, 0, 0],
[0, 1, 0]])
dialog_mtx_2 = np.array([[0, 1, 0],
[0, 0, 1],
[0, 0, 1]])
dialog_mtx_1 = dialog_mtx_1.T
dialog_mtx_2 = dialog_mtx_2.T
pairs = cc.dot(dialog_mtx_1) + c.dot(dialog_mtx_2)
pairs_num, count = np.unique(pairs, return_counts=True)
return [(_bit_to_items(num), v) for num, v in izip(pairs_num, count)]
def _dot_to_items(num):
# 2**20 is 1048576
return (num / 1048576, num % 1048576)
def unique_dot(c):
dialog_mtx_3 = np.array([[2**20, 2**20, 0],
[1, 0, 2**20],
[0, 1, 1]])
pairs = c.dot(dialog_mtx_3)
pairs_num, count = np.unique(pairs, return_counts=True)
return [(_dot_to_items(num), v) for num, v in izip(pairs_num, count)]
假设我有以下二维数组:
import numpy as np
np.random.seed(123)
a = np.random.randint(1, 6, size=(5, 3))
产生:
In [371]: a
Out[371]:
array([[3, 5, 3],
[2, 4, 3],
[4, 2, 2],
[1, 2, 2],
[1, 1, 2]])
是否有比 the following solution 更有效的(Numpy、Pandas 等)方法来计算所有数字对的频率?
from collections import Counter
from itertools import combinations
def pair_freq(a, sort=False, sort_axis=-1):
a = np.asarray(a)
if sort:
a = np.sort(a, axis=sort_axis)
res = Counter()
for row in a:
res.update(combinations(row, 2))
return res
res = pair_freq(a)
制作类似的东西:
In [38]: res
Out[38]:
Counter({(3, 5): 1,
(3, 3): 1,
(5, 3): 1,
(2, 4): 1,
(2, 3): 1,
(4, 3): 1,
(4, 2): 2,
(2, 2): 2,
(1, 2): 4,
(1, 1): 1})
或:
In [39]: res.most_common()
Out[39]:
[((1, 2), 4),
((4, 2), 2),
((2, 2), 2),
((3, 5), 1),
((3, 3), 1),
((5, 3), 1),
((2, 4), 1),
((2, 3), 1),
((4, 3), 1),
((1, 1), 1)]
PS 生成的数据集可能看起来不同 - 例如像多索引 Pandas DataFrame 或其他东西。
我试图增加 a
数组的维度并将 np.isin()
与所有对组合的列表一起使用,但我仍然无法摆脱循环。
更新:
(a) Are you interested only in the frequency of combinations of 2 numbers (and not interested in frequency of combinations of 3 numbers)?
是的,我只对成对组合(2 个数字)感兴趣
(b) Do you want to consider (3,5) as distinct from (5,3) or do you want to consider them as two occurrences of the same thing?
实际上这两种方法都很好——如果需要,我总是可以预先对我的数组进行排序:
a = np.sort(a, axis=1)
更新2:
Do you want the distinction between (a,b) and (b,a) to happen only due to the source column of a and b, or even otherwise? Do understand this question, please consider three rows
[[1,2,1], [3,1,2], [1,2,5]]
. What do you think should be the output here? What should be the distinct 2-tuples and what should be their frequencies?
In [40]: a = np.array([[1,2,1],[3,1,2],[1,2,5]])
In [41]: a
Out[41]:
array([[1, 2, 1],
[3, 1, 2],
[1, 2, 5]])
我希望得到以下结果:
In [42]: pair_freq(a).most_common()
Out[42]:
[((1, 2), 3),
((1, 1), 1),
((2, 1), 1),
((3, 1), 1),
((3, 2), 1),
((1, 5), 1),
((2, 5), 1)]
因为它更灵活,所以我想将 (a, b) 和 (b, a) 算作同一对元素我可以这样做:
In [43]: pair_freq(a, sort=True).most_common()
Out[43]: [((1, 2), 4), ((1, 1), 1), ((1, 3), 1), ((2, 3), 1), ((1, 5), 1), ((2, 5), 1)]
如果你的元素不是太大的非负整数bincount
很快:
from collections import Counter
from itertools import combinations
import numpy as np
def pairs(a):
M = a.max() + 1
a = a.T
return sum(np.bincount((M * a[j] + a[j+1:]).ravel(), None, M*M)
for j in range(len(a) - 1)).reshape(M, M)
def pairs_F_3(a):
M = a.max() + 1
return (np.bincount(a[1:].ravel() + M*a[:2].ravel(), None, M*M) +
np.bincount(a[2].ravel() + M*a[0].ravel(), None, M*M))
def pairs_F(a):
M = a.max() + 1
a = np.ascontiguousarray(a.T) # contiguous columns (rows after .T)
# appear to be typically perform better
# thanks @ning chen
return sum(np.bincount((M * a[j] + a[j+1:]).ravel(), None, M*M)
for j in range(len(a) - 1)).reshape(M, M)
def pairs_dict(a):
p = pairs_F(a)
# p is a 2D table with the frequency of (y, x) at position y, x
y, x = np.where(p)
c = p[y, x]
return {(yi, xi): ci for yi, xi, ci in zip(y, x, c)}
def pair_freq(a, sort=False, sort_axis=-1):
a = np.asarray(a)
if sort:
a = np.sort(a, axis=sort_axis)
res = Counter()
for row in a:
res.update(combinations(row, 2))
return res
from timeit import timeit
A = [np.random.randint(0, 1000, (1000, 120)),
np.random.randint(0, 100, (100000, 12))]
for a in A:
print('shape:', a.shape, 'range:', a.max() + 1)
res2 = pairs_dict(a)
res = pair_freq(a)
print(f'results equal: {res==res2}')
print('bincount', timeit(lambda:pairs(a), number=10)*100, 'ms')
print('bc(F) ', timeit(lambda:pairs_F(a), number=10)*100, 'ms')
print('bc->dict', timeit(lambda:pairs_dict(a), number=10)*100, 'ms')
print('Counter ', timeit(lambda:pair_freq(a), number=4)*250,'ms')
样本运行:
shape: (1000, 120) range: 1000
results equal: True
bincount 461.14772390574217 ms
bc(F) 435.3669326752424 ms
bc->dict 932.1215840056539 ms
Counter 3473.3258984051645 ms
shape: (100000, 12) range: 100
results equal: True
bincount 89.80463854968548 ms
bc(F) 43.449611216783524 ms
bc->dict 46.470773220062256 ms
Counter 1987.6734036952257 ms
我有一个想法,代码如下。 我的代码最大的缺点是它 运行 随着列的增加非常缓慢,并且比@Paul Panzer 的代码慢。我向 Paul Panzer 道歉。
如果你想更快,就忽略num_to_items的功能。因为 (1, 1)
等于 1*2**20 + 1
.
import numpy as np
from random import choice
from itertools import izip
from scipy.sparse import csr_matrix, csc_matrix
from scipy import sparse as sp
c_10 = np.array([[choice(range(1, 10)) for _ in range(3)] for _ in range(1000)])
c_1000 = np.array([[choice(range(1, 1000)) for _ in range(3)] for _ in range(1000)])
def _bit_to_items(num):
return (num >> 20, num & 0b1111111111111111111)
def unique_bit_shit(c):
cc = c << 20 # suppose that: 2**20 > max(c)
dialog_mtx_1 = np.array([[1, 0, 0],
[1, 0, 0],
[0, 1, 0]])
dialog_mtx_2 = np.array([[0, 1, 0],
[0, 0, 1],
[0, 0, 1]])
dialog_mtx_1 = dialog_mtx_1.T
dialog_mtx_2 = dialog_mtx_2.T
pairs = cc.dot(dialog_mtx_1) + c.dot(dialog_mtx_2)
pairs_num, count = np.unique(pairs, return_counts=True)
return [(_bit_to_items(num), v) for num, v in izip(pairs_num, count)]
def _dot_to_items(num):
# 2**20 is 1048576
return (num / 1048576, num % 1048576)
def unique_dot(c):
dialog_mtx_3 = np.array([[2**20, 2**20, 0],
[1, 0, 2**20],
[0, 1, 1]])
pairs = c.dot(dialog_mtx_3)
pairs_num, count = np.unique(pairs, return_counts=True)
return [(_dot_to_items(num), v) for num, v in izip(pairs_num, count)]