numpy 1D 数组:屏蔽重复 n 次以上的元素
numpy 1D array: mask elements that repeat more than n times
Q: 给定一个整数数组,如
[1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]
我需要屏蔽重复超过 N
次的元素。目标是检索布尔掩码数组。
我想出了一个相当复杂的解决方案:
import numpy as np
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
N = 3
splits = np.split(bins, np.where(np.diff(bins) != 0)[0]+1)
mask = []
for s in splits:
if s.shape[0] <= N:
mask.append(np.ones(s.shape[0]).astype(np.bool_))
else:
mask.append(np.append(np.ones(N), np.zeros(s.shape[0]-N)).astype(np.bool_))
mask = np.concatenate(mask)
例如
bins[mask]
Out[90]: array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
有更好的方法吗?
总结: 这是 MSeifert 基准图的精简版(感谢您指点我 simple_benchmark
)。显示四个最高性能的选项:
, modified by seems to be a great way of solving this problem as it is pretty straight forward and numpy
-only. If you're fine with using numba
, 提出的想法胜过另一个。
我选择接受 MSeifert 的答案作为解决方案,因为它是更一般的答案:它正确地处理具有(非唯一)连续重复元素块的任意数组。如果 numba
不行, 也值得一看。
您可以通过索引来做到这一点。对于任何 N,代码将是:
N = 3
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,6])
mask = [True for _ in range(N)] + list(bins[:-N] != bins[N:])
bins[mask]
输出:
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]
方法 #1: 这是一种矢量化方式 -
from scipy.ndimage.morphology import binary_dilation
def keep_N_per_group(a, N):
k = np.ones(N,dtype=bool)
m = np.r_[True,a[:-1]!=a[1:]]
return a[binary_dilation(m,k,origin=-(N//2))]
样本运行-
In [42]: a
Out[42]: array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
In [43]: keep_N_per_group(a, N=3)
Out[43]: array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
方法 #2: 更紧凑的版本 -
def keep_N_per_group_v2(a, N):
k = np.ones(N,dtype=bool)
return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]
方法 #3: 使用分组计数和 np.repeat
(虽然不会给我们掩码)-
def keep_N_per_group_v3(a, N):
m = np.r_[True,a[:-1]!=a[1:],True]
idx = np.flatnonzero(m)
c = np.diff(idx)
return np.repeat(a[idx[:-1]],np.minimum(c,N))
方法 #4: 使用 view-based
方法 -
from skimage.util import view_as_windows
def keep_N_per_group_v4(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
idx = np.flatnonzero(m)
v = idx<len(w)
w[idx[v]] = 1
if v.all()==0:
m[idx[v.argmin()]:] = 1
return a[m]
方法 #5: 使用 view-based
方法,没有来自 flatnonzero
-
的索引
def keep_N_per_group_v5(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
last_idx = len(a)-m[::-1].argmax()-1
w[m[:-N+1]] = 1
m[last_idx:last_idx+N] = 1
return a[m]
您可以使用 while 循环来检查后 N 个位置的数组元素是否等于当前位置。请注意,此解决方案假定数组是有序的。
import numpy as np
bins = [1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]
N = 3
counter = N
while counter < len(bins):
drop_condition = (bins[counter] == bins[counter - N])
if drop_condition:
bins = np.delete(bins, counter)
else:
# move on to next element
counter += 1
更好的方法是使用 numpy
的 unique()
函数。您将在数组中获得唯一条目以及它们出现的频率计数:
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
N = 3
unique, index,count = np.unique(bins, return_index=True, return_counts=True)
mask = np.full(bins.shape, True, dtype=bool)
for i,c in zip(index,count):
if c>N:
mask[i+N:i+c] = False
bins[mask]
输出:
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
解决方案
您可以使用 numpy.unique
。变量 final_mask
可用于从数组 bins
中提取目标元素。
import numpy as np
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
repeat_max = 3
unique, counts = np.unique(bins, return_counts=True)
mod_counts = np.array([x if x<=repeat_max else repeat_max for x in counts])
mask = np.arange(bins.size)
#final_values = np.hstack([bins[bins==value][:count] for value, count in zip(unique, mod_counts)])
final_mask = np.hstack([mask[bins==value][:count] for value, count in zip(unique, mod_counts)])
bins[final_mask]
输出:
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
您可以使用 grouby 对长度超过 N.
的公共元素和过滤器列表进行分组
import numpy as np
from itertools import groupby, chain
def ifElse(condition, exec1, exec2):
if condition : return exec1
else : return exec2
def solve(bins, N = None):
xss = groupby(bins)
xss = map(lambda xs : list(xs[1]), xss)
xss = map(lambda xs : ifElse(len(xs) > N, xs[:N], xs), xss)
xs = chain.from_iterable(xss)
return list(xs)
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
solve(bins, N = 3)
免责声明:这只是@FlorianH 想法的更合理实现:
def f(a,N):
mask = np.empty(a.size,bool)
mask[:N] = True
np.not_equal(a[N:],a[:-N],out=mask[N:])
return mask
对于更大的阵列,这会产生巨大的差异:
a = np.arange(1000).repeat(np.random.randint(0,10,1000))
N = 3
print(timeit(lambda:f(a,N),number=1000)*1000,"us")
# 5.443050000394578 us
# compare to
print(timeit(lambda:[True for _ in range(N)] + list(bins[:-N] != bins[N:]),number=1000)*1000,"us")
# 76.18969900067896 us
我想提出一个使用 numba 的解决方案,应该很容易理解。我假设您想要 "mask" 个连续的重复项:
import numpy as np
import numba as nb
@nb.njit
def mask_more_n(arr, n):
mask = np.ones(arr.shape, np.bool_)
current = arr[0]
count = 0
for idx, item in enumerate(arr):
if item == current:
count += 1
else:
current = item
count = 1
mask[idx] = count <= n
return mask
例如:
>>> bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
>>> bins[mask_more_n(bins, 3)]
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
>>> bins[mask_more_n(bins, 2)]
array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
性能:
使用 simple_benchmark
- 但是我没有包括所有方法。这是一个 log-log 比例:
似乎 numba 解决方案无法击败 Paul Panzer 的解决方案,后者对于大型数组似乎更快一些(并且不需要额外的依赖)。
然而,两者似乎都优于其他解决方案,但它们使用 return 掩码而不是 "filtered" 数组。
import numpy as np
import numba as nb
from simple_benchmark import BenchmarkBuilder, MultiArgument
b = BenchmarkBuilder()
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
@nb.njit
def mask_more_n(arr, n):
mask = np.ones(arr.shape, np.bool_)
current = arr[0]
count = 0
for idx, item in enumerate(arr):
if item == current:
count += 1
else:
current = item
count = 1
mask[idx] = count <= n
return mask
@b.add_function(warmups=True)
def MSeifert(arr, n):
return mask_more_n(arr, n)
from scipy.ndimage.morphology import binary_dilation
@b.add_function()
def Divakar_1(a, N):
k = np.ones(N,dtype=bool)
m = np.r_[True,a[:-1]!=a[1:]]
return a[binary_dilation(m,k,origin=-(N//2))]
@b.add_function()
def Divakar_2(a, N):
k = np.ones(N,dtype=bool)
return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]
@b.add_function()
def Divakar_3(a, N):
m = np.r_[True,a[:-1]!=a[1:],True]
idx = np.flatnonzero(m)
c = np.diff(idx)
return np.repeat(a[idx[:-1]],np.minimum(c,N))
from skimage.util import view_as_windows
@b.add_function()
def Divakar_4(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
idx = np.flatnonzero(m)
v = idx<len(w)
w[idx[v]] = 1
if v.all()==0:
m[idx[v.argmin()]:] = 1
return a[m]
@b.add_function()
def Divakar_5(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
last_idx = len(a)-m[::-1].argmax()-1
w[m[:-N+1]] = 1
m[last_idx:last_idx+N] = 1
return a[m]
@b.add_function()
def PaulPanzer(a,N):
mask = np.empty(a.size,bool)
mask[:N] = True
np.not_equal(a[N:],a[:-N],out=mask[N:])
return mask
import random
@b.add_arguments('array size')
def argument_provider():
for exp in range(2, 20):
size = 2**exp
yield size, MultiArgument([np.array([random.randint(0, 5) for _ in range(size)]), 3])
r = b.run()
import matplotlib.pyplot as plt
plt.figure(figsize=[10, 8])
r.plot()
Q: 给定一个整数数组,如
[1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]
我需要屏蔽重复超过 N
次的元素。目标是检索布尔掩码数组。
我想出了一个相当复杂的解决方案:
import numpy as np
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
N = 3
splits = np.split(bins, np.where(np.diff(bins) != 0)[0]+1)
mask = []
for s in splits:
if s.shape[0] <= N:
mask.append(np.ones(s.shape[0]).astype(np.bool_))
else:
mask.append(np.append(np.ones(N), np.zeros(s.shape[0]-N)).astype(np.bool_))
mask = np.concatenate(mask)
例如
bins[mask]
Out[90]: array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
有更好的方法吗?
总结: 这是 MSeifert 基准图的精简版(感谢您指点我 simple_benchmark
)。显示四个最高性能的选项:
numpy
-only. If you're fine with using numba
,
我选择接受 MSeifert 的答案作为解决方案,因为它是更一般的答案:它正确地处理具有(非唯一)连续重复元素块的任意数组。如果 numba
不行,
您可以通过索引来做到这一点。对于任何 N,代码将是:
N = 3
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,6])
mask = [True for _ in range(N)] + list(bins[:-N] != bins[N:])
bins[mask]
输出:
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]
方法 #1: 这是一种矢量化方式 -
from scipy.ndimage.morphology import binary_dilation
def keep_N_per_group(a, N):
k = np.ones(N,dtype=bool)
m = np.r_[True,a[:-1]!=a[1:]]
return a[binary_dilation(m,k,origin=-(N//2))]
样本运行-
In [42]: a
Out[42]: array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
In [43]: keep_N_per_group(a, N=3)
Out[43]: array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
方法 #2: 更紧凑的版本 -
def keep_N_per_group_v2(a, N):
k = np.ones(N,dtype=bool)
return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]
方法 #3: 使用分组计数和 np.repeat
(虽然不会给我们掩码)-
def keep_N_per_group_v3(a, N):
m = np.r_[True,a[:-1]!=a[1:],True]
idx = np.flatnonzero(m)
c = np.diff(idx)
return np.repeat(a[idx[:-1]],np.minimum(c,N))
方法 #4: 使用 view-based
方法 -
from skimage.util import view_as_windows
def keep_N_per_group_v4(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
idx = np.flatnonzero(m)
v = idx<len(w)
w[idx[v]] = 1
if v.all()==0:
m[idx[v.argmin()]:] = 1
return a[m]
方法 #5: 使用 view-based
方法,没有来自 flatnonzero
-
def keep_N_per_group_v5(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
last_idx = len(a)-m[::-1].argmax()-1
w[m[:-N+1]] = 1
m[last_idx:last_idx+N] = 1
return a[m]
您可以使用 while 循环来检查后 N 个位置的数组元素是否等于当前位置。请注意,此解决方案假定数组是有序的。
import numpy as np
bins = [1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]
N = 3
counter = N
while counter < len(bins):
drop_condition = (bins[counter] == bins[counter - N])
if drop_condition:
bins = np.delete(bins, counter)
else:
# move on to next element
counter += 1
更好的方法是使用 numpy
的 unique()
函数。您将在数组中获得唯一条目以及它们出现的频率计数:
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
N = 3
unique, index,count = np.unique(bins, return_index=True, return_counts=True)
mask = np.full(bins.shape, True, dtype=bool)
for i,c in zip(index,count):
if c>N:
mask[i+N:i+c] = False
bins[mask]
输出:
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
解决方案
您可以使用 numpy.unique
。变量 final_mask
可用于从数组 bins
中提取目标元素。
import numpy as np
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
repeat_max = 3
unique, counts = np.unique(bins, return_counts=True)
mod_counts = np.array([x if x<=repeat_max else repeat_max for x in counts])
mask = np.arange(bins.size)
#final_values = np.hstack([bins[bins==value][:count] for value, count in zip(unique, mod_counts)])
final_mask = np.hstack([mask[bins==value][:count] for value, count in zip(unique, mod_counts)])
bins[final_mask]
输出:
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
您可以使用 grouby 对长度超过 N.
的公共元素和过滤器列表进行分组import numpy as np
from itertools import groupby, chain
def ifElse(condition, exec1, exec2):
if condition : return exec1
else : return exec2
def solve(bins, N = None):
xss = groupby(bins)
xss = map(lambda xs : list(xs[1]), xss)
xss = map(lambda xs : ifElse(len(xs) > N, xs[:N], xs), xss)
xs = chain.from_iterable(xss)
return list(xs)
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
solve(bins, N = 3)
免责声明:这只是@FlorianH 想法的更合理实现:
def f(a,N):
mask = np.empty(a.size,bool)
mask[:N] = True
np.not_equal(a[N:],a[:-N],out=mask[N:])
return mask
对于更大的阵列,这会产生巨大的差异:
a = np.arange(1000).repeat(np.random.randint(0,10,1000))
N = 3
print(timeit(lambda:f(a,N),number=1000)*1000,"us")
# 5.443050000394578 us
# compare to
print(timeit(lambda:[True for _ in range(N)] + list(bins[:-N] != bins[N:]),number=1000)*1000,"us")
# 76.18969900067896 us
我想提出一个使用 numba 的解决方案,应该很容易理解。我假设您想要 "mask" 个连续的重复项:
import numpy as np
import numba as nb
@nb.njit
def mask_more_n(arr, n):
mask = np.ones(arr.shape, np.bool_)
current = arr[0]
count = 0
for idx, item in enumerate(arr):
if item == current:
count += 1
else:
current = item
count = 1
mask[idx] = count <= n
return mask
例如:
>>> bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
>>> bins[mask_more_n(bins, 3)]
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
>>> bins[mask_more_n(bins, 2)]
array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
性能:
使用 simple_benchmark
- 但是我没有包括所有方法。这是一个 log-log 比例:
似乎 numba 解决方案无法击败 Paul Panzer 的解决方案,后者对于大型数组似乎更快一些(并且不需要额外的依赖)。
然而,两者似乎都优于其他解决方案,但它们使用 return 掩码而不是 "filtered" 数组。
import numpy as np
import numba as nb
from simple_benchmark import BenchmarkBuilder, MultiArgument
b = BenchmarkBuilder()
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
@nb.njit
def mask_more_n(arr, n):
mask = np.ones(arr.shape, np.bool_)
current = arr[0]
count = 0
for idx, item in enumerate(arr):
if item == current:
count += 1
else:
current = item
count = 1
mask[idx] = count <= n
return mask
@b.add_function(warmups=True)
def MSeifert(arr, n):
return mask_more_n(arr, n)
from scipy.ndimage.morphology import binary_dilation
@b.add_function()
def Divakar_1(a, N):
k = np.ones(N,dtype=bool)
m = np.r_[True,a[:-1]!=a[1:]]
return a[binary_dilation(m,k,origin=-(N//2))]
@b.add_function()
def Divakar_2(a, N):
k = np.ones(N,dtype=bool)
return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]
@b.add_function()
def Divakar_3(a, N):
m = np.r_[True,a[:-1]!=a[1:],True]
idx = np.flatnonzero(m)
c = np.diff(idx)
return np.repeat(a[idx[:-1]],np.minimum(c,N))
from skimage.util import view_as_windows
@b.add_function()
def Divakar_4(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
idx = np.flatnonzero(m)
v = idx<len(w)
w[idx[v]] = 1
if v.all()==0:
m[idx[v.argmin()]:] = 1
return a[m]
@b.add_function()
def Divakar_5(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
last_idx = len(a)-m[::-1].argmax()-1
w[m[:-N+1]] = 1
m[last_idx:last_idx+N] = 1
return a[m]
@b.add_function()
def PaulPanzer(a,N):
mask = np.empty(a.size,bool)
mask[:N] = True
np.not_equal(a[N:],a[:-N],out=mask[N:])
return mask
import random
@b.add_arguments('array size')
def argument_provider():
for exp in range(2, 20):
size = 2**exp
yield size, MultiArgument([np.array([random.randint(0, 5) for _ in range(size)]), 3])
r = b.run()
import matplotlib.pyplot as plt
plt.figure(figsize=[10, 8])
r.plot()