如何按 bin-count 条件合并直方图 bin(边和计数)?
How to merge histogram bins (edges and counts) by bin-count condition?
问题
我有一个要处理的数据直方图。更具体地说,我想合并计数小于给定阈值的垃圾箱。举个例子可能会更清楚。
import numpy as np
np.random.seed(327)
data = np.random.normal(loc=50, scale=10, size=100).astype(int)
edges = np.arange(0, 101, 10).astype(int)
counts, edges = np.histogram(data, edges)
# print("\n .. {} DATA:\n{}\n".format(data.shape, data))
# print("\n .. {} EDGES:\n{}\n".format(edges.shape, edges))
# print("\n .. {} COUNTS:\n{}\n".format(counts.shape, counts))
上面的print
命令如果没有注释掉,会输出如下:
.. (100,) DATA:
[67 46 47 32 59 61 49 46 45 72 67 51 41 37 44 56 38 61 45 45 42 39 49 55
32 35 52 40 55 34 52 51 39 55 50 62 47 43 48 39 53 54 75 38 53 44 46 39
50 49 31 46 55 64 64 52 41 34 32 33 58 65 38 64 37 47 58 43 49 49 50 57
71 44 41 39 47 51 47 63 55 52 43 43 49 65 48 43 44 38 64 49 62 41 40 67
47 55 57 54]
.. (11,) EDGES:
[ 0 10 20 30 40 50 60 70 80 90 100]
.. (10,) COUNTS:
[ 0 0 0 19 38 26 14 3 0 0]
请注意 counts
表明 data
包含一个峰。假设我选择了一个 bin 阈值 threshold=5
,这样任何包含少于 5
计数(0, ..., 4
计数;不包括 5
)的 bin 都与 next 合并 垃圾箱。这里,next被认为是朝向中心峰的方向。
期望输出
通过我想要的合并算法,我将获得以下输出:
edges = [30, 40, 50, 60, 80]
counts = [19, 38, 26, 17]
尝试解决
以下是我解决这个问题的错误尝试:
def agglomerate_bins(edges, counts, threshold):
condition = (counts >= threshold)
indices = {}
indices['all'] = condition
indices['above'] = np.where(condition == True)[0]
indices['below'] = np.where(condition != True)[0]
# merge left-side bins rightward
left_edges = [edges[0]]
left_counts = []
ileft, istop = indices['below'][0], indices['above'][0]
while ileft < istop:
cc = counts[ileft]
while cc < threshold:
ileft += 1
cc += counts[ileft]
ee = edges[ileft]
left_edges.append(ee)
left_counts.append(cc)
ileft += 1
# merge right-side bins leftward
right_edges, right_counts = [], []
iright, istop = indices['below'][-1], indices['above'][-1]
while iright > istop:
cc = counts[iright]
while cc < threshold:
iright -= 1
cc += counts[iright]
ee = edges[iright]
right_edges.append(ee)
right_counts.append(cc)
iright -= 1
# group modified bins with bins above threshold
middle_edges = edges[indices['above']].tolist()
middle_counts = edges[indices['above']].tolist()
mod_edges = np.array(left_edges + middle_edges + right_edges[::-1])
mod_counts = np.array(left_counts + middle_counts + right_counts[::-1])
return mod_edges, mod_counts
mod_edges, mod_counts = agglomerate_bins(edges, counts, threshold=5)
# print("\n .. {} MODIFIED EDGES:\n{}\n".format(mod_edges.shape, mod_edges))
# print("\n .. {} MODIFIED COUNTS:\n{}\n".format(mod_counts.shape, mod_counts))
上面的print
命令如果没有注释掉,会输出如下:
.. (7,) MODIFIED EDGES:
[ 0 30 30 40 50 60 60]
.. (6,) MODIFIED COUNTS:
[19 30 40 50 60 17]
我认为解决方案涉及遍历计数和边合并计数并删除 'unused' 边。这会捕获 [ ..., 1,2,3,...] => [..., 6, ...]。 counts
和 edges
被转换为允许轻松弹出不需要的项目的列表,这对于 np.arrays 来说效率不高。
import numpy as np
np.random.seed(327)
data = np.random.normal(loc=50, scale=10, size=100).astype(int)
edges = np.arange(0, 101, 10).astype(int)
counts, edges = np.histogram(data, edges)
def combine_edges( counts, edges, threshold ):
max_ix = counts.argmax()
c_list = list( counts ) # Lists can be popped from
e_list = list( edges ) # Lists can be popped from
def eliminate_left( ix ):
# Sum the count and eliminate the edge relevant to ix
# Before the peak (max_ix)
nonlocal max_ix
max_ix -= 1 # max_ix will change too.
c_list[ix+1]+=c_list[ix]
c_list.pop(ix)
e_list.pop(ix+1)
def eliminate_right( ix ):
# Sum the count and eliminate the edge relevant to ix
# after the peak (max_ix)
c_list[ix-1]+=c_list[ix]
c_list.pop(ix)
e_list.pop(ix)
def first_lt():
# Find the first ix less than the threshold
for ix, ct in enumerate( c_list[:max_ix] ):
if ct < threshold:
return ix # if ct < threshold return the index and exit the function
# The function only reaches here if no ct values are less than the threshold
return -1 # If zero items < threshold return -1
def last_lt():
# Find the last ix less than the threshold
for ix, ct in zip( range(len(c_list)-1, max_ix, -1), c_list[::-1]):
# ix reduces from len(c_list)-1, c_list is accessed in reverse order.
if ct < threshold:
return ix
return -1 # If no items < threshold return -1
cont = True
while cont:
# Each iteration removes any counts less than threshold
# before the peak. This process would combine e.g. counts of [...,1,2,3,...] into [..., 6, ...]
ix = first_lt()
if ix < 0:
cont = False # If first_lt returns -1 stop while loop
else:
eliminate_left( ix )
cont = True
while cont:
ix = last_lt()
if ix < 0:
cont = False # If last_lt returns -1 stop while loop
else:
eliminate_right( ix )
return np.array( c_list ), np.array( e_list )
c, e = combine_edges( counts, edges, 5)
print( c, '\n', e )
# [19 38 26 17]
# [ 0 40 50 60 100]
cts, edgs = np.histogram(data, e)
print( cts, '\n', edgs )
# [19 38 26 17]
# [ 0 40 50 60 100]
这感觉很笨拙,所以可能有更好的方法,但它确实有效。它是否按要求处理连续小于阈值的项目?
编辑 回答有关 first_lt
工作原理的评论。上面代码中的注释已更新
只有一个的替代实现 return。
def first_lt():
result = -1 # Set default
for ix, ct in enumerate( c_list[:max_ix] ):
if ct < threshold:
result = ix # If ct < threshold set result to ix
break # Break out of the loop
return result
first_lt
使用 print 语句显示执行时发生的情况。
def first_lt():
print('first_lt:',end=' ')
for ix, ct in enumerate( c_list[:max_ix] ):
print(ix,ct, end=': ')
if ct < threshold:
print('Return ix.')
return ix
print('Exiting loop, return -1')
return -1
假设当前直方图hist
和binsbin_edges
由np.hist()
函数返回,我们要合并小bins(即hist
的值为小于某个阈值)到更大的,代码如下所示,其中输入是当前的 hist 和 bins,输出是新的。
def merge_hist_bins(hist, bin_edges, \
hist_value_thred = 1, # i.e., 1% if is_percentile True;
is_percentile = False
):
total = np.sum(hist)
if is_percentile:
hist_thred = int(total*hist_value_thred*0.01)
else:
hist_thred = int(hist_value_thred)
print ("[***] hist_thred = ", hist_thred)
assert len(hist) == len(bin_edges) - 1
bin_dict = {}
i_rightmost = 0
for i in range(0, len(hist)):
if i < i_rightmost:
continue
edge_left = bin_edges[i]
j = i
tmp_hist_sum = 0
while tmp_hist_sum < hist_thred and j < len(hist):
tmp_hist_sum += hist[j]
j += 1
edge_right = bin_edges[j]
else:
bin_dict[(edge_left, edge_right)] = tmp_hist_sum
i_rightmost = j
idx = 0
new_hist = []
new_bin_edges = [bin_edges[0]]
for k , v in bin_dict.items():
new_hist.append(v)
new_bin_edges.append(k[1])
print ("key {} : {}".format(k, v))
idx += 1
print ("[***] done, hist_thred = ", hist_thred)
print ("[***] old bin # = {}, new bin # = {}".format(len(bin_edges), len(new_bin_edges)))
return np.array(new_hist), np.array(new_bin_edges), hist_thred
我们将使用以下函数显示直方图:
def show_hist(bin_edges, hist, fig_file = None):
d_min = bin_edges[0]
d_max = bin_edges[-1]
d_num = len(bin_edges)
fig, ax = plt.subplots() #create figure and axes
plt.hist(x=bin_edges[:-1], bins=bin_edges, weights=hist)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('My Very Own Histogram')
# Figure size in inches (default)
plt.text(x=0.5, y=0.5, \
s=r'$D_{min}=$'+"{}".format(d_min) + r', $D_{max}=$'+"{}".format(\
d_max) + r', $D_{num}=$'+"{}".format(d_num),
transform=ax.transAxes)
if fig_file:
plt.savefig("./results/{}.png".format(fig_file))
print ("saved ", "./results/{}.png".format(fig_file))
plt.show()
txt_fn = "./results/" + npz_file + ".csv"
comment = "#right_bin_edge, hist_value"
file_lists = [ "{},{}".format(i, j if j > 50 else 0.5) for (i,j) in zip(bin_edges[1:], hist)]
file_lists = [comment] + file_lists
write_to_file(txt_fn, file_lists)
见前直方图
及之后
bin合并。在本例中,输入 hist bin # = 256,new hist bin # = 95,阈值为 12%
of sum(hist)
.
在 here 查看完整的代码和注释。
问题
我有一个要处理的数据直方图。更具体地说,我想合并计数小于给定阈值的垃圾箱。举个例子可能会更清楚。
import numpy as np
np.random.seed(327)
data = np.random.normal(loc=50, scale=10, size=100).astype(int)
edges = np.arange(0, 101, 10).astype(int)
counts, edges = np.histogram(data, edges)
# print("\n .. {} DATA:\n{}\n".format(data.shape, data))
# print("\n .. {} EDGES:\n{}\n".format(edges.shape, edges))
# print("\n .. {} COUNTS:\n{}\n".format(counts.shape, counts))
上面的print
命令如果没有注释掉,会输出如下:
.. (100,) DATA:
[67 46 47 32 59 61 49 46 45 72 67 51 41 37 44 56 38 61 45 45 42 39 49 55
32 35 52 40 55 34 52 51 39 55 50 62 47 43 48 39 53 54 75 38 53 44 46 39
50 49 31 46 55 64 64 52 41 34 32 33 58 65 38 64 37 47 58 43 49 49 50 57
71 44 41 39 47 51 47 63 55 52 43 43 49 65 48 43 44 38 64 49 62 41 40 67
47 55 57 54]
.. (11,) EDGES:
[ 0 10 20 30 40 50 60 70 80 90 100]
.. (10,) COUNTS:
[ 0 0 0 19 38 26 14 3 0 0]
请注意 counts
表明 data
包含一个峰。假设我选择了一个 bin 阈值 threshold=5
,这样任何包含少于 5
计数(0, ..., 4
计数;不包括 5
)的 bin 都与 next 合并 垃圾箱。这里,next被认为是朝向中心峰的方向。
期望输出
通过我想要的合并算法,我将获得以下输出:
edges = [30, 40, 50, 60, 80]
counts = [19, 38, 26, 17]
尝试解决
以下是我解决这个问题的错误尝试:
def agglomerate_bins(edges, counts, threshold):
condition = (counts >= threshold)
indices = {}
indices['all'] = condition
indices['above'] = np.where(condition == True)[0]
indices['below'] = np.where(condition != True)[0]
# merge left-side bins rightward
left_edges = [edges[0]]
left_counts = []
ileft, istop = indices['below'][0], indices['above'][0]
while ileft < istop:
cc = counts[ileft]
while cc < threshold:
ileft += 1
cc += counts[ileft]
ee = edges[ileft]
left_edges.append(ee)
left_counts.append(cc)
ileft += 1
# merge right-side bins leftward
right_edges, right_counts = [], []
iright, istop = indices['below'][-1], indices['above'][-1]
while iright > istop:
cc = counts[iright]
while cc < threshold:
iright -= 1
cc += counts[iright]
ee = edges[iright]
right_edges.append(ee)
right_counts.append(cc)
iright -= 1
# group modified bins with bins above threshold
middle_edges = edges[indices['above']].tolist()
middle_counts = edges[indices['above']].tolist()
mod_edges = np.array(left_edges + middle_edges + right_edges[::-1])
mod_counts = np.array(left_counts + middle_counts + right_counts[::-1])
return mod_edges, mod_counts
mod_edges, mod_counts = agglomerate_bins(edges, counts, threshold=5)
# print("\n .. {} MODIFIED EDGES:\n{}\n".format(mod_edges.shape, mod_edges))
# print("\n .. {} MODIFIED COUNTS:\n{}\n".format(mod_counts.shape, mod_counts))
上面的print
命令如果没有注释掉,会输出如下:
.. (7,) MODIFIED EDGES:
[ 0 30 30 40 50 60 60]
.. (6,) MODIFIED COUNTS:
[19 30 40 50 60 17]
我认为解决方案涉及遍历计数和边合并计数并删除 'unused' 边。这会捕获 [ ..., 1,2,3,...] => [..., 6, ...]。 counts
和 edges
被转换为允许轻松弹出不需要的项目的列表,这对于 np.arrays 来说效率不高。
import numpy as np
np.random.seed(327)
data = np.random.normal(loc=50, scale=10, size=100).astype(int)
edges = np.arange(0, 101, 10).astype(int)
counts, edges = np.histogram(data, edges)
def combine_edges( counts, edges, threshold ):
max_ix = counts.argmax()
c_list = list( counts ) # Lists can be popped from
e_list = list( edges ) # Lists can be popped from
def eliminate_left( ix ):
# Sum the count and eliminate the edge relevant to ix
# Before the peak (max_ix)
nonlocal max_ix
max_ix -= 1 # max_ix will change too.
c_list[ix+1]+=c_list[ix]
c_list.pop(ix)
e_list.pop(ix+1)
def eliminate_right( ix ):
# Sum the count and eliminate the edge relevant to ix
# after the peak (max_ix)
c_list[ix-1]+=c_list[ix]
c_list.pop(ix)
e_list.pop(ix)
def first_lt():
# Find the first ix less than the threshold
for ix, ct in enumerate( c_list[:max_ix] ):
if ct < threshold:
return ix # if ct < threshold return the index and exit the function
# The function only reaches here if no ct values are less than the threshold
return -1 # If zero items < threshold return -1
def last_lt():
# Find the last ix less than the threshold
for ix, ct in zip( range(len(c_list)-1, max_ix, -1), c_list[::-1]):
# ix reduces from len(c_list)-1, c_list is accessed in reverse order.
if ct < threshold:
return ix
return -1 # If no items < threshold return -1
cont = True
while cont:
# Each iteration removes any counts less than threshold
# before the peak. This process would combine e.g. counts of [...,1,2,3,...] into [..., 6, ...]
ix = first_lt()
if ix < 0:
cont = False # If first_lt returns -1 stop while loop
else:
eliminate_left( ix )
cont = True
while cont:
ix = last_lt()
if ix < 0:
cont = False # If last_lt returns -1 stop while loop
else:
eliminate_right( ix )
return np.array( c_list ), np.array( e_list )
c, e = combine_edges( counts, edges, 5)
print( c, '\n', e )
# [19 38 26 17]
# [ 0 40 50 60 100]
cts, edgs = np.histogram(data, e)
print( cts, '\n', edgs )
# [19 38 26 17]
# [ 0 40 50 60 100]
这感觉很笨拙,所以可能有更好的方法,但它确实有效。它是否按要求处理连续小于阈值的项目?
编辑 回答有关 first_lt
工作原理的评论。上面代码中的注释已更新
只有一个的替代实现 return。
def first_lt():
result = -1 # Set default
for ix, ct in enumerate( c_list[:max_ix] ):
if ct < threshold:
result = ix # If ct < threshold set result to ix
break # Break out of the loop
return result
first_lt
使用 print 语句显示执行时发生的情况。
def first_lt():
print('first_lt:',end=' ')
for ix, ct in enumerate( c_list[:max_ix] ):
print(ix,ct, end=': ')
if ct < threshold:
print('Return ix.')
return ix
print('Exiting loop, return -1')
return -1
假设当前直方图hist
和binsbin_edges
由np.hist()
函数返回,我们要合并小bins(即hist
的值为小于某个阈值)到更大的,代码如下所示,其中输入是当前的 hist 和 bins,输出是新的。
def merge_hist_bins(hist, bin_edges, \
hist_value_thred = 1, # i.e., 1% if is_percentile True;
is_percentile = False
):
total = np.sum(hist)
if is_percentile:
hist_thred = int(total*hist_value_thred*0.01)
else:
hist_thred = int(hist_value_thred)
print ("[***] hist_thred = ", hist_thred)
assert len(hist) == len(bin_edges) - 1
bin_dict = {}
i_rightmost = 0
for i in range(0, len(hist)):
if i < i_rightmost:
continue
edge_left = bin_edges[i]
j = i
tmp_hist_sum = 0
while tmp_hist_sum < hist_thred and j < len(hist):
tmp_hist_sum += hist[j]
j += 1
edge_right = bin_edges[j]
else:
bin_dict[(edge_left, edge_right)] = tmp_hist_sum
i_rightmost = j
idx = 0
new_hist = []
new_bin_edges = [bin_edges[0]]
for k , v in bin_dict.items():
new_hist.append(v)
new_bin_edges.append(k[1])
print ("key {} : {}".format(k, v))
idx += 1
print ("[***] done, hist_thred = ", hist_thred)
print ("[***] old bin # = {}, new bin # = {}".format(len(bin_edges), len(new_bin_edges)))
return np.array(new_hist), np.array(new_bin_edges), hist_thred
我们将使用以下函数显示直方图:
def show_hist(bin_edges, hist, fig_file = None):
d_min = bin_edges[0]
d_max = bin_edges[-1]
d_num = len(bin_edges)
fig, ax = plt.subplots() #create figure and axes
plt.hist(x=bin_edges[:-1], bins=bin_edges, weights=hist)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('My Very Own Histogram')
# Figure size in inches (default)
plt.text(x=0.5, y=0.5, \
s=r'$D_{min}=$'+"{}".format(d_min) + r', $D_{max}=$'+"{}".format(\
d_max) + r', $D_{num}=$'+"{}".format(d_num),
transform=ax.transAxes)
if fig_file:
plt.savefig("./results/{}.png".format(fig_file))
print ("saved ", "./results/{}.png".format(fig_file))
plt.show()
txt_fn = "./results/" + npz_file + ".csv"
comment = "#right_bin_edge, hist_value"
file_lists = [ "{},{}".format(i, j if j > 50 else 0.5) for (i,j) in zip(bin_edges[1:], hist)]
file_lists = [comment] + file_lists
write_to_file(txt_fn, file_lists)
见前直方图
及之后
bin合并。在本例中,输入 hist bin # = 256,new hist bin # = 95,阈值为 12%
of sum(hist)
.
在 here 查看完整的代码和注释。