使用 heapq 实现 MinHeap 时保持状态
Persisting state when implementing MinHeap with heapq
我正在尝试实现一个 StreamingMedian
对象以通过连续调用 get_median()
来保持中位数。为此,我还通过 heapq
模块实现了 MinHeap
和 MaxHeap
class。
不过我 运行 遇到了一个非常奇怪的错误。出于某种原因,当我 运行 以下命令时:
print("Before streaming medians", MinHeap(), sep="\t") # is empty
b = StreamingMedian()
b.add_val(5)
b.add_val(100)
assert b.get_median() == 52.5
print("After streaming medians, for MaxHeap", MaxHeap(), sep='\t') # is empty
print("After streaming medians, for MinHeap", MinHeap(), sep='\t') # should be empty
print("After streaming medians, for MinHeap with input",
MinHeap([]), sep="\t") # is empty
我得到以下输出:
Before streaming medians []
After streaming medians, for MaxHeap []
After streaming medians, for MinHeap [100]
After streaming medians, for MinHeap with input []
class 实现可以在下面找到。我在 Python 3.5.2 :: Anaconda 自定义(64 位)上 运行 宁此。
import heapq
class MinHeap(object):
def __init__(self, l=[]):
self.heap = l
heapq.heapify(l)
def peek(self):
return self.heap[0]
def pop(self):
return heapq.heappop(self.heap)
def push(self, x):
heapq.heappush(self.heap, x)
def pushpop(self, x):
return heapq.heappushpop(self.heap, x)
def replace(self, x):
return heapq.heapreplace(self.heap, x)
def __len__(self):
return len(self.heap)
def __str__(self):
return str(self.heap)
class MaxHeap(MinHeap):
def _invert_sign(self, l):
return [-1 * a for a in l]
def __init__(self, l=[]):
super().__init__(self._invert_sign(l))
def push(self, x):
super().push(-1 * x)
def pushpop(self, x):
return super().pushpop(-1 * x)
def replace(self, x):
return super().replace(-1 * x)
def pop(self):
return -1 * super().pop()
def peek(self):
return -1 * super().peek()
def __str__(self):
return str(self._invert_sign(self.heap))
class StreamingMedian():
def __init__(self):
self.min_heap = MinHeap()
self.max_heap = MaxHeap()
def get_median(self):
min_heap_has_x_more = len(self.min_heap) - len(self.max_heap)
if min_heap_has_x_more > 0:
return self.min_heap.peek()
elif min_heap_has_x_more < 0:
return self.max_heap.peek()
else:
return (self.min_heap.peek() + self.max_heap.peek())/2
def add_val(self, x):
if len(self.min_heap) + len(self.max_heap) == 0:
self.max_heap.push(x)
else:
med = self.get_median()
if x > med:
self.min_heap.push(x)
self._ensure_balance()
elif x < med:
self.max_heap.push(x)
self._ensure_balance()
else:
self.max_heap.push(x)
self._ensure_balance()
def _ensure_balance(self):
size_diff = len(self.min_heap) - len(self.max_heap)
if abs(size_diff) > 1:
if size_diff > 0: # min_heap has 2 more elements
self.max_heap.push(self.min_heap.pop())
else: # max_heap has 2 more elements
self.min_heap.push(self.max_heap.pop())
assert abs(len(self.min_heap) - len(self.max_heap)) < 2
print("Before streaming medians", MinHeap(), sep="\t")
b = StreamingMedian()
b.add_val(5)
b.add_val(100)
assert b.get_median() == 52.5
print("After streaming medians, for MaxHeap", MaxHeap(), sep='\t') # is empty
print("After streaming medians, for MinHeap", MinHeap(), sep='\t') # should be empty
print("After streaming medians, for MinHeap with input", MinHeap([]), sep="\t") # is empty
问题
问题的发生是因为 python 中默认参数的计算方式。它们是在第一次调用函数时计算的,之后该函数与最初计算的值一起使用。如果默认值是可变的(如列表),那么这会带来问题。
那么 MinHeap
发生的事情是:
MinHeap
最初创建并且 l
参数默认参数被分配给一个内存地址。
StreamingMedian
修改 MinHeap
的 self.heap
与 l
指向的内容相同。
- 再次调用
MinHeap
时,l
已有内存地址,再次使用此内存地址与self.heap
绑定。
MaxHeap
不会发生这种情况,因为:
MaxHeap
最初创建并且 l
参数默认参数被分配给一个内存地址。
_invert_sign
创建一个分配给 self.heap
的新列表。默认参数 l
永远不会被修改。
- 再次初始化
MaxHeap
时,l
已经有了内存地址,再次使用,但从未被修改,构造了另一个副本,所以永远不会被修改。
解决方案
而不是:
class MinHeap(object):
def __init__(self, l=[]):
self.heap = l
heapq.heapify(l)
我们应该使用:
class MinHeap(object):
def __init__(self, l=None):
if l is None:
l = []
self.heap = l
heapq.heapify(l)
应该对 MaxHeap
进行类似的更改
我正在尝试实现一个 StreamingMedian
对象以通过连续调用 get_median()
来保持中位数。为此,我还通过 heapq
模块实现了 MinHeap
和 MaxHeap
class。
不过我 运行 遇到了一个非常奇怪的错误。出于某种原因,当我 运行 以下命令时:
print("Before streaming medians", MinHeap(), sep="\t") # is empty
b = StreamingMedian()
b.add_val(5)
b.add_val(100)
assert b.get_median() == 52.5
print("After streaming medians, for MaxHeap", MaxHeap(), sep='\t') # is empty
print("After streaming medians, for MinHeap", MinHeap(), sep='\t') # should be empty
print("After streaming medians, for MinHeap with input",
MinHeap([]), sep="\t") # is empty
我得到以下输出:
Before streaming medians []
After streaming medians, for MaxHeap []
After streaming medians, for MinHeap [100]
After streaming medians, for MinHeap with input []
class 实现可以在下面找到。我在 Python 3.5.2 :: Anaconda 自定义(64 位)上 运行 宁此。
import heapq
class MinHeap(object):
def __init__(self, l=[]):
self.heap = l
heapq.heapify(l)
def peek(self):
return self.heap[0]
def pop(self):
return heapq.heappop(self.heap)
def push(self, x):
heapq.heappush(self.heap, x)
def pushpop(self, x):
return heapq.heappushpop(self.heap, x)
def replace(self, x):
return heapq.heapreplace(self.heap, x)
def __len__(self):
return len(self.heap)
def __str__(self):
return str(self.heap)
class MaxHeap(MinHeap):
def _invert_sign(self, l):
return [-1 * a for a in l]
def __init__(self, l=[]):
super().__init__(self._invert_sign(l))
def push(self, x):
super().push(-1 * x)
def pushpop(self, x):
return super().pushpop(-1 * x)
def replace(self, x):
return super().replace(-1 * x)
def pop(self):
return -1 * super().pop()
def peek(self):
return -1 * super().peek()
def __str__(self):
return str(self._invert_sign(self.heap))
class StreamingMedian():
def __init__(self):
self.min_heap = MinHeap()
self.max_heap = MaxHeap()
def get_median(self):
min_heap_has_x_more = len(self.min_heap) - len(self.max_heap)
if min_heap_has_x_more > 0:
return self.min_heap.peek()
elif min_heap_has_x_more < 0:
return self.max_heap.peek()
else:
return (self.min_heap.peek() + self.max_heap.peek())/2
def add_val(self, x):
if len(self.min_heap) + len(self.max_heap) == 0:
self.max_heap.push(x)
else:
med = self.get_median()
if x > med:
self.min_heap.push(x)
self._ensure_balance()
elif x < med:
self.max_heap.push(x)
self._ensure_balance()
else:
self.max_heap.push(x)
self._ensure_balance()
def _ensure_balance(self):
size_diff = len(self.min_heap) - len(self.max_heap)
if abs(size_diff) > 1:
if size_diff > 0: # min_heap has 2 more elements
self.max_heap.push(self.min_heap.pop())
else: # max_heap has 2 more elements
self.min_heap.push(self.max_heap.pop())
assert abs(len(self.min_heap) - len(self.max_heap)) < 2
print("Before streaming medians", MinHeap(), sep="\t")
b = StreamingMedian()
b.add_val(5)
b.add_val(100)
assert b.get_median() == 52.5
print("After streaming medians, for MaxHeap", MaxHeap(), sep='\t') # is empty
print("After streaming medians, for MinHeap", MinHeap(), sep='\t') # should be empty
print("After streaming medians, for MinHeap with input", MinHeap([]), sep="\t") # is empty
问题
问题的发生是因为 python 中默认参数的计算方式。它们是在第一次调用函数时计算的,之后该函数与最初计算的值一起使用。如果默认值是可变的(如列表),那么这会带来问题。
那么 MinHeap
发生的事情是:
MinHeap
最初创建并且l
参数默认参数被分配给一个内存地址。StreamingMedian
修改MinHeap
的self.heap
与l
指向的内容相同。- 再次调用
MinHeap
时,l
已有内存地址,再次使用此内存地址与self.heap
绑定。
MaxHeap
不会发生这种情况,因为:
MaxHeap
最初创建并且l
参数默认参数被分配给一个内存地址。_invert_sign
创建一个分配给self.heap
的新列表。默认参数l
永远不会被修改。- 再次初始化
MaxHeap
时,l
已经有了内存地址,再次使用,但从未被修改,构造了另一个副本,所以永远不会被修改。
解决方案
而不是:
class MinHeap(object):
def __init__(self, l=[]):
self.heap = l
heapq.heapify(l)
我们应该使用:
class MinHeap(object):
def __init__(self, l=None):
if l is None:
l = []
self.heap = l
heapq.heapify(l)
应该对 MaxHeap