如何在 Python 自定义字典中实现整数类型键的一致性哈希?
How to implement consistent hashing of integer type keys in Python custom dictionary?
我正在尝试在 Python 中从头开始构建字典。我已经完成了大部分工作,但遇到了一个小问题。首先,我首先要说我正在使用内置的 Python hash() 来获取键的 hash_result (键可以是 int 或 str)然后索引由 hash_result 字典容量的百分比。如果密钥是一串字符,则一切正常。一旦键是整数,我的自定义字典就会崩溃。有时一切正常,其他时候,键在添加到字典时得到哈希值 0(例如),但在字典中搜索键 returns 一个 KeyError,因为键映射到索引 0 而不是 4。我相信起初,索引是通过 hash(key) % capacity(例如 4) 计算的,但是一旦容量增加 x2,函数 hash(key) % capacity(now 8 because x2) 返回的索引不同,这导致了问题。我在维基百科上看到了这个公式(hash(key) % capacity)。我有兴趣了解这是否是我面临的问题,如果不是,究竟是什么导致了这种不良行为以及如何解决它。
下面是我的代码:
class MyDictionary:
__LOAD_FACTOR_LIMIT = 0.75
__DEFAULT_CAPACITY = 4
def __init__(self):
self.__capacity = self.__DEFAULT_CAPACITY
self.__keys = [[] for i in range(self.__capacity)]
self.__values = [[] for i in range(self.__capacity)]
@property
def keys(self):
return [item for current_list in self.__keys for item in current_list]
@property
def values(self):
return [value for value_list in self.__values for value in value_list]
def __setitem__(self, key, value):
while self.__compute_load_factor() >= self.__LOAD_FACTOR_LIMIT:
self.__extend_dict()
index_hash = self.__hash_function(key)
if self.__is_key_in_dict(index_hash, key):
self.__set_value_to_an_existing_key(index_hash, key, value)
return
self.__set_value_to_a_new_key(index_hash, key, value)
def __getitem__(self, key):
index_hash = self.__hash_function(key)
if self.__is_key_in_dict(index_hash, key):
index_bucket = self.__get_index_bucket(index_hash, key)
return self.__values[index_hash][index_bucket]
raise KeyError('Key is not in dictionary!')
def __str__(self):
key_values = zip(self.keys, self.values)
result = '{' + ", ".join([f"{key}: {value}"
if isinstance(key, int) else f"'{key}': {value}"
for key, value in key_values]) + '}'
return result
def __hash_function(self, key):
index_hash = hash(key) % self.__capacity
return index_hash
def __is_key_in_dict(self, index_hash, key):
if key in self.__keys[index_hash]:
return True
return False
def __get_index_bucket(self, index_hash, key):
index_bucket = self.__keys[index_hash].index(key)
return index_bucket
def __extend_dict(self):
self.__keys += [[] for i in range(self.__capacity)]
self.__values += [[] for i in range(self.__capacity)]
self.__capacity *= 2
def __set_value_to_an_existing_key(self, index_hash, key, value):
index_bucket = self.__get_index_bucket(index_hash, key)
self.__values[index_hash][index_bucket] = value
def __set_value_to_a_new_key(self, index_hash, key, value):
self.__keys[index_hash].append(key)
self.__values[index_hash].append(value)
def __compute_load_factor(self):
k = len(self.__keys)
n = len([bucket for bucket in self.__keys if bucket])
return n / k
def get(self, key, return_value=None):
try:
index_hash = self.__hash_function(key)
index_bucket = self.__get_index_bucket(index_hash, key)
if self.__is_key_in_dict(index_hash, key):
return self.__keys[index_hash][index_bucket]
raise KeyError('Key is not in dictionary!')
except KeyError:
return return_value
def add(self):
pass
def pop(self):
pass
def clear(self):
self.__capacity = self.__DEFAULT_CAPACITY
self.__keys = [[] for i in range(self.__capacity)]
self.__values = [[] for i in range(self.__capacity)]
def items(self):
zipped_key_value = zip(self.keys, self.values)
return [item for item in zipped_key_value]
dictionary = MyDictionary()
dictionary.add()
dictionary[4] = 'hey'
dictionary['2'] = 'cya'
dictionary['4'] = 'welcome'
dictionary['5'] = 'welcome'
dictionary['32'] = 'heya'
dictionary['31'] = 'heya'
dictionary['36'] = 'heya'
dictionary['34'] = 'heya'
print(dictionary[4])
这是因为当负载超过阈值时,您通过调用 __extend_dict
方法来增加容量(存储在 __capacity
属性中),这使得存储桶的索引存储的现有值不再有效,因为您总是通过对容量取模来导出索引。
因此,每次增加字典的容量时,您都应该在新索引处重新插入现有的键和值:
def __extend_dict(self):
self.__capacity *= 2
new_keys = [[] for _ in range(self.__capacity)]
new_values = [[] for _ in range(self.__capacity)]
for keys, values in zip(self.__keys, self.__values):
for key, value in zip(keys, values):
index_hash = self.__hash_function(key)
new_keys[index_hash].append(key)
new_values[index_hash].append(value)
self.__keys = new_keys
self.__values = new_values
我正在尝试在 Python 中从头开始构建字典。我已经完成了大部分工作,但遇到了一个小问题。首先,我首先要说我正在使用内置的 Python hash() 来获取键的 hash_result (键可以是 int 或 str)然后索引由 hash_result 字典容量的百分比。如果密钥是一串字符,则一切正常。一旦键是整数,我的自定义字典就会崩溃。有时一切正常,其他时候,键在添加到字典时得到哈希值 0(例如),但在字典中搜索键 returns 一个 KeyError,因为键映射到索引 0 而不是 4。我相信起初,索引是通过 hash(key) % capacity(例如 4) 计算的,但是一旦容量增加 x2,函数 hash(key) % capacity(now 8 because x2) 返回的索引不同,这导致了问题。我在维基百科上看到了这个公式(hash(key) % capacity)。我有兴趣了解这是否是我面临的问题,如果不是,究竟是什么导致了这种不良行为以及如何解决它。 下面是我的代码:
class MyDictionary:
__LOAD_FACTOR_LIMIT = 0.75
__DEFAULT_CAPACITY = 4
def __init__(self):
self.__capacity = self.__DEFAULT_CAPACITY
self.__keys = [[] for i in range(self.__capacity)]
self.__values = [[] for i in range(self.__capacity)]
@property
def keys(self):
return [item for current_list in self.__keys for item in current_list]
@property
def values(self):
return [value for value_list in self.__values for value in value_list]
def __setitem__(self, key, value):
while self.__compute_load_factor() >= self.__LOAD_FACTOR_LIMIT:
self.__extend_dict()
index_hash = self.__hash_function(key)
if self.__is_key_in_dict(index_hash, key):
self.__set_value_to_an_existing_key(index_hash, key, value)
return
self.__set_value_to_a_new_key(index_hash, key, value)
def __getitem__(self, key):
index_hash = self.__hash_function(key)
if self.__is_key_in_dict(index_hash, key):
index_bucket = self.__get_index_bucket(index_hash, key)
return self.__values[index_hash][index_bucket]
raise KeyError('Key is not in dictionary!')
def __str__(self):
key_values = zip(self.keys, self.values)
result = '{' + ", ".join([f"{key}: {value}"
if isinstance(key, int) else f"'{key}': {value}"
for key, value in key_values]) + '}'
return result
def __hash_function(self, key):
index_hash = hash(key) % self.__capacity
return index_hash
def __is_key_in_dict(self, index_hash, key):
if key in self.__keys[index_hash]:
return True
return False
def __get_index_bucket(self, index_hash, key):
index_bucket = self.__keys[index_hash].index(key)
return index_bucket
def __extend_dict(self):
self.__keys += [[] for i in range(self.__capacity)]
self.__values += [[] for i in range(self.__capacity)]
self.__capacity *= 2
def __set_value_to_an_existing_key(self, index_hash, key, value):
index_bucket = self.__get_index_bucket(index_hash, key)
self.__values[index_hash][index_bucket] = value
def __set_value_to_a_new_key(self, index_hash, key, value):
self.__keys[index_hash].append(key)
self.__values[index_hash].append(value)
def __compute_load_factor(self):
k = len(self.__keys)
n = len([bucket for bucket in self.__keys if bucket])
return n / k
def get(self, key, return_value=None):
try:
index_hash = self.__hash_function(key)
index_bucket = self.__get_index_bucket(index_hash, key)
if self.__is_key_in_dict(index_hash, key):
return self.__keys[index_hash][index_bucket]
raise KeyError('Key is not in dictionary!')
except KeyError:
return return_value
def add(self):
pass
def pop(self):
pass
def clear(self):
self.__capacity = self.__DEFAULT_CAPACITY
self.__keys = [[] for i in range(self.__capacity)]
self.__values = [[] for i in range(self.__capacity)]
def items(self):
zipped_key_value = zip(self.keys, self.values)
return [item for item in zipped_key_value]
dictionary = MyDictionary()
dictionary.add()
dictionary[4] = 'hey'
dictionary['2'] = 'cya'
dictionary['4'] = 'welcome'
dictionary['5'] = 'welcome'
dictionary['32'] = 'heya'
dictionary['31'] = 'heya'
dictionary['36'] = 'heya'
dictionary['34'] = 'heya'
print(dictionary[4])
这是因为当负载超过阈值时,您通过调用 __extend_dict
方法来增加容量(存储在 __capacity
属性中),这使得存储桶的索引存储的现有值不再有效,因为您总是通过对容量取模来导出索引。
因此,每次增加字典的容量时,您都应该在新索引处重新插入现有的键和值:
def __extend_dict(self):
self.__capacity *= 2
new_keys = [[] for _ in range(self.__capacity)]
new_values = [[] for _ in range(self.__capacity)]
for keys, values in zip(self.__keys, self.__values):
for key, value in zip(keys, values):
index_hash = self.__hash_function(key)
new_keys[index_hash].append(key)
new_values[index_hash].append(value)
self.__keys = new_keys
self.__values = new_values