在列表中使用哈希函数时获取相同字符串的不同哈希摘要值
Getting different hash digest values for the same string when using hash functions in a list
在我的程序中,同一个词似乎得到了不同的摘要值。我不确定这是否因为我把哈希函数保存在了列表中(这样我就可以向列表中添加更多哈希函数)。
当我直接使用哈希函数时,同一个词的哈希摘要是相同的;当我使用保存在列表中的哈希函数时,摘要却不同。我做错了什么?
可以正常工作的代码
import hashlib
bloom_len = 100
def bytes_to_int(hash_value):
    """Interpret a digest as one unsigned big-endian integer.

    Args:
        hash_value: Bytes-like object, e.g. the result of ``digest()``.

    Returns:
        The integer encoded by ``hash_value``; ``0`` for empty input.
    """
    return int.from_bytes(hash_value, byteorder='big')  # big-endian format
def bloom_index(hashint):
    """Reduce an integer hash to a Bloom-filter slot index.

    Args:
        hashint: Integer to reduce.

    Returns:
        ``hashint`` modulo the module-level ``bloom_len``.
    """
    return hashint % bloom_len
def hashIt(word):
    """Print the Bloom-filter index that each of five hashes gives ``word``.

    A new hash object is built for every algorithm on every call, so the same
    ``word`` always prints the same list of indices.

    Args:
        word: Bytes to hash (encode ``str`` input before calling).
    """
    algorithms = (
        hashlib.md5,
        hashlib.sha1,
        hashlib.sha256,
        hashlib.sha3_512,
        hashlib.blake2s,
    )
    indices = []
    for algorithm in algorithms:
        hasher = algorithm()  # fresh object: no input left over from earlier calls
        hasher.update(word)
        indices.append(bloom_index(bytes_to_int(hasher.digest())))
    print(indices)
# Hash 'sent', then another word, then 'sent' again, so the first and last
# printed index lists can be compared.
for inputWord in ('sent', 'blue', 'sent'):
    word = inputWord.encode('utf-8')
    hashIt(word)
不能正常工作的代码
import hashlib
class BloomFilter():
    """Bloom filter over a fixed-size 0/1 list, indexed by three hashlib digests.

    NOTE: this is the failing variant that the surrounding text asks about; its
    behaviour is kept on purpose. The hash objects are created once in
    ``__init__`` and reused by every ``getIndices`` call. ``update()`` appends
    to the data an object has already been fed, so the second call for 'sent'
    really hashes b'sentsent' and yields different indices.
    """

    def __init__(self, length = 100):
        """Create a filter with ``length`` slots, all set to 0."""
        self.bloomFilterLen = length
        self.bloomFilterArray = [0] * self.bloomFilterLen
        # Hash objects shared by all later calls (the cause of the problem).
        m1 = hashlib.md5()
        m2 = hashlib.sha3_512()
        m3 = hashlib.blake2s()
        self.hashes = [m1, m2, m3]

    def encode(self, inputWord):
        """Return ``inputWord`` encoded as UTF-8 bytes."""
        encoded_word = inputWord.encode('utf-8')
        return encoded_word

    def bytes_to_int(self, hash_value):
        """Interpret the digest ``hash_value`` as a big-endian integer."""
        return int.from_bytes(hash_value, byteorder='big')

    def bloom_index(self, hashint):
        """Return ``hashint`` modulo the filter length."""
        return hashint % self.bloomFilterLen

    def getIndices(self, inputWord):
        """Return one slot index per hash object for ``inputWord``.

        Prints the encoded word and, for each hash object, the object and its
        digest. Each call feeds more data into the shared hash objects, so the
        result depends on every word passed in before.
        """
        word = self.encode(inputWord)
        print(word)
        hashDigests = []
        for hashFunction in self.hashes:
            hashFunction.update(word)
            print('hashFunction ', hashFunction , '\n')
            print('hashDigest ', hashFunction.digest() , '\n')
            hashDigests.append(hashFunction.digest())
        hashInts = [self.bytes_to_int(h) for h in hashDigests]
        bloomFilterIndices = [self.bloom_index(hInt) for hInt in hashInts]
        return bloomFilterIndices

    def insert(self, inputWord):
        """Set the slots computed for ``inputWord`` to 1 and print the indices."""
        bloomFilterIndices = self.getIndices(inputWord)
        for index in bloomFilterIndices:
            self.bloomFilterArray[index] = 1
        print(bloomFilterIndices)

    def lookup(self, inputWord):
        """Return False if any slot computed for ``inputWord`` is 0, else True.

        Prints the indices and, for each slot inspected, its index and value.
        """
        bloomFilterIndices = self.getIndices(inputWord)
        print('Inside lookup')
        print(bloomFilterIndices)
        for idx in bloomFilterIndices:
            print('idx value ', idx)
            print('self.bloomFilterArray[idx] value ', self.bloomFilterArray[idx])
            if self.bloomFilterArray[idx] == 0:
                # Indicates word not present in the bloom filter
                return False
        return True
if __name__ == '__main__':
    # Insert a word and immediately look the same word up again.
    word = 'sent'
    bloomFilter = BloomFilter()
    bloomFilter.insert(word)
    print(bloomFilter.lookup(word))
对于第一个程序 - 我始终得到相同的整数索引
- “sent”的索引
[61, 82, 5, 53, 87]
- “blue”的索引
[95, 25, 24, 69, 85]
- “sent”的索引
[61, 82, 5, 53, 87]
对于不能正常工作的程序,整数索引不同,打印出来的哈希摘要也不同
- “sent”的索引 - 第一次,通过 insert
[61, 53, 87]
HashDigest
来自 MD5
for 'sent'
hashDigest b'x\x91\x83\xb7\xe9\x86F\xc1\x1d_\x05D\xc8\xf3\xc4\xc9'
- “sent”的索引 - 第二次,通过 lookup
[70, 89, 8]
HashDigest
来自 MD5
for 'sent'
hashDigest b'\x95\x17bC\x17\x80\xb5\x9d]x\xca$\xda\x89\x06\x16'
所以我更改了 __init__ 中的代码
从
m1 = hashlib.md5()
m2 = hashlib.sha3_512()
m3 = hashlib.blake2s()
self.hashes = [m1, m2, m3]
到
self.hashes = ['md5', 'sha3_512', 'blake2s']
然后在方法 getIndices() 的 for 循环内
从
for hashFunction in self.hashes:
hashFunction.update(word)
到
for hashName in self.hashes:
hashFunction = hashlib.new(hashName)
hashFunction.update(word)
现在可以使用了!
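哈希对象是有状态的:每次调用 update() 都会把新数据追加到该对象之前已经接收的数据之后,所以第二次对 'sent' 求哈希时,实际计算的是 'sentsent' 的摘要。因此同一个哈希对象不能在多次调用之间重复使用,您可以把这些哈希对象的创建移到您的 getIndices 函数中: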
import hashlib
class BloomFilter():
    """Bloom filter over a fixed-size 0/1 list, indexed by three hashlib digests.

    Hash objects are created inside ``getIndices`` on every call, so a given
    word always maps to the same indices.
    """

    def __init__(self, length = 100):
        """Create a filter with ``length`` slots, all set to 0."""
        self.bloomFilterLen = length
        self.bloomFilterArray = [0] * self.bloomFilterLen

    def encode(self, inputWord):
        """Return ``inputWord`` encoded as UTF-8 bytes."""
        return inputWord.encode('utf-8')

    def bytes_to_int(self, hash_value):
        """Interpret the digest ``hash_value`` as a big-endian integer."""
        return int.from_bytes(hash_value, byteorder='big')

    def bloom_index(self, hashint):
        """Return ``hashint`` modulo the filter length."""
        return hashint % self.bloomFilterLen

    def getIndices(self, inputWord):
        """Return one slot index per hash algorithm for ``inputWord``.

        Prints the encoded word and, for each algorithm, the hash object and
        its digest.
        """
        # New objects on every call: update() accumulates input, so reused
        # objects would mix this word with previously hashed ones.
        hashes = [hashlib.md5(), hashlib.sha3_512(), hashlib.blake2s()]
        word = self.encode(inputWord)
        print(word)
        bloomFilterIndices = []
        for hashFunction in hashes:
            hashFunction.update(word)
            digest = hashFunction.digest()  # computed once, printed and reused
            print('hashFunction ', hashFunction , '\n')
            print('hashDigest ', digest , '\n')
            bloomFilterIndices.append(self.bloom_index(self.bytes_to_int(digest)))
        return bloomFilterIndices

    def insert(self, inputWord):
        """Set the slots of ``inputWord`` to 1 and print its indices.

        The indices are then recomputed and printed a second time; with fresh
        hash objects per call both printed lists are identical.
        """
        bloomFilterIndices = self.getIndices(inputWord)
        for index in bloomFilterIndices:
            self.bloomFilterArray[index] = 1
        print(bloomFilterIndices)
        bloomFilterIndices = self.getIndices(inputWord)
        print(bloomFilterIndices)

    def lookup(self, inputWord):
        """Return False if any slot of ``inputWord`` is 0, else True.

        Prints the indices and, for each slot inspected, its index and value.
        """
        print('Inside lookup')
        bloomFilterIndices = self.getIndices(inputWord)
        print(bloomFilterIndices)
        for idx in bloomFilterIndices:
            print('idx value ', idx)
            print('self.bloomFilterArray[idx] value ', self.bloomFilterArray[idx])
            if self.bloomFilterArray[idx] == 0:
                # Indicates word not present in the bloom filter
                return False
        return True
if __name__ == '__main__':
    # Insert a word and immediately look the same word up again.
    word = 'sent'
    bloomFilter = BloomFilter()
    bloomFilter.insert(word)
    print(bloomFilter.lookup(word))
在我的程序中,同一个词似乎得到了不同的摘要值。我不确定这是否因为我把哈希函数保存在了列表中(这样我就可以向列表中添加更多哈希函数)。
当我直接使用哈希函数时,同一个词的哈希摘要是相同的;当我使用保存在列表中的哈希函数时,摘要却不同。我做错了什么?
可以正常工作的代码
import hashlib
bloom_len = 100
def bytes_to_int(hash_value):
    """Interpret a digest as one unsigned big-endian integer.

    Args:
        hash_value: Bytes-like object, e.g. the result of ``digest()``.

    Returns:
        The integer encoded by ``hash_value``; ``0`` for empty input.
    """
    return int.from_bytes(hash_value, byteorder='big')  # big-endian format
def bloom_index(hashint):
    """Reduce an integer hash to a Bloom-filter slot index.

    Args:
        hashint: Integer to reduce.

    Returns:
        ``hashint`` modulo the module-level ``bloom_len``.
    """
    return hashint % bloom_len
def hashIt(word):
    """Print the Bloom-filter index that each of five hashes gives ``word``.

    A new hash object is built for every algorithm on every call, so the same
    ``word`` always prints the same list of indices.

    Args:
        word: Bytes to hash (encode ``str`` input before calling).
    """
    algorithms = (
        hashlib.md5,
        hashlib.sha1,
        hashlib.sha256,
        hashlib.sha3_512,
        hashlib.blake2s,
    )
    indices = []
    for algorithm in algorithms:
        hasher = algorithm()  # fresh object: no input left over from earlier calls
        hasher.update(word)
        indices.append(bloom_index(bytes_to_int(hasher.digest())))
    print(indices)
# Hash 'sent', then another word, then 'sent' again, so the first and last
# printed index lists can be compared.
for inputWord in ('sent', 'blue', 'sent'):
    word = inputWord.encode('utf-8')
    hashIt(word)
不能正常工作的代码
import hashlib
class BloomFilter():
    """Bloom filter over a fixed-size 0/1 list, indexed by three hashlib digests.

    NOTE: this is the failing variant that the surrounding text asks about; its
    behaviour is kept on purpose. The hash objects are created once in
    ``__init__`` and reused by every ``getIndices`` call. ``update()`` appends
    to the data an object has already been fed, so the second call for 'sent'
    really hashes b'sentsent' and yields different indices.
    """

    def __init__(self, length = 100):
        """Create a filter with ``length`` slots, all set to 0."""
        self.bloomFilterLen = length
        self.bloomFilterArray = [0] * self.bloomFilterLen
        # Hash objects shared by all later calls (the cause of the problem).
        m1 = hashlib.md5()
        m2 = hashlib.sha3_512()
        m3 = hashlib.blake2s()
        self.hashes = [m1, m2, m3]

    def encode(self, inputWord):
        """Return ``inputWord`` encoded as UTF-8 bytes."""
        encoded_word = inputWord.encode('utf-8')
        return encoded_word

    def bytes_to_int(self, hash_value):
        """Interpret the digest ``hash_value`` as a big-endian integer."""
        return int.from_bytes(hash_value, byteorder='big')

    def bloom_index(self, hashint):
        """Return ``hashint`` modulo the filter length."""
        return hashint % self.bloomFilterLen

    def getIndices(self, inputWord):
        """Return one slot index per hash object for ``inputWord``.

        Prints the encoded word and, for each hash object, the object and its
        digest. Each call feeds more data into the shared hash objects, so the
        result depends on every word passed in before.
        """
        word = self.encode(inputWord)
        print(word)
        hashDigests = []
        for hashFunction in self.hashes:
            hashFunction.update(word)
            print('hashFunction ', hashFunction , '\n')
            print('hashDigest ', hashFunction.digest() , '\n')
            hashDigests.append(hashFunction.digest())
        hashInts = [self.bytes_to_int(h) for h in hashDigests]
        bloomFilterIndices = [self.bloom_index(hInt) for hInt in hashInts]
        return bloomFilterIndices

    def insert(self, inputWord):
        """Set the slots computed for ``inputWord`` to 1 and print the indices."""
        bloomFilterIndices = self.getIndices(inputWord)
        for index in bloomFilterIndices:
            self.bloomFilterArray[index] = 1
        print(bloomFilterIndices)

    def lookup(self, inputWord):
        """Return False if any slot computed for ``inputWord`` is 0, else True.

        Prints the indices and, for each slot inspected, its index and value.
        """
        bloomFilterIndices = self.getIndices(inputWord)
        print('Inside lookup')
        print(bloomFilterIndices)
        for idx in bloomFilterIndices:
            print('idx value ', idx)
            print('self.bloomFilterArray[idx] value ', self.bloomFilterArray[idx])
            if self.bloomFilterArray[idx] == 0:
                # Indicates word not present in the bloom filter
                return False
        return True
if __name__ == '__main__':
    # Insert a word and immediately look the same word up again.
    word = 'sent'
    bloomFilter = BloomFilter()
    bloomFilter.insert(word)
    print(bloomFilter.lookup(word))
对于第一个程序 - 我始终得到相同的整数索引
- “sent”的索引
[61, 82, 5, 53, 87]
- “blue”的索引
[95, 25, 24, 69, 85]
- “sent”的索引
[61, 82, 5, 53, 87]
对于不能正常工作的程序,整数索引不同,打印出来的哈希摘要也不同
- “sent”的索引 - 第一次,通过 insert
[61, 53, 87]
HashDigest
来自 MD5
for 'sent'
hashDigest b'x\x91\x83\xb7\xe9\x86F\xc1\x1d_\x05D\xc8\xf3\xc4\xc9'
- “sent”的索引 - 第二次,通过 lookup
[70, 89, 8]
HashDigest
来自 MD5
for 'sent'
hashDigest b'\x95\x17bC\x17\x80\xb5\x9d]x\xca$\xda\x89\x06\x16'
所以我更改了 __init__ 中的代码
从
m1 = hashlib.md5()
m2 = hashlib.sha3_512()
m3 = hashlib.blake2s()
self.hashes = [m1, m2, m3]
到
self.hashes = ['md5', 'sha3_512', 'blake2s']
然后在方法 getIndices() 的 for 循环内
从
for hashFunction in self.hashes:
hashFunction.update(word)
到
for hashName in self.hashes:
hashFunction = hashlib.new(hashName)
hashFunction.update(word)
现在可以使用了!
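哈希对象是有状态的:每次调用 update() 都会把新数据追加到该对象之前已经接收的数据之后,所以第二次对 'sent' 求哈希时,实际计算的是 'sentsent' 的摘要。因此同一个哈希对象不能在多次调用之间重复使用,您可以把这些哈希对象的创建移到您的 getIndices 函数中: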
import hashlib
class BloomFilter():
    """Bloom filter over a fixed-size 0/1 list, indexed by three hashlib digests.

    Hash objects are created inside ``getIndices`` on every call, so a given
    word always maps to the same indices.
    """

    def __init__(self, length = 100):
        """Create a filter with ``length`` slots, all set to 0."""
        self.bloomFilterLen = length
        self.bloomFilterArray = [0] * self.bloomFilterLen

    def encode(self, inputWord):
        """Return ``inputWord`` encoded as UTF-8 bytes."""
        return inputWord.encode('utf-8')

    def bytes_to_int(self, hash_value):
        """Interpret the digest ``hash_value`` as a big-endian integer."""
        return int.from_bytes(hash_value, byteorder='big')

    def bloom_index(self, hashint):
        """Return ``hashint`` modulo the filter length."""
        return hashint % self.bloomFilterLen

    def getIndices(self, inputWord):
        """Return one slot index per hash algorithm for ``inputWord``.

        Prints the encoded word and, for each algorithm, the hash object and
        its digest.
        """
        # New objects on every call: update() accumulates input, so reused
        # objects would mix this word with previously hashed ones.
        hashes = [hashlib.md5(), hashlib.sha3_512(), hashlib.blake2s()]
        word = self.encode(inputWord)
        print(word)
        bloomFilterIndices = []
        for hashFunction in hashes:
            hashFunction.update(word)
            digest = hashFunction.digest()  # computed once, printed and reused
            print('hashFunction ', hashFunction , '\n')
            print('hashDigest ', digest , '\n')
            bloomFilterIndices.append(self.bloom_index(self.bytes_to_int(digest)))
        return bloomFilterIndices

    def insert(self, inputWord):
        """Set the slots of ``inputWord`` to 1 and print its indices.

        The indices are then recomputed and printed a second time; with fresh
        hash objects per call both printed lists are identical.
        """
        bloomFilterIndices = self.getIndices(inputWord)
        for index in bloomFilterIndices:
            self.bloomFilterArray[index] = 1
        print(bloomFilterIndices)
        bloomFilterIndices = self.getIndices(inputWord)
        print(bloomFilterIndices)

    def lookup(self, inputWord):
        """Return False if any slot of ``inputWord`` is 0, else True.

        Prints the indices and, for each slot inspected, its index and value.
        """
        print('Inside lookup')
        bloomFilterIndices = self.getIndices(inputWord)
        print(bloomFilterIndices)
        for idx in bloomFilterIndices:
            print('idx value ', idx)
            print('self.bloomFilterArray[idx] value ', self.bloomFilterArray[idx])
            if self.bloomFilterArray[idx] == 0:
                # Indicates word not present in the bloom filter
                return False
        return True
if __name__ == '__main__':
    # Insert a word and immediately look the same word up again.
    word = 'sent'
    bloomFilter = BloomFilter()
    bloomFilter.insert(word)
    print(bloomFilter.lookup(word))