Java,使 murmur2 哈希在字节数组的一部分上工作
Java, make murmur2 hash work on portion of byte array
我在字节数组上使用 murmur2 哈希,但我只想对字节的一个子集进行哈希处理,murmur2 只允许我从 0 开始对数组进行哈希处理,但我想指定一个非 0 的起始偏移量以及数组中的结束偏移量。
*
* @param data byte array to hash
* @param length length of the array to hash
* @param seed initial seed value
* @return 32 bit hash of the given array
*/
public static int hash32(final byte[] data, int length, int seed) {
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
final int m = 0x5bd1e995;
final int r = 24;
// Initialize the hash to a random value
int h = seed^length;
int length4 = length/4;
for (int i=0; i<length4; i++) {
final int i4 = i*4;
int k = (data[i4+0]&0xff) +((data[i4+1]&0xff)<<8)
+((data[i4+2]&0xff)<<16) +((data[i4+3]&0xff)<<24);
k *= m;
k ^= k >>> r;
k *= m;
h *= m;
h ^= k;
}
// Handle the last few bytes of the input array
switch (length%4) {
case 3: h ^= (data[(length&~3) +2]&0xff) << 16;
case 2: h ^= (data[(length&~3) +1]&0xff) << 8;
case 1: h ^= (data[length&~3]&0xff);
h *= m;
}
h ^= h >>> 13;
h *= m;
h ^= h >>> 15;
return h;
}
我尝试了各种更改,但它总是导致我的哈希冲突测试从 0 变为非常高的数字。我不想使用 murmur3,因为它不适合像 murmur2 这样的单一小方法,murmur2 在我的测试中也快一点。
这是我的碰撞测试器,供任何想要尝试的人使用
HashSet<Integer> hs = new HashSet<>(100000000,(float) 1.0);
long collide = 0;
long totalLoops = 0;
byte[] ba = new byte[4];
long sTime = System.currentTimeMillis();
int hash;
for(byte d=0; d<5; d++) {
ba[0] = d;
for(byte i=-128; i<127; i++) {
ba[1] = i;
for(byte k=-128; k<127; k++) {
ba[2] = k;
for(byte j=-128; j<127; j++) {
ba[3] = j;
hash = hash32(ba,ba.length,0x9747b28c);
if(hs.contains(hash)) {
collide++;
} else {
hs.add(hash);
}
totalLoops++;
}
}
}
}
注意:以上碰撞测试需要8GB内存的电脑。
我发现了一个 murmurhash3 实现,它为感兴趣的人使用偏移量,问题已解决。
public static int murmurhash3_x86_32(byte[] 数据, int 偏移量, int len, int seed) {
final int c1 = 0xcc9e2d51;
final int c2 = 0x1b873593;
int h1 = seed;
int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block
for (int i=offset; i<roundedEnd; i+=4) {
// little endian load order
int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
}
// tail
int k1 = 0;
switch(len & 0x03) {
case 3:
k1 = (data[roundedEnd + 2] & 0xff) << 16;
// fallthrough
case 2:
k1 |= (data[roundedEnd + 1] & 0xff) << 8;
// fallthrough
case 1:
k1 |= (data[roundedEnd] & 0xff);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
}
// finalization
h1 ^= len;
// fmix(h1);
h1 ^= h1 >>> 16;
h1 *= 0x85ebca6b;
h1 ^= h1 >>> 13;
h1 *= 0xc2b2ae35;
h1 ^= h1 >>> 16;
return h1;
}
我在字节数组上使用 murmur2 哈希,但我只想对字节的一个子集进行哈希处理,murmur2 只允许我从 0 开始对数组进行哈希处理,但我想指定一个非 0 的起始偏移量以及数组中的结束偏移量。
*
* @param data byte array to hash
* @param length length of the array to hash
* @param seed initial seed value
* @return 32 bit hash of the given array
*/
public static int hash32(final byte[] data, int length, int seed) {
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
final int m = 0x5bd1e995;
final int r = 24;
// Initialize the hash to a random value
int h = seed^length;
int length4 = length/4;
for (int i=0; i<length4; i++) {
final int i4 = i*4;
int k = (data[i4+0]&0xff) +((data[i4+1]&0xff)<<8)
+((data[i4+2]&0xff)<<16) +((data[i4+3]&0xff)<<24);
k *= m;
k ^= k >>> r;
k *= m;
h *= m;
h ^= k;
}
// Handle the last few bytes of the input array
switch (length%4) {
case 3: h ^= (data[(length&~3) +2]&0xff) << 16;
case 2: h ^= (data[(length&~3) +1]&0xff) << 8;
case 1: h ^= (data[length&~3]&0xff);
h *= m;
}
h ^= h >>> 13;
h *= m;
h ^= h >>> 15;
return h;
}
我尝试了各种更改,但它总是导致我的哈希冲突测试从 0 变为非常高的数字。我不想使用 murmur3,因为它不适合像 murmur2 这样的单一小方法,murmur2 在我的测试中也快一点。
这是我的碰撞测试器,供任何想要尝试的人使用
HashSet<Integer> hs = new HashSet<>(100000000,(float) 1.0);
long collide = 0;
long totalLoops = 0;
byte[] ba = new byte[4];
long sTime = System.currentTimeMillis();
int hash;
for(byte d=0; d<5; d++) {
ba[0] = d;
for(byte i=-128; i<127; i++) {
ba[1] = i;
for(byte k=-128; k<127; k++) {
ba[2] = k;
for(byte j=-128; j<127; j++) {
ba[3] = j;
hash = hash32(ba,ba.length,0x9747b28c);
if(hs.contains(hash)) {
collide++;
} else {
hs.add(hash);
}
totalLoops++;
}
}
}
}
注意:以上碰撞测试需要8GB内存的电脑。
我发现了一个 murmurhash3 实现,它为感兴趣的人使用偏移量,问题已解决。
public static int murmurhash3_x86_32(byte[] 数据, int 偏移量, int len, int seed) {
final int c1 = 0xcc9e2d51;
final int c2 = 0x1b873593;
int h1 = seed;
int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block
for (int i=offset; i<roundedEnd; i+=4) {
// little endian load order
int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
}
// tail
int k1 = 0;
switch(len & 0x03) {
case 3:
k1 = (data[roundedEnd + 2] & 0xff) << 16;
// fallthrough
case 2:
k1 |= (data[roundedEnd + 1] & 0xff) << 8;
// fallthrough
case 1:
k1 |= (data[roundedEnd] & 0xff);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
}
// finalization
h1 ^= len;
// fmix(h1);
h1 ^= h1 >>> 16;
h1 *= 0x85ebca6b;
h1 ^= h1 >>> 13;
h1 *= 0xc2b2ae35;
h1 ^= h1 >>> 16;
return h1;
}