如何计算 unordered_set C++ 中的冲突
How to count collisions in unordered_set c++
我想计算一些关于我的哈希函数的统计数据(比如 max/avg 碰撞量)。我编写了虚拟哈希函数(将所有键映射到 1)并等待看到 max/avg 碰撞的数量等于键的数量。但是我对不同的功能有相同的数量。有人可以解释一下吗?
代码:
#include <iostream>
#include <unordered_set>
struct DummyHash
{
size_t operator()(int key) const
{
return static_cast<size_t>(1);
}
};
int main()
{
std::unordered_set<int, DummyHash> a;
std::unordered_set<int> b;
int c = 10000;
for (int i = 0; i < c; i++)
{
a.insert(i);
}
std::cout << "a ended" << std::endl;
for (int i = 0; i < c; i++)
{
b.insert(i);
}
std::cout << "b ended" << std::endl;
std::cout << "a = " << a.max_load_factor() << ' ' << a.load_factor() << ' '
<< a.max_size() << ' ' << a.max_bucket_count() << ' ' << a.bucket_count() << '\n';
std::cout << "b = " << b.max_load_factor() << ' ' << b.load_factor() << ' '
<< b.max_size() << ' ' << b.max_bucket_count() << ' ' << b.bucket_count() << '\n';
return 0;
}
结果:
a ended
b ended
a = 1 0.659065 768614336404564650 768614336404564650 15173
b = 1 0.659065 1152921504606846975 1152921504606846975 15173
您使用的函数不提供碰撞计数,您可能希望在 https://en.cppreference.com/w/cpp/container/unordered_set
上阅读它们的文档
计算桶冲突统计数据的一种方法是检查每个桶中的元素数量:
struct BucketStats {
size_t occupied = 0;
size_t total_collisions = 0;
size_t max_collisions = 0;
template<class... Args>
BucketStats(std::unordered_set<Args...> const& c)
{
for(auto bucket = c.bucket_count(); bucket--;) {
auto bucket_size = c.bucket_size(bucket);
occupied += bucket_size > 0;
if(bucket_size > 1) {
auto collisions = bucket_size - 1;
total_collisions += collisions;
max_collisions = std::max(max_collisions, collisions);
}
}
}
double avg_collisions() const {
return occupied ? static_cast<double>(total_collisions) / occupied : 0;
}
friend std::ostream& operator<<(std::ostream& s, BucketStats const& b) {
return s
<< "used buckets: " << b.occupied
<< "; total collisions: " << b.total_collisions
<< "; max collisions in a bucket: " << b.max_collisions
<< "; avg collisions per bucket: " << b.avg_collisions()
;
}
};
// ...
std::cout << BucketStats(a) << '\n';
std::cout << BucketStats(b) << '\n';
输出:
used buckets: 1; total collisions: 9999; max collisions in a bucket: 9999; avg collisions per bucket: 9999
used buckets: 10000; total collisions: 0; max collisions in a bucket: 0; avg collisions per bucket: 0
std::unordered_map
将增加 bucket_count
以试图使 load_factor
接近 max_load_factor
。
也就是说bucket_count
只取决于地图中元素的数量,不受碰撞次数的影响。
要检查冲突,计算桶大小 > 1 的所有元素。
size_t collisions = 0, empty = 0;
for (auto bucket = a.bucket_count(); bucket--;) {
if (a.bucket_size(bucket) == 0)
empty++;
else
collisions += a.bucket_size(bucket) - 1;
}
std::cout << "a = " << a.max_load_factor() << ' ' << a.load_factor() << ' '
<< ' ' << a.bucket_count() << ' ' << collisions << ' ' << empty << '\n';
empty = 0, collisions = 0;
for (auto bucket = b.bucket_count(); bucket--;) {
if (b.bucket_size(bucket) == 0)
empty++;
else
collisions += b.bucket_size(bucket) - 1;
}
std::cout << "b = " << b.max_load_factor() << ' ' << b.load_factor() << ' '
<< ' ' << b.bucket_count() << ' ' << collisions << ' ' << empty << '\n';
版画
a = 1 0.610352 16384 9999 16383
b = 1 0.610352 16384 4773 11157
也就是说,如果散列函数不好,则在 16384 个空桶中有 9999 次冲突和 16383 次冲突。
无关:如果您关心哈希 table 性能,请查看 dense_hash_map
,它实现了线性探测以获得更好的性能。
我想计算一些关于我的哈希函数的统计数据(比如 max/avg 碰撞量)。我编写了虚拟哈希函数(将所有键映射到 1)并等待看到 max/avg 碰撞的数量等于键的数量。但是我对不同的功能有相同的数量。有人可以解释一下吗? 代码:
#include <iostream>
#include <unordered_set>
struct DummyHash
{
size_t operator()(int key) const
{
return static_cast<size_t>(1);
}
};
int main()
{
std::unordered_set<int, DummyHash> a;
std::unordered_set<int> b;
int c = 10000;
for (int i = 0; i < c; i++)
{
a.insert(i);
}
std::cout << "a ended" << std::endl;
for (int i = 0; i < c; i++)
{
b.insert(i);
}
std::cout << "b ended" << std::endl;
std::cout << "a = " << a.max_load_factor() << ' ' << a.load_factor() << ' '
<< a.max_size() << ' ' << a.max_bucket_count() << ' ' << a.bucket_count() << '\n';
std::cout << "b = " << b.max_load_factor() << ' ' << b.load_factor() << ' '
<< b.max_size() << ' ' << b.max_bucket_count() << ' ' << b.bucket_count() << '\n';
return 0;
}
结果:
a ended
b ended
a = 1 0.659065 768614336404564650 768614336404564650 15173
b = 1 0.659065 1152921504606846975 1152921504606846975 15173
您使用的函数不提供碰撞计数,您可能希望在 https://en.cppreference.com/w/cpp/container/unordered_set
上阅读它们的文档计算桶冲突统计数据的一种方法是检查每个桶中的元素数量:
struct BucketStats {
size_t occupied = 0;
size_t total_collisions = 0;
size_t max_collisions = 0;
template<class... Args>
BucketStats(std::unordered_set<Args...> const& c)
{
for(auto bucket = c.bucket_count(); bucket--;) {
auto bucket_size = c.bucket_size(bucket);
occupied += bucket_size > 0;
if(bucket_size > 1) {
auto collisions = bucket_size - 1;
total_collisions += collisions;
max_collisions = std::max(max_collisions, collisions);
}
}
}
double avg_collisions() const {
return occupied ? static_cast<double>(total_collisions) / occupied : 0;
}
friend std::ostream& operator<<(std::ostream& s, BucketStats const& b) {
return s
<< "used buckets: " << b.occupied
<< "; total collisions: " << b.total_collisions
<< "; max collisions in a bucket: " << b.max_collisions
<< "; avg collisions per bucket: " << b.avg_collisions()
;
}
};
// ...
std::cout << BucketStats(a) << '\n';
std::cout << BucketStats(b) << '\n';
输出:
used buckets: 1; total collisions: 9999; max collisions in a bucket: 9999; avg collisions per bucket: 9999
used buckets: 10000; total collisions: 0; max collisions in a bucket: 0; avg collisions per bucket: 0
std::unordered_map
将增加 bucket_count
以试图使 load_factor
接近 max_load_factor
。
也就是说bucket_count
只取决于地图中元素的数量,不受碰撞次数的影响。
要检查冲突,计算桶大小 > 1 的所有元素。
size_t collisions = 0, empty = 0;
for (auto bucket = a.bucket_count(); bucket--;) {
if (a.bucket_size(bucket) == 0)
empty++;
else
collisions += a.bucket_size(bucket) - 1;
}
std::cout << "a = " << a.max_load_factor() << ' ' << a.load_factor() << ' '
<< ' ' << a.bucket_count() << ' ' << collisions << ' ' << empty << '\n';
empty = 0, collisions = 0;
for (auto bucket = b.bucket_count(); bucket--;) {
if (b.bucket_size(bucket) == 0)
empty++;
else
collisions += b.bucket_size(bucket) - 1;
}
std::cout << "b = " << b.max_load_factor() << ' ' << b.load_factor() << ' '
<< ' ' << b.bucket_count() << ' ' << collisions << ' ' << empty << '\n';
版画
a = 1 0.610352 16384 9999 16383
b = 1 0.610352 16384 4773 11157
也就是说,如果散列函数不好,则在 16384 个空桶中有 9999 次冲突和 16383 次冲突。
无关:如果您关心哈希 table 性能,请查看 dense_hash_map
,它实现了线性探测以获得更好的性能。