unordered_maps 的矢量,在地图中搜索速度太慢
Vector of unordered_maps, searching in maps too slow
我写了一个小程序,用一些样本数据创建了一个包含 200 万张地图的向量,然后查询了一些值。
我知道此时我可以使用数据库,但我只是想尝试一下性能优化。
代码:
#include <iostream>
#include <vector>
#include <unordered_map>
#include <map>
#include <string>
#include <chrono>
using namespace std;
static int NUM_OF_MAPS = 2 * 1000 * 1000;
void buildVector(vector<unordered_map <string, int>> &maps);
void find(string key, int value, vector<unordered_map <string, int>> &maps);
int main() {
auto startPrg = chrono::steady_clock::now();
vector<unordered_map <string, int>> maps;
buildVector(maps);
for (int i = 0; i < 10; i++) {
string s(1, 'a'+ i);
find(s, i, maps);
}
auto endPrg = chrono::steady_clock::now();
cout << "program duration: " << chrono::duration_cast<chrono::microseconds>(endPrg - startPrg).count() / 1000.0 << " ms" << endl;
return 0;
}
void find(string key, int value, vector<unordered_map <string, int>> &maps) {
auto start = chrono::steady_clock::now();
int matches = 0;
for (unordered_map <string, int> &map : maps) {
unordered_map<string,int>::const_iterator got = map.find(key);
if (got != map.end() && got->second == value) {
matches++;
}
}
auto end = chrono::steady_clock::now();
cout << matches << " matches for " << key << " = " << value << " in " << chrono::duration_cast<chrono::microseconds>(end - start).count() / 1000.0 << " ms" << endl;
}
void buildVector(vector<unordered_map <string, int>> &maps) {
auto start = chrono::steady_clock::now();
maps.reserve(NUM_OF_MAPS);
int entryCounter = 0;
unordered_map <string, int> map;
for (int i = 0; i < NUM_OF_MAPS; i++) {
map["a"] = entryCounter++;
map["b"] = entryCounter++;
map["c"] = entryCounter++;
map["d"] = entryCounter++;
map["e"] = entryCounter++;
map["f"] = entryCounter++;
maps.push_back(map);
entryCounter %= 100;
}
auto end = chrono::steady_clock::now();
cout << "build vector: " << chrono::duration_cast<chrono::microseconds>(end - start).count() / 1000.0 << " ms (" << maps.size() << ")" << endl;
}
输出:
build vector: 697.381 ms (2000000)
40000 matches for a = 0 in 67.873 ms
40000 matches for b = 1 in 64.176 ms
40000 matches for c = 2 in 60.484 ms
40000 matches for d = 3 in 68.102 ms
40000 matches for e = 4 in 62.71 ms
40000 matches for f = 5 in 65.723 ms
0 matches for g = 6 in 64.407 ms
0 matches for h = 7 in 45.401 ms
0 matches for i = 8 in 65.307 ms
0 matches for j = 9 in 64.371 ms
program duration: 1326.42 ms
我在Java中做了同样的事情,只是为了比较速度,得到了以下结果:
build vector: 2536.971578 ms (2000000)
40000 matches for a = 0 in 59.293339 ms
40000 matches for b = 1 in 56.306123 ms
40000 matches for c = 2 in 53.503208 ms
40000 matches for d = 3 in 51.174979 ms
40000 matches for e = 4 in 50.967731 ms
40000 matches for f = 5 in 53.68969 ms
0 matches for g = 6 in 41.927401 ms
0 matches for h = 7 in 36.160645 ms
0 matches for i = 8 in 33.535616 ms
0 matches for j = 9 in 36.56883 ms
program duration: 3016.979919 ms
虽然 C++ 在创建数据方面要快得多,但在查询部分却非常慢(与 Java 相比)。有什么方法可以让 C++ 在该部分也击败 Java 吗?
Java代码:
static int NUM_OF_MAPS = 2 * 1000 * 1000;
public static void run() {
long startPrg = System.nanoTime();
List<Map<String,Integer>> maps = new ArrayList<>(NUM_OF_MAPS);
buildVector(maps);
for (int i = 0; i < 10; i++) {
String s = String.valueOf((char)('a' + i));
find(s, i, maps);
}
long endPrg = System.nanoTime();
System.out.println("program duration: " + (endPrg - startPrg) / 1000000.0 + " ms");
}
static void find(String key, Integer value, List<Map<String,Integer>> maps) {
long start = System.nanoTime();
int matches = 0;
for (Map<String,Integer> map : maps) {
Integer got = map.get(key);
if (got != null && got.equals(value)) {
matches++;
}
}
long end = System.nanoTime();
System.out.println(matches + " matches for " + key + " = " + value + " in " + (end - start) / 1000000.0 + " ms");
}
static void buildVector(List<Map<String,Integer>> maps) {
long start = System.nanoTime();
int entryCounter = 0;
Map<String,Integer> map = new HashMap<>();
for (int i = 0; i < NUM_OF_MAPS; i++) {
map.put("a", entryCounter++);
map.put("b", entryCounter++);
map.put("c", entryCounter++);
map.put("d", entryCounter++);
map.put("e", entryCounter++);
map.put("f", entryCounter++);
maps.add(new HashMap<>(map));
entryCounter %= 100;
}
long end = System.nanoTime();
System.out.println("build vector: " + (end - start) / 1000000.0 + " ms (" + maps.size() + ")");
}
编辑:Sry 复制了两次 Java 代码而不是 C++ 代码。
C++ 代码并不太慢。 java 代码经过更好的哈希优化。
- 在c++中,是unordered_map负责计算hash。因此,您集合中的每个容器 将在
unordered_map<string,int>::const_iterator got = map.find(key)
期间对字符串进行哈希处理。
- 在java中,HashMap依赖于object的hashCode方法。问题是,String class 只能在初始化和修改字符串时计算散列。
在 hash(string) -> int
计算方面,你的 在 c++ 中的查找方法是 O(NUM_OF_MAPS)
,而在 java 中它是 O(1)
.
要添加到 UmNyobe 的答案中,您可以通过创建自己的字符串类型来缓存计算的哈希值来提高性能:
class hashed_string : public std::string
{
public:
hashed_string( const std::string& str )
: std::string( str ), hash( std::hash( str ) )
{
}
size_t getHash() { return hash; }
private:
size_t hash;
};
namespace std
{
template<> struct hash< hashed_string >
{
typedef hashed_string argument_type;
typedef std::size_t result_type;
result_type operator()(argument_type const& s) const noexcept
{
return s.getHash();
}
};
}
您需要扩展 hashed_string
的实现以防止修改底层字符串或在修改字符串时重新计算哈希。通过使字符串成为成员而不是基础 class.
可能更容易实现
我写了一个小程序,用一些样本数据创建了一个包含 200 万张地图的向量,然后查询了一些值。
我知道此时我可以使用数据库,但我只是想尝试一下性能优化。
代码:
#include <iostream>
#include <vector>
#include <unordered_map>
#include <map>
#include <string>
#include <chrono>
using namespace std;
static int NUM_OF_MAPS = 2 * 1000 * 1000;
void buildVector(vector<unordered_map <string, int>> &maps);
void find(string key, int value, vector<unordered_map <string, int>> &maps);
int main() {
auto startPrg = chrono::steady_clock::now();
vector<unordered_map <string, int>> maps;
buildVector(maps);
for (int i = 0; i < 10; i++) {
string s(1, 'a'+ i);
find(s, i, maps);
}
auto endPrg = chrono::steady_clock::now();
cout << "program duration: " << chrono::duration_cast<chrono::microseconds>(endPrg - startPrg).count() / 1000.0 << " ms" << endl;
return 0;
}
void find(string key, int value, vector<unordered_map <string, int>> &maps) {
auto start = chrono::steady_clock::now();
int matches = 0;
for (unordered_map <string, int> &map : maps) {
unordered_map<string,int>::const_iterator got = map.find(key);
if (got != map.end() && got->second == value) {
matches++;
}
}
auto end = chrono::steady_clock::now();
cout << matches << " matches for " << key << " = " << value << " in " << chrono::duration_cast<chrono::microseconds>(end - start).count() / 1000.0 << " ms" << endl;
}
void buildVector(vector<unordered_map <string, int>> &maps) {
auto start = chrono::steady_clock::now();
maps.reserve(NUM_OF_MAPS);
int entryCounter = 0;
unordered_map <string, int> map;
for (int i = 0; i < NUM_OF_MAPS; i++) {
map["a"] = entryCounter++;
map["b"] = entryCounter++;
map["c"] = entryCounter++;
map["d"] = entryCounter++;
map["e"] = entryCounter++;
map["f"] = entryCounter++;
maps.push_back(map);
entryCounter %= 100;
}
auto end = chrono::steady_clock::now();
cout << "build vector: " << chrono::duration_cast<chrono::microseconds>(end - start).count() / 1000.0 << " ms (" << maps.size() << ")" << endl;
}
输出:
build vector: 697.381 ms (2000000)
40000 matches for a = 0 in 67.873 ms
40000 matches for b = 1 in 64.176 ms
40000 matches for c = 2 in 60.484 ms
40000 matches for d = 3 in 68.102 ms
40000 matches for e = 4 in 62.71 ms
40000 matches for f = 5 in 65.723 ms
0 matches for g = 6 in 64.407 ms
0 matches for h = 7 in 45.401 ms
0 matches for i = 8 in 65.307 ms
0 matches for j = 9 in 64.371 ms
program duration: 1326.42 ms
我在Java中做了同样的事情,只是为了比较速度,得到了以下结果:
build vector: 2536.971578 ms (2000000)
40000 matches for a = 0 in 59.293339 ms
40000 matches for b = 1 in 56.306123 ms
40000 matches for c = 2 in 53.503208 ms
40000 matches for d = 3 in 51.174979 ms
40000 matches for e = 4 in 50.967731 ms
40000 matches for f = 5 in 53.68969 ms
0 matches for g = 6 in 41.927401 ms
0 matches for h = 7 in 36.160645 ms
0 matches for i = 8 in 33.535616 ms
0 matches for j = 9 in 36.56883 ms
program duration: 3016.979919 ms
虽然 C++ 在创建数据方面要快得多,但在查询部分却非常慢(与 Java 相比)。有什么方法可以让 C++ 在该部分也击败 Java 吗?
Java代码:
static int NUM_OF_MAPS = 2 * 1000 * 1000;
public static void run() {
long startPrg = System.nanoTime();
List<Map<String,Integer>> maps = new ArrayList<>(NUM_OF_MAPS);
buildVector(maps);
for (int i = 0; i < 10; i++) {
String s = String.valueOf((char)('a' + i));
find(s, i, maps);
}
long endPrg = System.nanoTime();
System.out.println("program duration: " + (endPrg - startPrg) / 1000000.0 + " ms");
}
static void find(String key, Integer value, List<Map<String,Integer>> maps) {
long start = System.nanoTime();
int matches = 0;
for (Map<String,Integer> map : maps) {
Integer got = map.get(key);
if (got != null && got.equals(value)) {
matches++;
}
}
long end = System.nanoTime();
System.out.println(matches + " matches for " + key + " = " + value + " in " + (end - start) / 1000000.0 + " ms");
}
static void buildVector(List<Map<String,Integer>> maps) {
long start = System.nanoTime();
int entryCounter = 0;
Map<String,Integer> map = new HashMap<>();
for (int i = 0; i < NUM_OF_MAPS; i++) {
map.put("a", entryCounter++);
map.put("b", entryCounter++);
map.put("c", entryCounter++);
map.put("d", entryCounter++);
map.put("e", entryCounter++);
map.put("f", entryCounter++);
maps.add(new HashMap<>(map));
entryCounter %= 100;
}
long end = System.nanoTime();
System.out.println("build vector: " + (end - start) / 1000000.0 + " ms (" + maps.size() + ")");
}
编辑:Sry 复制了两次 Java 代码而不是 C++ 代码。
C++ 代码并不太慢。 java 代码经过更好的哈希优化。
- 在c++中,是unordered_map负责计算hash。因此,您集合中的每个容器 将在
unordered_map<string,int>::const_iterator got = map.find(key)
期间对字符串进行哈希处理。 - 在java中,HashMap依赖于object的hashCode方法。问题是,String class 只能在初始化和修改字符串时计算散列。
在 hash(string) -> int
计算方面,你的 在 c++ 中的查找方法是 O(NUM_OF_MAPS)
,而在 java 中它是 O(1)
.
要添加到 UmNyobe 的答案中,您可以通过创建自己的字符串类型来缓存计算的哈希值来提高性能:
class hashed_string : public std::string
{
public:
hashed_string( const std::string& str )
: std::string( str ), hash( std::hash( str ) )
{
}
size_t getHash() { return hash; }
private:
size_t hash;
};
namespace std
{
template<> struct hash< hashed_string >
{
typedef hashed_string argument_type;
typedef std::size_t result_type;
result_type operator()(argument_type const& s) const noexcept
{
return s.getHash();
}
};
}
您需要扩展 hashed_string
的实现以防止修改底层字符串或在修改字符串时重新计算哈希。通过使字符串成为成员而不是基础 class.