为什么在编译时生成具有 ~30K 条目的静态 HashMap 会消耗如此多的资源?
Why does generating a static HashMap with ~30K entries at compile time consume so many resources?
我正在尝试编写一个 build.rs
脚本来创建一个最新的 HashMap
,将 MAC 地址的前 6 个字符与其对应的供应商映射。
它包含 29231 个键值对,这导致 cargo check
在我的源代码上要花 7 分多钟,而在此之前只需要不到 20 秒。它还会占满我笔记本电脑全部 8GB 的可用内存,导致这 7-8 分钟里电脑完全无法使用。
我认为这要么是 rustc
/cargo
的 bug,要么是我哪里做错了,而我很确定是后者。生成这类代码的正确方法是什么?
main.rs
use std::collections::{HashMap, HashSet};
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
/// Hasher factory for the generated OUI map: produces `FxHasher` instances through
/// their `Default` implementation.
// `BuildHasherDefault` is written with its full path so the alias does not depend on
// a separate `use std::hash::BuildHasherDefault;` line.
type CustomHasher = std::hash::BuildHasherDefault<FxHasher>;
// Textually include the `MAP_MACS` definition that build.rs writes to
// `$OUT_DIR/map_oui.rs`.
include!(concat!(env!("OUT_DIR"), "/map_oui.rs"));
map_oui.rs
#[rustfmt::skip]
lazy_static! {
static ref MAP_MACS: FxHashMap<&'static [u8; 6], &'static str> = {
let mut map_macs = HashMap::with_capacity_and_hasher(29231, CustomHasher::default());
map_macs.insert(b"002272", "American Micro-Fuel Device Corp.");
map_macs.insert(b"00D0EF", "IGT");
//...
build.rs
use std::env;
use std::fs::File;
use std::io::prelude::*;
use std::io::{BufReader, BufWriter};
use std::path::Path;
/// Build script entry point: downloads the IEEE OUI registry and generates
/// `$OUT_DIR/map_oui.rs`, a `lazy_static!` block defining `MAP_MACS`, an `FxHashMap`
/// keyed by the 6 ASCII characters of an OUI and holding the vendor name.
///
/// NOTE: the generated initializer contains one `insert` call per registry entry; the
/// write-up accompanying this script reports that checking it took over 7 minutes and
/// all 8GB of RAM.
///
/// # Panics
///
/// Panics if `OUT_DIR` is unset, the download fails, or any read or write fails.
fn main() {
    let response = ureq::get("http://standards-oui.ieee.org/oui.txt")
        .call()
        .expect("Conection Error");
    let mut reader = BufReader::new(response.into_reader());

    // Collect the entries first so the emitted capacity is the real entry count
    // instead of a hard-coded number that goes stale when the registry grows.
    let mut entries: Vec<(Vec<u8>, String)> = Vec::new();
    let mut line = Vec::new();
    // A read error aborts the build: ignoring it would retry forever on a persistent
    // failure, with the stale partial line still sitting in the buffer.
    while reader
        .read_until(b'\n', &mut line)
        .expect("failed to read the OUI registry")
        != 0
    {
        // Only lines carrying the "base 16" marker at bytes 12..=18 hold an entry.
        if line.get(12..=18).map_or(false, |s| s == b"base 16") {
            // The marker check guarantees at least 19 bytes, so `..6` cannot panic.
            let mac_oui = line[..6].to_vec();
            // The vendor starts at byte 22; a shorter line yields an empty vendor
            // rather than an out-of-bounds panic.
            let vendor_bytes = line.get(22..).unwrap_or_default();
            let vendor = String::from_utf8_lossy(vendor_bytes).trim().to_owned();
            entries.push((mac_oui, vendor));
        }
        line.clear();
    }

    let out_dir = env::var_os("OUT_DIR").expect("OUT_DIR is not set");
    let dest_path = Path::new(&out_dir).join("map_oui.rs");
    let handle = File::create(dest_path).expect("failed to create map_oui.rs");
    let mut writer = BufWriter::new(handle);

    // `write!`/`write_all` write the whole buffer; a bare `write` may stop early.
    write!(
        writer,
        "#[rustfmt::skip]\n\
         lazy_static! {{\n\
         static ref MAP_MACS: FxHashMap<&'static [u8; 6], &'static str> = {{\n\
         let mut map_macs = HashMap::with_capacity_and_hasher({}, CustomHasher::default());\n",
        entries.len()
    )
    .expect("failed to write map_oui.rs");

    for (mac_oui, vendor) in &entries {
        // `escape_ascii` and `{:?}` escape quotes and backslashes, so every entry is a
        // valid byte-string / string literal in the generated file.
        writeln!(
            writer,
            " map_macs.insert(b\"{}\", {:?});",
            mac_oui.escape_ascii(),
            vendor
        )
        .expect("failed to write map_oui.rs");
    }

    writer
        .write_all(b" map_macs\n};\n}\n")
        .expect("failed to write map_oui.rs");
    writer.flush().expect("failed to flush map_oui.rs");
    println!("cargo:rerun-if-changed=build.rs");
}
我听从了 @Thomas 和 @Shepmaster 的建议,问题解决了。现在的 build.rs
会生成一个 const MAP_MACS: [([u8; 6], &str); 29246]
,并且我围绕对该数组的二分查找编写了一个名为 vendor_lookup
的包装函数。不过,我还是想知道如何把 HashMap 与自定义 Hasher 结合使用。
main.rs
// Textually include the `MAP_MACS` table that build.rs writes to `$OUT_DIR/map_oui.rs`.
include!(concat!(env!("OUT_DIR"), "/map_oui.rs"));
/// Returns the vendor name stored in `MAP_MACS` for the OUI `mac_oui`, given as its
/// 6 ASCII bytes (e.g. `b"4C3C16"`).
///
/// The lookup is a binary search, so it relies on `MAP_MACS` being sorted by key.
///
/// # Panics
///
/// Panics when `mac_oui` is not in the table. The original author noted this should
/// eventually be propagated to the caller instead of unwrapped.
fn vendor_lookup(mac_oui: &[u8; 6]) -> &'static str {
    let position = MAP_MACS.binary_search_by_key(mac_oui, |&(oui, _)| oui);
    let (_, vendor) = MAP_MACS[position.unwrap()];
    vendor
}
/// Smoke check: the OUI `4C3C16` must resolve to the expected vendor name.
fn main() {
    let vendor = vendor_lookup(b"4C3C16");
    assert_eq!(vendor, "Samsung Electronics Co.,Ltd");
}
map_oui.rs
// Excerpt of the generated lookup table. Each entry pairs the 6 ASCII bytes of an OUI
// (48 is ASCII '0', so the first key below spells "000000") with the vendor name.
// Entries are in ascending key order, which a binary search over the table requires.
const MAP_MACS: [([u8; 6], &str); 29246] = [
([48, 48, 48, 48, 48, 48], "XEROX CORPORATION"),
([48, 48, 48, 48, 48, 49], "XEROX CORPORATION"),
([48, 48, 48, 48, 48, 50], "XEROX CORPORATION"),
//---snip---
// NOTE(review): the real generated file must end the item with `];`; the trailing
// semicolon is missing from this excerpt.
]
build.rs
use std::env;
use std::fs::File;
use std::io::prelude::*;
use std::io::{BufReader, BufWriter};
use std::path::Path;
/// Build script entry point: downloads the IEEE OUI registry and generates
/// `$OUT_DIR/map_oui.rs`, which defines `MAP_MACS`, a constant array of
/// `(OUI bytes, vendor name)` pairs sorted by key so it can be binary-searched.
///
/// # Panics
///
/// Panics if `OUT_DIR` is unset, the download fails, or any read or write fails.
fn main() {
    let response = ureq::get("http://standards-oui.ieee.org/oui.txt")
        .call()
        .expect("Conection Error");
    let mut reader = BufReader::new(response.into_reader());

    let mut data: Vec<(Vec<u8>, String)> = Vec::new();
    let mut line = Vec::new();
    while reader
        .read_until(b'\n', &mut line)
        .expect("failed to read the OUI registry")
        != 0
    {
        // Only lines carrying the "base 16" marker at bytes 12..=18 hold an entry.
        if line.get(12..=18).map_or(false, |s| s == b"base 16") {
            // The marker check guarantees at least 19 bytes, so `..6` cannot panic.
            let mac_oui = line[..6].to_vec();
            // The vendor starts at byte 22; a shorter line yields an empty vendor
            // rather than an out-of-bounds panic.
            let vendor_bytes = line.get(22..).unwrap_or_default();
            let vendor = String::from_utf8_lossy(vendor_bytes).trim().to_owned();
            data.push((mac_oui, vendor));
        }
        line.clear();
    }
    // Consumers binary-search the generated array, so it must be emitted sorted.
    data.sort_unstable();

    let out_dir = env::var_os("OUT_DIR").expect("OUT_DIR is not set");
    let dest_path = Path::new(&out_dir).join("map_oui.rs");
    let handle = File::create(dest_path).expect("failed to create map_oui.rs");
    let mut writer = BufWriter::new(handle);

    writeln!(
        writer,
        "const MAP_MACS: [([u8; 6], &str); {}] = [",
        data.len()
    )
    .expect("failed to write map_oui.rs");
    for (key, value) in &data {
        // `{:?}` on the vendor emits a quoted, escaped string literal, so names
        // containing `"` or `\` still produce valid Rust.
        writeln!(writer, " ({:?}, {:?}),", key, value).expect("failed to write map_oui.rs");
    }
    writeln!(writer, "];").expect("failed to write map_oui.rs");
    writer.flush().expect("failed to flush map_oui.rs");
    println!("cargo:rerun-if-changed=build.rs");
}
我正在尝试编写一个 build.rs
脚本来创建一个最新的 HashMap
,将 MAC 地址的前 6 个字符与其对应的供应商映射。
它包含 29231 个键值对,这导致 cargo check
在我的源代码上要花 7 分多钟,而在此之前只需要不到 20 秒。它还会占满我笔记本电脑全部 8GB 的可用内存,导致这 7-8 分钟里电脑完全无法使用。
我认为这要么是 rustc
/cargo
的 bug,要么是我哪里做错了,而我很确定是后者。生成这类代码的正确方法是什么?
main.rs
use std::collections::{HashMap, HashSet};
use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
/// Hasher factory for the generated OUI map: produces `FxHasher` instances through
/// their `Default` implementation.
// `BuildHasherDefault` is written with its full path so the alias does not depend on
// a separate `use std::hash::BuildHasherDefault;` line.
type CustomHasher = std::hash::BuildHasherDefault<FxHasher>;
// Textually include the `MAP_MACS` definition that build.rs writes to
// `$OUT_DIR/map_oui.rs`.
include!(concat!(env!("OUT_DIR"), "/map_oui.rs"));
map_oui.rs
#[rustfmt::skip]
lazy_static! {
static ref MAP_MACS: FxHashMap<&'static [u8; 6], &'static str> = {
let mut map_macs = HashMap::with_capacity_and_hasher(29231, CustomHasher::default());
map_macs.insert(b"002272", "American Micro-Fuel Device Corp.");
map_macs.insert(b"00D0EF", "IGT");
//...
build.rs
use std::env;
use std::fs::File;
use std::io::prelude::*;
use std::io::{BufReader, BufWriter};
use std::path::Path;
/// Build script entry point: downloads the IEEE OUI registry and generates
/// `$OUT_DIR/map_oui.rs`, a `lazy_static!` block defining `MAP_MACS`, an `FxHashMap`
/// keyed by the 6 ASCII characters of an OUI and holding the vendor name.
///
/// NOTE: the generated initializer contains one `insert` call per registry entry; the
/// write-up accompanying this script reports that checking it took over 7 minutes and
/// all 8GB of RAM.
///
/// # Panics
///
/// Panics if `OUT_DIR` is unset, the download fails, or any read or write fails.
fn main() {
    let response = ureq::get("http://standards-oui.ieee.org/oui.txt")
        .call()
        .expect("Conection Error");
    let mut reader = BufReader::new(response.into_reader());

    // Collect the entries first so the emitted capacity is the real entry count
    // instead of a hard-coded number that goes stale when the registry grows.
    let mut entries: Vec<(Vec<u8>, String)> = Vec::new();
    let mut line = Vec::new();
    // A read error aborts the build: ignoring it would retry forever on a persistent
    // failure, with the stale partial line still sitting in the buffer.
    while reader
        .read_until(b'\n', &mut line)
        .expect("failed to read the OUI registry")
        != 0
    {
        // Only lines carrying the "base 16" marker at bytes 12..=18 hold an entry.
        if line.get(12..=18).map_or(false, |s| s == b"base 16") {
            // The marker check guarantees at least 19 bytes, so `..6` cannot panic.
            let mac_oui = line[..6].to_vec();
            // The vendor starts at byte 22; a shorter line yields an empty vendor
            // rather than an out-of-bounds panic.
            let vendor_bytes = line.get(22..).unwrap_or_default();
            let vendor = String::from_utf8_lossy(vendor_bytes).trim().to_owned();
            entries.push((mac_oui, vendor));
        }
        line.clear();
    }

    let out_dir = env::var_os("OUT_DIR").expect("OUT_DIR is not set");
    let dest_path = Path::new(&out_dir).join("map_oui.rs");
    let handle = File::create(dest_path).expect("failed to create map_oui.rs");
    let mut writer = BufWriter::new(handle);

    // `write!`/`write_all` write the whole buffer; a bare `write` may stop early.
    write!(
        writer,
        "#[rustfmt::skip]\n\
         lazy_static! {{\n\
         static ref MAP_MACS: FxHashMap<&'static [u8; 6], &'static str> = {{\n\
         let mut map_macs = HashMap::with_capacity_and_hasher({}, CustomHasher::default());\n",
        entries.len()
    )
    .expect("failed to write map_oui.rs");

    for (mac_oui, vendor) in &entries {
        // `escape_ascii` and `{:?}` escape quotes and backslashes, so every entry is a
        // valid byte-string / string literal in the generated file.
        writeln!(
            writer,
            " map_macs.insert(b\"{}\", {:?});",
            mac_oui.escape_ascii(),
            vendor
        )
        .expect("failed to write map_oui.rs");
    }

    writer
        .write_all(b" map_macs\n};\n}\n")
        .expect("failed to write map_oui.rs");
    writer.flush().expect("failed to flush map_oui.rs");
    println!("cargo:rerun-if-changed=build.rs");
}
我听从了 @Thomas 和 @Shepmaster 的建议,问题解决了。现在的 build.rs
会生成一个 const MAP_MACS: [([u8; 6], &str); 29246]
,并且我围绕对该数组的二分查找编写了一个名为 vendor_lookup
的包装函数。不过,我还是想知道如何把 HashMap 与自定义 Hasher 结合使用。
main.rs
// Textually include the `MAP_MACS` table that build.rs writes to `$OUT_DIR/map_oui.rs`.
include!(concat!(env!("OUT_DIR"), "/map_oui.rs"));
/// Returns the vendor name stored in `MAP_MACS` for the OUI `mac_oui`, given as its
/// 6 ASCII bytes (e.g. `b"4C3C16"`).
///
/// The lookup is a binary search, so it relies on `MAP_MACS` being sorted by key.
///
/// # Panics
///
/// Panics when `mac_oui` is not in the table. The original author noted this should
/// eventually be propagated to the caller instead of unwrapped.
fn vendor_lookup(mac_oui: &[u8; 6]) -> &'static str {
    let position = MAP_MACS.binary_search_by_key(mac_oui, |&(oui, _)| oui);
    let (_, vendor) = MAP_MACS[position.unwrap()];
    vendor
}
/// Smoke check: the OUI `4C3C16` must resolve to the expected vendor name.
fn main() {
    let vendor = vendor_lookup(b"4C3C16");
    assert_eq!(vendor, "Samsung Electronics Co.,Ltd");
}
map_oui.rs
// Excerpt of the generated lookup table. Each entry pairs the 6 ASCII bytes of an OUI
// (48 is ASCII '0', so the first key below spells "000000") with the vendor name.
// Entries are in ascending key order, which a binary search over the table requires.
const MAP_MACS: [([u8; 6], &str); 29246] = [
([48, 48, 48, 48, 48, 48], "XEROX CORPORATION"),
([48, 48, 48, 48, 48, 49], "XEROX CORPORATION"),
([48, 48, 48, 48, 48, 50], "XEROX CORPORATION"),
//---snip---
// NOTE(review): the real generated file must end the item with `];`; the trailing
// semicolon is missing from this excerpt.
]
build.rs
use std::env;
use std::fs::File;
use std::io::prelude::*;
use std::io::{BufReader, BufWriter};
use std::path::Path;
/// Build script entry point: downloads the IEEE OUI registry and generates
/// `$OUT_DIR/map_oui.rs`, which defines `MAP_MACS`, a constant array of
/// `(OUI bytes, vendor name)` pairs sorted by key so it can be binary-searched.
///
/// # Panics
///
/// Panics if `OUT_DIR` is unset, the download fails, or any read or write fails.
fn main() {
    let response = ureq::get("http://standards-oui.ieee.org/oui.txt")
        .call()
        .expect("Conection Error");
    let mut reader = BufReader::new(response.into_reader());

    let mut data: Vec<(Vec<u8>, String)> = Vec::new();
    let mut line = Vec::new();
    while reader
        .read_until(b'\n', &mut line)
        .expect("failed to read the OUI registry")
        != 0
    {
        // Only lines carrying the "base 16" marker at bytes 12..=18 hold an entry.
        if line.get(12..=18).map_or(false, |s| s == b"base 16") {
            // The marker check guarantees at least 19 bytes, so `..6` cannot panic.
            let mac_oui = line[..6].to_vec();
            // The vendor starts at byte 22; a shorter line yields an empty vendor
            // rather than an out-of-bounds panic.
            let vendor_bytes = line.get(22..).unwrap_or_default();
            let vendor = String::from_utf8_lossy(vendor_bytes).trim().to_owned();
            data.push((mac_oui, vendor));
        }
        line.clear();
    }
    // Consumers binary-search the generated array, so it must be emitted sorted.
    data.sort_unstable();

    let out_dir = env::var_os("OUT_DIR").expect("OUT_DIR is not set");
    let dest_path = Path::new(&out_dir).join("map_oui.rs");
    let handle = File::create(dest_path).expect("failed to create map_oui.rs");
    let mut writer = BufWriter::new(handle);

    writeln!(
        writer,
        "const MAP_MACS: [([u8; 6], &str); {}] = [",
        data.len()
    )
    .expect("failed to write map_oui.rs");
    for (key, value) in &data {
        // `{:?}` on the vendor emits a quoted, escaped string literal, so names
        // containing `"` or `\` still produce valid Rust.
        writeln!(writer, " ({:?}, {:?}),", key, value).expect("failed to write map_oui.rs");
    }
    writeln!(writer, "];").expect("failed to write map_oui.rs");
    writer.flush().expect("failed to flush map_oui.rs");
    println!("cargo:rerun-if-changed=build.rs");
}