为什么不区分大小写的 HashSet<T> 性能如此糟糕
Why is case-insensitive HashSet<T> performance so bad
为什么 HashSet<string>(StringComparer.InvariantCultureIgnoreCase)
(Perf_HashSet_CaseInsensitive
) 的表现(表现)如此糟糕? Perf_HashSet
解决方法相对性能提高了 20 倍。
// perf Contains(): 1M iterations, 25 size (unsuccessful lookup)
Test Duration (ms)
Perf_HashSet 43 <-
Perf_Dictionary 49
Perf_HybridDictionary 63
Perf_ListDictionary 223
Perf_List 225
Perf_HashSet_CaseInsensitive 903 <-
代码:
// <TargetFramework>net5.0</TargetFramework>
[TestFixture]
public class ContainsPerfTests
{
private const int iterations = 1_000_000;
private const int size = 25;
[Test]
[Explicit]
public void Perf_List()
{
var list = new List<string>();
for (int i = 0; i < size; i++)
{
list.Add(Guid.NewGuid().ToString().ToLowerInvariant());
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = list.Contains(x.ToLowerInvariant());
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_HashSet()
{
var hashSet = new HashSet<string>();
for (int i = 0; i < size; i++)
{
hashSet.Add(Guid.NewGuid().ToString().ToLowerInvariant());
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = hashSet.Contains(x.ToLowerInvariant());
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_HashSet_CaseInsensitive()
{
var hashSetCaseInsensitive = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);
for (int i = 0; i < size; i++)
{
hashSetCaseInsensitive.Add(Guid.NewGuid().ToString().ToLowerInvariant());
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = hashSetCaseInsensitive.Contains(x);
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_Dictionary()
{
var dictionary = new Dictionary<string, bool>();
for (int i = 0; i < size; i++)
{
dictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), false);
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = dictionary.ContainsKey(x.ToLowerInvariant());
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_HybridDictionary()
{
var hybridDictionary = new HybridDictionary(caseInsensitive: true);
for (int i = 0; i < size; i++)
{
hybridDictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), null);
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = hybridDictionary.Contains(x);
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_ListDictionary()
{
var listDictionary = new ListDictionary();
for (int i = 0; i < size; i++)
{
listDictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), null);
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = listDictionary.Contains(x.ToLowerInvariant());
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
}
造成这种巨大性能差异的主要原因是因为您对不区分大小写的哈希集使用区分区域性的比较,而区分大小写的哈希集使用序号。
在不向散列集传递任何相等比较器的情况下,散列集使用该类型的默认相等比较器,它调用 Equals
方法。请注意 String.Equals
performs an ordinal (case-sensitive and culture-insensitive) comparison.
而 StringComparer.InvariantCultureIgnoreCase
:
performs a case-insensitive string comparison using the word comparison rules of the invariant culture.
重要的是要注意“不变文化”与“文化不敏感”或“序数”不同:
Console.WriteLine(StringComparer.InvariantCultureIgnoreCase.Equals(
"ss", "ß")); // 真的
Console.WriteLine(StringComparer.OrdinalIgnoreCase.Equals(
"ss", "ß")); // 假
不变文化仍然有整理规则,查找和应用这些规则需要时间。
将InvariantCultureIgnoreCase
更改为OrdinalIgnoreCase
应该会使执行时间几乎相同。正如评论所指出的那样,这并不意味着您的基准测试没有其他问题。
为什么 HashSet<string>(StringComparer.InvariantCultureIgnoreCase)
(Perf_HashSet_CaseInsensitive
) 的表现(表现)如此糟糕? Perf_HashSet
解决方法相对性能提高了 20 倍。
// perf Contains(): 1M iterations, 25 size (unsuccessful lookup)
Test Duration (ms)
Perf_HashSet 43 <-
Perf_Dictionary 49
Perf_HybridDictionary 63
Perf_ListDictionary 223
Perf_List 225
Perf_HashSet_CaseInsensitive 903 <-
代码:
// <TargetFramework>net5.0</TargetFramework>
[TestFixture]
public class ContainsPerfTests
{
private const int iterations = 1_000_000;
private const int size = 25;
[Test]
[Explicit]
public void Perf_List()
{
var list = new List<string>();
for (int i = 0; i < size; i++)
{
list.Add(Guid.NewGuid().ToString().ToLowerInvariant());
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = list.Contains(x.ToLowerInvariant());
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_HashSet()
{
var hashSet = new HashSet<string>();
for (int i = 0; i < size; i++)
{
hashSet.Add(Guid.NewGuid().ToString().ToLowerInvariant());
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = hashSet.Contains(x.ToLowerInvariant());
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_HashSet_CaseInsensitive()
{
var hashSetCaseInsensitive = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);
for (int i = 0; i < size; i++)
{
hashSetCaseInsensitive.Add(Guid.NewGuid().ToString().ToLowerInvariant());
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = hashSetCaseInsensitive.Contains(x);
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_Dictionary()
{
var dictionary = new Dictionary<string, bool>();
for (int i = 0; i < size; i++)
{
dictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), false);
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = dictionary.ContainsKey(x.ToLowerInvariant());
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_HybridDictionary()
{
var hybridDictionary = new HybridDictionary(caseInsensitive: true);
for (int i = 0; i < size; i++)
{
hybridDictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), null);
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = hybridDictionary.Contains(x);
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
[Test]
[Explicit]
public void Perf_ListDictionary()
{
var listDictionary = new ListDictionary();
for (int i = 0; i < size; i++)
{
listDictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), null);
}
var x = Guid.NewGuid().ToString();
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var contains = listDictionary.Contains(x.ToLowerInvariant());
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
}
}
造成这种巨大性能差异的主要原因是因为您对不区分大小写的哈希集使用区分区域性的比较,而区分大小写的哈希集使用序号。
在不向散列集传递任何相等比较器的情况下,散列集使用该类型的默认相等比较器,它调用 Equals
方法。请注意 String.Equals
performs an ordinal (case-sensitive and culture-insensitive) comparison.
而 StringComparer.InvariantCultureIgnoreCase
:
performs a case-insensitive string comparison using the word comparison rules of the invariant culture.
重要的是要注意“不变文化”与“文化不敏感”或“序数”不同:
Console.WriteLine(StringComparer.InvariantCultureIgnoreCase.Equals( "ss", "ß")); // 真的 Console.WriteLine(StringComparer.OrdinalIgnoreCase.Equals( "ss", "ß")); // 假
不变文化仍然有整理规则,查找和应用这些规则需要时间。
将InvariantCultureIgnoreCase
更改为OrdinalIgnoreCase
应该会使执行时间几乎相同。正如评论所指出的那样,这并不意味着您的基准测试没有其他问题。