为什么不区分大小写的 HashSet<T> 性能如此糟糕

Why is case-insensitive HashSet<T> performance so bad

为什么 HashSet<string>(StringComparer.InvariantCultureIgnoreCase) (Perf_HashSet_CaseInsensitive) 的表现(表现)如此糟糕? Perf_HashSet 解决方法相对性能提高了 20 倍。

// perf Contains(): 1M iterations, 25 size (unsuccessful lookup)
Test                         Duration (ms)
Perf_HashSet                            43 <-
Perf_Dictionary                         49 
Perf_HybridDictionary                   63 
Perf_ListDictionary                    223 
Perf_List                              225 
Perf_HashSet_CaseInsensitive           903 <-

代码:

// <TargetFramework>net5.0</TargetFramework>
[TestFixture]
public class ContainsPerfTests
{
    private const int iterations = 1_000_000;
    private const int size = 25;

    [Test]
    [Explicit]
    public void Perf_List()
    {
        var list = new List<string>();
        for (int i = 0; i < size; i++)
        {
            list.Add(Guid.NewGuid().ToString().ToLowerInvariant());
        }
        var x = Guid.NewGuid().ToString();
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < iterations; i++)
        {
            var contains = list.Contains(x.ToLowerInvariant());
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
    }

    [Test]
    [Explicit]
    public void Perf_HashSet()
    {
        var hashSet = new HashSet<string>();
        for (int i = 0; i < size; i++)
        {
            hashSet.Add(Guid.NewGuid().ToString().ToLowerInvariant());
        }
        var x = Guid.NewGuid().ToString();
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < iterations; i++)
        {
            var contains = hashSet.Contains(x.ToLowerInvariant());
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
    }

    [Test]
    [Explicit]
    public void Perf_HashSet_CaseInsensitive()
    {
        var hashSetCaseInsensitive = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);
        for (int i = 0; i < size; i++)
        {
            hashSetCaseInsensitive.Add(Guid.NewGuid().ToString().ToLowerInvariant());
        }
        var x = Guid.NewGuid().ToString();
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < iterations; i++)
        {
            var contains = hashSetCaseInsensitive.Contains(x);
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
    }

    [Test]
    [Explicit]
    public void Perf_Dictionary()
    {
        var dictionary = new Dictionary<string, bool>();
        for (int i = 0; i < size; i++)
        {
            dictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), false);
        }
        var x = Guid.NewGuid().ToString();
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < iterations; i++)
        {
            var contains = dictionary.ContainsKey(x.ToLowerInvariant());
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
    }

    [Test]
    [Explicit]
    public void Perf_HybridDictionary()
    {
        var hybridDictionary = new HybridDictionary(caseInsensitive: true);
        for (int i = 0; i < size; i++)
        {
            hybridDictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), null);
        }
        var x = Guid.NewGuid().ToString();
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < iterations; i++)
        {
            var contains = hybridDictionary.Contains(x);
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
    }

    [Test]
    [Explicit]
    public void Perf_ListDictionary()
    {
        var listDictionary = new ListDictionary();
        for (int i = 0; i < size; i++)
        {
            listDictionary.Add(Guid.NewGuid().ToString().ToLowerInvariant(), null);
        }
        var x = Guid.NewGuid().ToString();
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < iterations; i++)
        {
            var contains = listDictionary.Contains(x.ToLowerInvariant());
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
    }
}

造成这种巨大性能差异的主要原因是因为您对不区分大小写的哈希集使用区分区域性的比较,而区分大小写的哈希集使用序号。

在不向散列集传递任何相等比较器的情况下,散列集使用该类型的默认相等比较器,它调用 Equals 方法。请注意 String.Equals

performs an ordinal (case-sensitive and culture-insensitive) comparison.

StringComparer.InvariantCultureIgnoreCase:

performs a case-insensitive string comparison using the word comparison rules of the invariant culture.

重要的是要注意“不变文化”与“文化不敏感”或“序数”不同:

Console.WriteLine(StringComparer.InvariantCultureIgnoreCase.Equals( "ss", "ß")); // 真的 Console.WriteLine(StringComparer.OrdinalIgnoreCase.Equals( "ss", "ß")); // 假

不变文化仍然有整理规则,查找和应用这些规则需要时间。

InvariantCultureIgnoreCase更改为OrdinalIgnoreCase应该会使执行时间几乎相同。正如评论所指出的那样,这并不意味着您的基准测试没有其他问题。