比较字符串元素的最快方法

Question

我有一个包含很多字符串 (>5000) 的列表，我必须在其中获取第一个元素并将其与所有后续元素进行比较。例如。考虑这个字符串列表： { one, two, three, four, five, six, seven, eight, nine, ten }。现在我取 one 并将其与 two, three, four, ... 进行比较然后我取 two 并将其与 three, four, ...

进行比较

我相信查找是造成这需要这么长时间的问题。在普通硬盘（7200rpm）上大约需要 30 秒，在固态硬盘上需要 10 秒。我只是不知道如何才能进一步加快速度。列表中的所有字符串均按升序排列，按此顺序检查它们很重要。如果它可以大大加快速度，我不介意有一个无序列表。

我查看了哈希集，但我需要检查顺序，这样即使使用快速包含方法也无法正常工作。

编辑：看来我还不够清楚，正如 Dusan 想要的，这里是完整的代码。我的问题案例：我有很多目录，名称相似，并且正在获取一个仅包含所有目录名称的集合，并将它们相互比较以获得相似性并编写它。因此，硬盘和固态硬盘之间的比较。但这很奇怪，因为我不是立即写作，而是把它放在一个字段中，最后再写作。速度还是有差距的。为什么我不包括整个代码？因为我相信我的核心问题是从列表中查找值以及比较两个字符串。其他一切都应该已经足够快，添加到列表，查看黑名单（哈希集）并获取目录名称列表。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Text.RegularExpressions;
using System.Diagnostics;
using System.Threading;

namespace Similarity
{
    /// <summary>
    /// Credit http://www.dotnetperls.com/levenshtein
    /// Contains approximate string matching
    /// </summary>
    internal static class LevenshteinDistance
    {
        /// <summary>
        /// Compute the distance between two strings.
        /// </summary>
        public static int Compute(string s, string t)
        {
            int n = s.Length;
            int m = t.Length;
            int[,] d = new int[n + 1, m + 1];

            // Step 1
            if (n == 0)
            {
                return m;
            }

            if (m == 0)
            {
                return n;
            }

            // Step 2
            for (int i = 0; i <= n; d[i, 0] = i++)
            {
            }

            for (int j = 0; j <= m; d[0, j] = j++)
            {
            }

            // Step 3
            for (int i = 1; i <= n; i++)
            {
                //Step 4
                for (int j = 1; j <= m; j++)
                {
                    // Step 5
                    int cost = (t[j - 1] == s[i - 1]) ? 0 : 1;

                    // Step 6
                    d[i, j] = Math.Min(
                        Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
                        d[i - 1, j - 1] + cost);
                }
            }
            // Step 7
            return d[n, m];
        }
    }

    internal class Program
    {
        #region Properties

        private static HashSet<string> _blackList = new HashSet<string>();

        public static HashSet<string> blackList
        {
            get
            {
                return _blackList;
            }
        }

        public static void AddBlackListEntry(string line)
        {
            blackList.Add(line);
        }

        private static List<string> _similar = new List<string>();

        public static List<string> similar
        {
            get
            {
                return _similar;
            }
        }

        public static void AddSimilarEntry(string s)
        {
            similar.Add(s);
        }

        #endregion Properties

        private static void Main(string[] args)
        {
            Clean();
            var directories = Directory.EnumerateDirectories(Directory.GetCurrentDirectory(), "*", SearchOption.TopDirectoryOnly)
                                .Select(x => new DirectoryInfo(x).Name).OrderBy(y => new DirectoryInfo(y).Name).ToList();

            using (StreamWriter sw = new StreamWriter(@"result.txt"))
            {
                foreach (var item in directories)
                {
                    Console.WriteLine(item);
                    sw.WriteLine(item);
                }
                Console.WriteLine("Amount of directories: " + directories.Count());
            }

            if (directories.Count != 0)
            {
                StartSimilarityCheck(directories);
            }
            else
            {
                Console.WriteLine("No directories");
            }

            WriteResult(similar);

            Console.WriteLine("Finish. Press any key to exit...");
            Console.ReadKey();
        }

        private static void StartSimilarityCheck(List<string> whiteList)
        {
            int counter = 0; // how many did we check yet?
            var watch = Stopwatch.StartNew();
            foreach (var dirName in whiteList)
            {
                bool insertDirName = true;
                if (!IsBlackList(dirName))
                {
                    // start the next element
                    for (int i = counter + 1; i <= whiteList.Count; i++)
                    {
                        // end of index reached
                        if (i == whiteList.Count)
                        {
                            break;
                        }

                        int similiariy = LevenshteinDistance.Compute(dirName, whiteList[i]);

                        // low score means high similarity
                        if (similiariy < 2)
                        {
                            if (insertDirName)
                            {
                                //Writer(dirName);
                                AddSimilarEntry(dirName);
                                insertDirName = false;
                            }
                            //Writer(whiteList[i]);
                            AddSimilarEntry(dirName);
                            AddBlackListEntry(whiteList[i]);
                        }
                    }
                }
                Console.WriteLine(counter);
                //Console.WriteLine("Skip: {0}", dirName);
                counter++;
            }
            watch.Stop();
            Console.WriteLine("Time: " + watch.ElapsedMilliseconds / 1000);
        }

        private static void WriteResult(List<string> list)
        {
            using (StreamWriter sw = new StreamWriter(@"similar.txt", true, Encoding.UTF8, 65536))
            {
                foreach (var item in list)
                {
                    sw.WriteLine(item);
                }
            }
        }

        private static void Clean()
        {
            // yeah hardcoded file names incoming. Better than global variables??
            try
            {
                if (File.Exists(@"similar.txt"))
                {
                    File.Delete(@"similar.txt");
                }

                if (File.Exists(@"result.txt"))
                {
                    File.Delete(@"result.txt");
                }
            }
            catch (Exception)
            {
                throw;
            }
        }

        private static void Writer(string s)
        {
            using (StreamWriter sw = new StreamWriter(@"similar.txt", true, Encoding.UTF8, 65536))
            {
                sw.WriteLine(s);
            }
        }

        private static bool IsBlackList(string name)
        {
            return blackList.Contains(name);
        }
}

要修复第二个 for 循环的瓶颈，请插入一个 if 条件，该条件检查是否 similiariy >= 我们想要的，如果是这样，则中断循环。现在它在 1 秒内运行。谢谢大家

Answer 1

您的内部循环使用了奇怪的双重检查。这可能会阻止重要的 JIT 优化，删除冗余范围检查。

//foreach (var item myList)
for (int j = 0; j < myList.Count-1; j++)
{
    string item1 = myList[j];

    for (int i = j + 1; i < myList.Count; i++)
    {
       string item2 = myList[i];
       // if (i == myList.Count)
       ...
    }
}

Answer 2

反对票的数量是疯狂的，但是哦好吧...感谢评论，我找到了性能问题/瓶颈的原因。

StartSimilarityCheck() 中的第二个 for 循环遍历所有条目，这本身没有问题，但在性能问题和效率下查看时，很糟糕。解决方案是只检查附近的字符串，但我们如何知道它们是否存在？

首先，我们得到一个按升序排序的列表。这让我们对相似的字符串有了一个粗略的了解。现在我们定义一个 threshold 的 Levenshtein 分数（分数越小，两个字符串之间的相似度越高）。如果分数高于阈值，则表示它们不太相似，因此我们可以跳出内循环。这样可以节省时间，程序可以非常快地完成。请注意，这种方式不是防弹的，恕我直言，因为如果第一个字符串是 0Directory 它将位于列表的开头部分，但像 zDirectory 这样的字符串将更靠后并且可能会被遗漏。如果我错了请纠正我..

    private static void StartSimilarityCheck(List<string> whiteList)
    {
        var watch = Stopwatch.StartNew();
        for (int j = 0; j < whiteList.Count - 1; j++)
        {
            string dirName = whiteList[j];
            bool insertDirName = true;
            int threshold = 2;
            if (!IsBlackList(dirName))
            {
                // start the next element
                for (int i = j + 1; i < whiteList.Count; i++)
                {
                    // end of index reached
                    if (i == whiteList.Count)
                    {
                        break;
                    }

                    int similiarity = LevenshteinDistance.Compute(dirName, whiteList[i]);
                    if (similiarity >= threshold)
                    {
                        break;
                    }
                    // low score means high similarity
                    if (similiarity <= threshold)
                    {
                        if (insertDirName)
                        {
                            AddSimilarEntry(dirName);
                            AddSimilarEntry(whiteList[i]);
                            AddBlackListEntry(whiteList[i]);
                            insertDirName = false;
                        }
                        else
                        {
                            AddBlackListEntry(whiteList[i]);
                        }
                    }
                }
            }
            Console.WriteLine(j);
        }
        watch.Stop();
        Console.WriteLine("Ms: " + watch.ElapsedMilliseconds);
        Console.WriteLine("Similar entries: " + similar.Count);
    }

比较字符串元素的最快方法

fastest way to compare string elements with each other

c#

string

performance

string-comparison