使用线程更快地解析多个 Html 页面

Using threads to parse multiple Html pages faster

这是我正在尝试做的事情:

  1. 从 url 获取一个 html 页面,该页面内部包含多个链接(link)
  2. 访问每个link
  3. 从访问过的 link 中提取一些数据并使用它创建对象

到目前为止我所做的只是简单而缓慢的方式:

/// <summary>
/// Fetches the search-result page for <paramref name="name"/> and builds one
/// <c>Link</c> per result anchor found on it.
/// </summary>
/// <param name="name">Search term; it is passed through fixSpaces() and appended to AU_SEARCH_URL.</param>
/// <returns>
/// The objects created by getObject(), in the order the anchors are enumerated;
/// an empty list when the page contains no matching anchors.
/// </returns>
public List<Link> searchLinks(string name)
    {
        List<Link> foundLinks = new List<Link>();
        // getHtmlDocument() just returns HtmlDocument using input url.
        HtmlDocument doc = getHtmlDocument(AU_SEARCH_URL + fixSpaces(name));
        var link_list = doc.DocumentNode.SelectNodes(@"/html/body/div[@id='parent-container']/div[@id='main-content']/ol[@id='searchresult']/li/h2/a");

        // NOTE(review): assuming HtmlDocument is HtmlAgilityPack's, SelectNodes returns
        // null (not an empty collection) when nothing matches, so guard before iterating.
        if (link_list == null)
        {
            return foundLinks;
        }

        foreach (var link in link_list)
        {
            // TODO Threads

            // Read the href once; it is needed both for the object itself and for the episode lookup.
            string href = link.Attributes["href"].Value;

            // getObject() creates object using data gathered
            foundLinks.Add(getObject(link.InnerText, href, getLatestEpisode(href)));
        }
        return foundLinks;
    }

为了让这个过程更快、更高效,我需要使用线程,但我不确定应该如何处理:我不能随意启动线程,还需要等待它们全部完成。thread.Join() 可以解决“等待线程完成”的问题,但我认为这样并不会更快,因为后一个线程要等前一个线程完成之后才会启动。

将工作分配到多个线程的最简单方法,是用 Parallel.ForEach() 替换当前的循环。像这样:

// List<T> is not safe for concurrent writers, so only the Add call is serialized;
// the expensive per-link work still runs in parallel outside the lock.
object addGate = new object();

Parallel.ForEach(link_list, link =>
{
    string href = link.Attributes["href"].Value;
    var found = getObject(link.InnerText, href, getLatestEpisode(href));

    lock (addGate)
    {
        foundLinks.Add(found);
    }
});

我不确定您的整体代码中是否还存在其他线程问题。(例如,请注意,这样将不再保证数据以相同的顺序添加到 foundLinks;另外 List<T> 本身不是线程安全的,并发调用 Add 时需要加锁或改用线程安全的集合。)但只要没有东西明确阻止工作并发执行,这种做法就会利用线程在多个 CPU 核心上处理工作。

也许你应该使用线程池:

来自 MSDN 的示例:

using System;
using System.Threading;

/// <summary>
/// A unit of work that computes one Fibonacci number and signals an event
/// once the result has been stored.
/// </summary>
public class Fibonacci
{
    // Index of the Fibonacci number this instance computes.
    private readonly int _n;

    // Set at the end of ThreadPoolCallback so a waiter knows the result is ready.
    private readonly ManualResetEvent _doneEvent;

    // Result of Calculate(_n); stays 0 until ThreadPoolCallback has run.
    private int _fibOfN;

    /// <summary>Creates a work item for the given index.</summary>
    /// <param name="n">Index of the Fibonacci number to compute.</param>
    /// <param name="doneEvent">Event that is set when the computation has finished.</param>
    public Fibonacci(int n, ManualResetEvent doneEvent)
    {
        this._n = n;
        this._doneEvent = doneEvent;
    }

    /// <summary>The index passed to the constructor.</summary>
    public int N
    {
        get { return this._n; }
    }

    /// <summary>The value stored by <see cref="ThreadPoolCallback"/>.</summary>
    public int FibOfN
    {
        get { return this._fibOfN; }
    }

    /// <summary>
    /// Entry point with the signature expected by the thread pool: logs progress,
    /// computes the result, then signals the done event.
    /// </summary>
    /// <param name="threadContext">A boxed <c>int</c> used only to label the console output.</param>
    public void ThreadPoolCallback(Object threadContext)
    {
        int workerId = (int)threadContext;

        Console.WriteLine("thread {0} started...", workerId);
        this._fibOfN = this.Calculate(this._n);
        Console.WriteLine("thread {0} result calculated...", workerId);

        // Signal last, after the result has been written.
        this._doneEvent.Set();
    }

    /// <summary>Recursively computes the Nth Fibonacci number.</summary>
    /// <param name="n">Index to compute; any value of 1 or less is returned unchanged.</param>
    /// <returns>The sum of the two preceding Fibonacci numbers, or <paramref name="n"/> itself when it is 1 or less.</returns>
    public int Calculate(int n)
    {
        return n <= 1
            ? n
            : this.Calculate(n - 1) + this.Calculate(n - 2);
    }
}

/// <summary>
/// Demo program: queues a batch of Fibonacci computations on the thread pool,
/// waits for all of them, and prints the results.
/// </summary>
public class ThreadPoolExample
{
    static void Main()
    {
        const int FibonacciCalculations = 10;

        // One event is used for each Fibonacci object.
        ManualResetEvent[] doneEvents = new ManualResetEvent[FibonacciCalculations];
        Fibonacci[] fibArray = new Fibonacci[FibonacciCalculations];
        Random r = new Random();

        // Configure and start threads using ThreadPool.
        Console.WriteLine("launching {0} tasks...", FibonacciCalculations);
        for (int i = 0; i < FibonacciCalculations; i++)
        {
            doneEvents[i] = new ManualResetEvent(false);
            Fibonacci f = new Fibonacci(r.Next(20, 40), doneEvents[i]);
            fibArray[i] = f;
            // i is passed as the callback state; the callback uses it only to label its output.
            ThreadPool.QueueUserWorkItem(f.ThreadPoolCallback, i);
        }

        // Wait for all threads in pool to calculate.
        // WaitAll accepts at most 64 handles, which is enough for this batch size.
        WaitHandle.WaitAll(doneEvents);
        Console.WriteLine("All calculations are complete.");

        // Every callback has signalled by now, so the events are no longer needed;
        // release their OS handles instead of leaving them to the finalizer.
        foreach (ManualResetEvent doneEvent in doneEvents)
        {
            doneEvent.Dispose();
        }

        // Display the results.
        for (int i = 0; i < FibonacciCalculations; i++)
        {
            Fibonacci f = fibArray[i];
            Console.WriteLine("Fibonacci({0}) = {1}", f.N, f.FibOfN);
        }
    }
}