使用线程更快地解析多个 Html 页面
Using threads to parse multiple Html pages faster
这是我正在尝试做的事情:
- 从 url 中获取一个 html 页,其中包含多个 link 内部
- 访问每个link
- 从访问过的 link 中提取一些数据并使用它创建对象
到目前为止我所做的只是简单而缓慢的方式:
public List<Link> searchLinks(string name)
{
List<Link> foundLinks = new List<Link>();
// getHtmlDocument() just returns HtmlDocument using input url.
HtmlDocument doc = getHtmlDocument(AU_SEARCH_URL + fixSpaces(name));
var link_list = doc.DocumentNode.SelectNodes(@"/html/body/div[@id='parent-container']/div[@id='main-content']/ol[@id='searchresult']/li/h2/a");
foreach (var link in link_list)
{
// TODO Threads
// getObject() creates object using data gathered
foundLinks.Add(getObject(link.InnerText, link.Attributes["href"].Value, getLatestEpisode(link.Attributes["href"].Value)));
}
return foundLinks;
}
为了做到这一点 faster/efficient 我需要实现线程,但我不确定应该如何处理它,因为我不能随机启动线程,我需要等待它们完成, thread.Join() 解决了 'wait for threads to finish' 问题,但我认为它变得不再快,因为线程将在较早的线程完成后启动。
将工作卸载到多个线程的最简单方法是 use Parallel.ForEach()
in place of your current loop。像这样:
Parallel.ForEach(link_list, link =>
{
foundLinks.Add(getObject(link.InnerText, link.Attributes["href"].Value, getLatestEpisode(link.Attributes["href"].Value)));
});
我不确定您的整体代码中是否还有其他线程问题。 (例如,请注意,这将不再保证数据将以相同的顺序添加到 foundLinks
。)但只要没有明确阻止并发工作的发生,那么这将利用线程在多个 CPU 核心上处理工作。
也许你应该使用线程池:
来自 MSDN 的示例:
using System;
using System.Threading;
public class Fibonacci
{
private int _n;
private int _fibOfN;
private ManualResetEvent _doneEvent;
public int N { get { return _n; } }
public int FibOfN { get { return _fibOfN; } }
// Constructor.
public Fibonacci(int n, ManualResetEvent doneEvent)
{
_n = n;
_doneEvent = doneEvent;
}
// Wrapper method for use with thread pool.
public void ThreadPoolCallback(Object threadContext)
{
int threadIndex = (int)threadContext;
Console.WriteLine("thread {0} started...", threadIndex);
_fibOfN = Calculate(_n);
Console.WriteLine("thread {0} result calculated...", threadIndex);
_doneEvent.Set();
}
// Recursive method that calculates the Nth Fibonacci number.
public int Calculate(int n)
{
if (n <= 1)
{
return n;
}
return Calculate(n - 1) + Calculate(n - 2);
}
}
public class ThreadPoolExample
{
static void Main()
{
const int FibonacciCalculations = 10;
// One event is used for each Fibonacci object.
ManualResetEvent[] doneEvents = new ManualResetEvent[FibonacciCalculations];
Fibonacci[] fibArray = new Fibonacci[FibonacciCalculations];
Random r = new Random();
// Configure and start threads using ThreadPool.
Console.WriteLine("launching {0} tasks...", FibonacciCalculations);
for (int i = 0; i < FibonacciCalculations; i++)
{
doneEvents[i] = new ManualResetEvent(false);
Fibonacci f = new Fibonacci(r.Next(20, 40), doneEvents[i]);
fibArray[i] = f;
ThreadPool.QueueUserWorkItem(f.ThreadPoolCallback, i);
}
// Wait for all threads in pool to calculate.
WaitHandle.WaitAll(doneEvents);
Console.WriteLine("All calculations are complete.");
// Display the results.
for (int i= 0; i<FibonacciCalculations; i++)
{
Fibonacci f = fibArray[i];
Console.WriteLine("Fibonacci({0}) = {1}", f.N, f.FibOfN);
}
}
}
这是我正在尝试做的事情:
- 从 url 中获取一个 html 页,其中包含多个 link 内部
- 访问每个link
- 从访问过的 link 中提取一些数据并使用它创建对象
到目前为止我所做的只是简单而缓慢的方式:
public List<Link> searchLinks(string name)
{
List<Link> foundLinks = new List<Link>();
// getHtmlDocument() just returns HtmlDocument using input url.
HtmlDocument doc = getHtmlDocument(AU_SEARCH_URL + fixSpaces(name));
var link_list = doc.DocumentNode.SelectNodes(@"/html/body/div[@id='parent-container']/div[@id='main-content']/ol[@id='searchresult']/li/h2/a");
foreach (var link in link_list)
{
// TODO Threads
// getObject() creates object using data gathered
foundLinks.Add(getObject(link.InnerText, link.Attributes["href"].Value, getLatestEpisode(link.Attributes["href"].Value)));
}
return foundLinks;
}
为了做到这一点 faster/efficient 我需要实现线程,但我不确定应该如何处理它,因为我不能随机启动线程,我需要等待它们完成, thread.Join() 解决了 'wait for threads to finish' 问题,但我认为它变得不再快,因为线程将在较早的线程完成后启动。
将工作卸载到多个线程的最简单方法是 use Parallel.ForEach()
in place of your current loop。像这样:
Parallel.ForEach(link_list, link =>
{
foundLinks.Add(getObject(link.InnerText, link.Attributes["href"].Value, getLatestEpisode(link.Attributes["href"].Value)));
});
我不确定您的整体代码中是否还有其他线程问题。 (例如,请注意,这将不再保证数据将以相同的顺序添加到 foundLinks
。)但只要没有明确阻止并发工作的发生,那么这将利用线程在多个 CPU 核心上处理工作。
也许你应该使用线程池:
来自 MSDN 的示例:
using System;
using System.Threading;
public class Fibonacci
{
private int _n;
private int _fibOfN;
private ManualResetEvent _doneEvent;
public int N { get { return _n; } }
public int FibOfN { get { return _fibOfN; } }
// Constructor.
public Fibonacci(int n, ManualResetEvent doneEvent)
{
_n = n;
_doneEvent = doneEvent;
}
// Wrapper method for use with thread pool.
public void ThreadPoolCallback(Object threadContext)
{
int threadIndex = (int)threadContext;
Console.WriteLine("thread {0} started...", threadIndex);
_fibOfN = Calculate(_n);
Console.WriteLine("thread {0} result calculated...", threadIndex);
_doneEvent.Set();
}
// Recursive method that calculates the Nth Fibonacci number.
public int Calculate(int n)
{
if (n <= 1)
{
return n;
}
return Calculate(n - 1) + Calculate(n - 2);
}
}
public class ThreadPoolExample
{
static void Main()
{
const int FibonacciCalculations = 10;
// One event is used for each Fibonacci object.
ManualResetEvent[] doneEvents = new ManualResetEvent[FibonacciCalculations];
Fibonacci[] fibArray = new Fibonacci[FibonacciCalculations];
Random r = new Random();
// Configure and start threads using ThreadPool.
Console.WriteLine("launching {0} tasks...", FibonacciCalculations);
for (int i = 0; i < FibonacciCalculations; i++)
{
doneEvents[i] = new ManualResetEvent(false);
Fibonacci f = new Fibonacci(r.Next(20, 40), doneEvents[i]);
fibArray[i] = f;
ThreadPool.QueueUserWorkItem(f.ThreadPoolCallback, i);
}
// Wait for all threads in pool to calculate.
WaitHandle.WaitAll(doneEvents);
Console.WriteLine("All calculations are complete.");
// Display the results.
for (int i= 0; i<FibonacciCalculations; i++)
{
Fibonacci f = fibArray[i];
Console.WriteLine("Fibonacci({0}) = {1}", f.N, f.FibOfN);
}
}
}