C# 如何遍历一个列表,更新同一个列表,然后继续循环?
C# How to loop through a list, update the same list, and continue looping?
我正在编写一个抓取特定 URL 并将其添加到列表中的网络抓取工具。
using HtmlAgilityPack;
// Seed the work list with the link texts found on the start page.
List<string> mylist = new List<string>();
var firstUrl = "http://example.com";
HtmlWeb web = new HtmlWeb();
HtmlDocument document = web.Load(firstUrl);
HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
// SelectNodes returns null rather than an empty collection when nothing
// matches, so guard before enumerating to avoid a NullReferenceException.
if (nodes != null)
{
    foreach (HtmlNode htmlNode in nodes)
    {
        // Keep each link text only once.
        if (!mylist.Contains(htmlNode.InnerText))
        {
            mylist.Add(htmlNode.InnerText);
        }
    }
}
此时我想做的是遍历 'mylist' 并做完全相同的事情,基本上永远继续下去。该代码应该采用新解析的 URL 并将它们添加到列表中。最简单的方法是什么?
我尝试在上面的循环之后创建一个 for 循环。但它似乎并没有更新列表。它只会继续循环遍历列表中已经存在的相同项目(因为 i 将始终小于 mylist.Count)
// Work-list crawl: mylist.Count is re-read on every iteration, so entries
// appended inside the loop are visited later by the same loop.
// One HtmlWeb is reused for every request, and the locals use their own names
// so they cannot clash with variables of an enclosing scope (error CS0136).
HtmlWeb crawler = new HtmlWeb();
for (int i = 0; i < mylist.Count; i++)
{
    // Each stored link text is appended to the site root to form the page address.
    var urls = "http://example.com" + mylist[i];
    HtmlDocument page = crawler.Load(urls);
    HtmlNodeCollection links = page.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
    // SelectNodes returns null when the page has no matching links.
    if (links == null)
    {
        continue;
    }
    foreach (HtmlNode htmlNode in links)
    {
        if (!mylist.Contains(htmlNode.InnerText))
        {
            mylist.Add(htmlNode.InnerText);
        }
    }
}
谢谢!
Queue(队列)符合您的要求。
// FIFO work list holding the link texts that still have to be visited.
var mylist = new Queue<string>();
第一遍:
using HtmlAgilityPack;
// First pass: queue the link texts found on the start page.
Queue<string> mylist = new Queue<string>();
var firstUrl = "http://example.com";
HtmlWeb web = new HtmlWeb();
HtmlDocument document = web.Load(firstUrl);
HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
// SelectNodes returns null rather than an empty collection when nothing
// matches, so guard before enumerating to avoid a NullReferenceException.
if (nodes != null)
{
    foreach (HtmlNode htmlNode in nodes)
    {
        // Queue each link text only once.
        if (!mylist.Contains(htmlNode.InnerText))
        {
            mylist.Enqueue(htmlNode.InnerText);
        }
    }
}
现在是第二遍:
// Second pass: keep taking entries off the queue until none are left.
// Remember every link text ever queued. Queue.Contains only sees entries that
// are still pending, so without this set a page linking back to an already
// dequeued entry would be queued again and the loop might never finish.
HashSet<string> seen = new HashSet<string>(mylist);
// One HtmlWeb is reused for every request, and the locals use their own names
// so they cannot clash with variables of an enclosing scope (error CS0136).
HtmlWeb pageLoader = new HtmlWeb();
while (mylist.Count > 0)
{
    var url = mylist.Dequeue();
    // Each queued link text is appended to the site root to form the page address.
    var urls = "http://example.com" + url;
    HtmlDocument page = pageLoader.Load(urls);
    HtmlNodeCollection links = page.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
    // SelectNodes returns null when the page has no matching links.
    if (links == null)
    {
        continue;
    }
    foreach (HtmlNode htmlNode in links)
    {
        // Add returns false for a text that was queued before.
        if (seen.Add(htmlNode.InnerText))
        {
            mylist.Enqueue(htmlNode.InnerText);
        }
    }
}
一个可能的(危险的?)使用递归的实现,它只在结果被枚举时才惰性地生成 url:
/// <summary>
/// Entry point of the recursive crawl: creates the HtmlWeb loader and the set of
/// already-followed link texts, then hands both to ScrapImpl.
/// </summary>
/// <param name="url">Address of the first page to load.</param>
/// <returns>The sequence produced by ScrapImpl for that page.</returns>
public IEnumerable<string> Scrap(string url)
{
    return ScrapImpl(new HtmlWeb(), new HashSet<string>(), url);
}
/// <summary>
/// Loads <paramref name="baseUrl"/>, yields the text of every matching link and
/// recursively does the same for each link text not followed before.
/// </summary>
/// <param name="web">Loader used for every page of the crawl.</param>
/// <param name="seenUrls">Link texts that have already been followed; updated in place.</param>
/// <param name="baseUrl">Address of the page to load.</param>
/// <returns>A lazily evaluated sequence of link texts.</returns>
private IEnumerable<string> ScrapImpl(HtmlWeb web, HashSet<string> seenUrls, string baseUrl)
{
    var document = web.Load(baseUrl);
    var nodes = document.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
    // SelectNodes returns null rather than an empty collection when nothing
    // matches; a page without links simply contributes nothing.
    if (nodes == null)
    {
        yield break;
    }
    foreach (var node in nodes)
    {
        // Every occurrence is reported, including repeats of a known text.
        yield return node.InnerText;
        // Add returns true only the first time, so each text is followed once.
        if (seenUrls.Add(node.InnerText))
        {
            // The child address is the current page address plus the link text.
            foreach (var childUrl in ScrapImpl(web, seenUrls, baseUrl + node.InnerText))
            {
                yield return childUrl;
            }
        }
    }
}
用法:
var urls = Scrap("http://example.com"); // no page has been loaded yet
foreach(var url in urls) // http://example.com starts being scraped at this point
{
...
}
通过 NuGet 安装 "System.Interactive" 包,然后执行以下操作:
// Link texts that have already been scheduled; shared by every expansion step.
var found = new HashSet<string>();
var urls =
    EnumerableEx
        .Expand(
            new[] { "http://example.com" },
            url =>
            {
                var web = new HtmlWeb();
                var document = web.Load(url);
                var nodes = document.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
                // SelectNodes returns null when nothing matches; such a page
                // contributes no further addresses instead of throwing.
                if (nodes == null)
                {
                    return Enumerable.Empty<string>();
                }
                return
                    nodes
                        .Select(x => x.InnerText)
                        // HashSet.Add is true only for a text not seen before,
                        // so it filters and records in a single step.
                        .Where(x => found.Add(x))
                        .Select(x => "http://example.com" + x);
            });
我正在编写一个抓取特定 URL 并将其添加到列表中的网络抓取工具。
using HtmlAgilityPack;
// Seed the work list with the link texts found on the start page.
List<string> mylist = new List<string>();
var firstUrl = "http://example.com";
HtmlWeb web = new HtmlWeb();
HtmlDocument document = web.Load(firstUrl);
HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
// SelectNodes returns null rather than an empty collection when nothing
// matches, so guard before enumerating to avoid a NullReferenceException.
if (nodes != null)
{
    foreach (HtmlNode htmlNode in nodes)
    {
        // Keep each link text only once.
        if (!mylist.Contains(htmlNode.InnerText))
        {
            mylist.Add(htmlNode.InnerText);
        }
    }
}
此时我想做的是遍历 'mylist' 并做完全相同的事情,基本上永远继续下去。该代码应该采用新解析的 URL 并将它们添加到列表中。最简单的方法是什么?
我尝试在上面的循环之后创建一个 for 循环。但它似乎并没有更新列表。它只会继续循环遍历列表中已经存在的相同项目(因为 i 将始终小于 mylist.Count)
// Work-list crawl: mylist.Count is re-read on every iteration, so entries
// appended inside the loop are visited later by the same loop.
// One HtmlWeb is reused for every request, and the locals use their own names
// so they cannot clash with variables of an enclosing scope (error CS0136).
HtmlWeb crawler = new HtmlWeb();
for (int i = 0; i < mylist.Count; i++)
{
    // Each stored link text is appended to the site root to form the page address.
    var urls = "http://example.com" + mylist[i];
    HtmlDocument page = crawler.Load(urls);
    HtmlNodeCollection links = page.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
    // SelectNodes returns null when the page has no matching links.
    if (links == null)
    {
        continue;
    }
    foreach (HtmlNode htmlNode in links)
    {
        if (!mylist.Contains(htmlNode.InnerText))
        {
            mylist.Add(htmlNode.InnerText);
        }
    }
}
谢谢!
Queue(队列)符合您的要求。
// FIFO work list holding the link texts that still have to be visited.
var mylist = new Queue<string>();
第一遍:
using HtmlAgilityPack;
// First pass: queue the link texts found on the start page.
Queue<string> mylist = new Queue<string>();
var firstUrl = "http://example.com";
HtmlWeb web = new HtmlWeb();
HtmlDocument document = web.Load(firstUrl);
HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
// SelectNodes returns null rather than an empty collection when nothing
// matches, so guard before enumerating to avoid a NullReferenceException.
if (nodes != null)
{
    foreach (HtmlNode htmlNode in nodes)
    {
        // Queue each link text only once.
        if (!mylist.Contains(htmlNode.InnerText))
        {
            mylist.Enqueue(htmlNode.InnerText);
        }
    }
}
现在是第二遍:
// Second pass: keep taking entries off the queue until none are left.
// Remember every link text ever queued. Queue.Contains only sees entries that
// are still pending, so without this set a page linking back to an already
// dequeued entry would be queued again and the loop might never finish.
HashSet<string> seen = new HashSet<string>(mylist);
// One HtmlWeb is reused for every request, and the locals use their own names
// so they cannot clash with variables of an enclosing scope (error CS0136).
HtmlWeb pageLoader = new HtmlWeb();
while (mylist.Count > 0)
{
    var url = mylist.Dequeue();
    // Each queued link text is appended to the site root to form the page address.
    var urls = "http://example.com" + url;
    HtmlDocument page = pageLoader.Load(urls);
    HtmlNodeCollection links = page.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
    // SelectNodes returns null when the page has no matching links.
    if (links == null)
    {
        continue;
    }
    foreach (HtmlNode htmlNode in links)
    {
        // Add returns false for a text that was queued before.
        if (seen.Add(htmlNode.InnerText))
        {
            mylist.Enqueue(htmlNode.InnerText);
        }
    }
}
一个可能的(危险的?)使用递归的实现,它只在结果被枚举时才惰性地生成 url:
/// <summary>
/// Entry point of the recursive crawl: creates the HtmlWeb loader and the set of
/// already-followed link texts, then hands both to ScrapImpl.
/// </summary>
/// <param name="url">Address of the first page to load.</param>
/// <returns>The sequence produced by ScrapImpl for that page.</returns>
public IEnumerable<string> Scrap(string url)
{
    return ScrapImpl(new HtmlWeb(), new HashSet<string>(), url);
}
/// <summary>
/// Loads <paramref name="baseUrl"/>, yields the text of every matching link and
/// recursively does the same for each link text not followed before.
/// </summary>
/// <param name="web">Loader used for every page of the crawl.</param>
/// <param name="seenUrls">Link texts that have already been followed; updated in place.</param>
/// <param name="baseUrl">Address of the page to load.</param>
/// <returns>A lazily evaluated sequence of link texts.</returns>
private IEnumerable<string> ScrapImpl(HtmlWeb web, HashSet<string> seenUrls, string baseUrl)
{
    var document = web.Load(baseUrl);
    var nodes = document.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
    // SelectNodes returns null rather than an empty collection when nothing
    // matches; a page without links simply contributes nothing.
    if (nodes == null)
    {
        yield break;
    }
    foreach (var node in nodes)
    {
        // Every occurrence is reported, including repeats of a known text.
        yield return node.InnerText;
        // Add returns true only the first time, so each text is followed once.
        if (seenUrls.Add(node.InnerText))
        {
            // The child address is the current page address plus the link text.
            foreach (var childUrl in ScrapImpl(web, seenUrls, baseUrl + node.InnerText))
            {
                yield return childUrl;
            }
        }
    }
}
用法:
var urls = Scrap("http://example.com"); // no page has been loaded yet
foreach(var url in urls) // http://example.com starts being scraped at this point
{
...
}
通过 NuGet 安装 "System.Interactive" 包,然后执行以下操作:
// Link texts that have already been scheduled; shared by every expansion step.
var found = new HashSet<string>();
var urls =
    EnumerableEx
        .Expand(
            new[] { "http://example.com" },
            url =>
            {
                var web = new HtmlWeb();
                var document = web.Load(url);
                var nodes = document.DocumentNode.SelectNodes("//div[contains(@class,'Name')]/a");
                // SelectNodes returns null when nothing matches; such a page
                // contributes no further addresses instead of throwing.
                if (nodes == null)
                {
                    return Enumerable.Empty<string>();
                }
                return
                    nodes
                        .Select(x => x.InnerText)
                        // HashSet.Add is true only for a text not seen before,
                        // so it filters and records in a single step.
                        .Where(x => found.Add(x))
                        .Select(x => "http://example.com" + x);
            });