转到存储在字符串中的每个 link 并列出所有 PDF link
Go to each link stored in string and list all PDF links
我第一次尝试创建程序。
我正在尝试访问一个网站,获取其中所有的 link,然后访问每个 link 并获取所有以 .pdf 结尾的 link。
我已经可以获得所有需要的 link。现在我想访问每个 link 并在其中搜索 PDF 文件。
Imports HtmlAgilityPack
' Console program that loads a page with HtmlAgilityPack and prints link hrefs.
Module Module1
' Entry point: collects the hrefs of the product-list anchors on the main page,
' then queries that same document for anchors under ul.dl-items and prints them.
Sub Main()
Dim mainUrl As String = "xxx"
Dim htmlDoc As HtmlDocument = New HtmlWeb().Load(mainUrl) ' Load the web page into an HtmlDocument
Dim listLinks As New List(Of String)
Dim srcs As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes("//ul[@class='products-list-page']//a") ' Anchors inside the products list
For Each src As HtmlNode In srcs
' Store each anchor's href in the list
listLinks.Add(src.Attributes("href").Value)
Next
' Intended next step: go through each stored link and list all .pdf links
' Copy the list into an array (arrayLinks is not read again in this block)
Dim arrayLinks() As String = listLinks.ToArray()
'Console.Read()
' NOTE(review): this query runs against htmlDoc (the page loaded from mainUrl),
' not against the pages whose URLs were stored in listLinks.
Dim scrapedsrcs As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes("//ul[@class='dl-items']//a")
For Each scrapedlink As HtmlNode In scrapedsrcs
' Show links in console
Console.WriteLine(scrapedlink.Attributes("href").Value) ' Print the href
Next
End Sub
End Module
如何实现?有人可以给我提示吗?
编辑:
首先,你没有遍历每个产品 link,也没有下载对应的 HTML 来查找 PDF 文件的下载 link。
可以这样实现:
' Visit every collected product page and print the href of each download link found on it.
' Assumes listLinks holds URLs that HtmlWeb.Load can fetch directly — TODO confirm they are absolute.
Dim web As New HtmlWeb() ' One loader reused for all pages instead of a new instance per iteration
For Each productLink As String In listLinks
    Dim prodDoc As HtmlDocument = web.Load(productLink)
    ' SelectNodes returns Nothing (not an empty collection) when no node matches.
    Dim scrapedsrcs As HtmlNodeCollection = prodDoc.DocumentNode.SelectNodes("//div[@class='dl-items']//a")
    If scrapedsrcs Is Nothing Then Continue For
    For Each scrapedlink As HtmlNode In scrapedsrcs
        ' GetAttributeValue falls back to "" for anchors without an href,
        ' where Attributes("href").Value would throw a NullReferenceException.
        Dim href As String = scrapedlink.GetAttributeValue("href", "")
        If href.Length > 0 Then Console.WriteLine($"-- {href}") ' Print the URL
    Next
Next
其次,用于下载 PDF 的 a 元素(link)包含在 div 中,而不是 ul 中。因此在选择节点时应使用:
' Selects every <a> descendant of any <div> whose class attribute is exactly 'dl-items'.
prodDoc.DocumentNode.SelectNodes("//div[@class='dl-items']//a")
或者您可以使用 * 只按 class 进行选择,而不考虑元素的类型,例如:
' Selects every <a> descendant of any element (whatever its tag) whose class attribute is exactly 'dl-items'.
prodDoc.DocumentNode.SelectNodes("//*[@class='dl-items']//a")
既然你并没有对这些 link 做其他处理,为什么不把代码写得更简洁一些呢?
例如:
Imports HtmlAgilityPack
' Console program that walks the product list page and prints the download links
' found in the 'dl-items' block of every product page.
Module Module1
    ' Entry point: loads the product list, follows each product anchor and
    ' writes the href of every download anchor to the console.
    Sub Main()
        Dim web As New HtmlWeb() ' One loader reused for every request

        ' The URL must not contain whitespace ("https: //..." is not a valid URL).
        Dim listDoc As HtmlDocument = web.Load("https://www.nordicwater.com/products/waste-water/")

        ' SelectNodes returns Nothing when no node matches, so guard before iterating.
        Dim productAnchors As HtmlNodeCollection = listDoc.DocumentNode.SelectNodes("//ul[@class='products-list-page']//a")
        If productAnchors Is Nothing Then Return

        For Each src As HtmlNode In productAnchors
            ' GetAttributeValue yields "" when the anchor has no href instead of throwing.
            Dim productUrl As String = src.GetAttributeValue("href", "")
            If productUrl.Length = 0 Then Continue For

            ' Keep the product page in its own variable rather than overwriting the list document.
            Dim productDoc As HtmlDocument = web.Load(productUrl)

            ' Select all download anchors, not only the first direct child, so every link is listed.
            Dim downloadAnchors As HtmlNodeCollection = productDoc.DocumentNode.SelectNodes("//div[@class='dl-items']//a")
            If downloadAnchors Is Nothing Then Continue For

            For Each anchor As HtmlNode In downloadAnchors
                Dim href As String = anchor.GetAttributeValue("href", "")
                If href.Length > 0 Then Console.WriteLine(href)
            Next
        Next
    End Sub
End Module
我第一次尝试创建程序。
我正在尝试访问一个网站,获取其中所有的 link,然后访问每个 link 并获取所有以 .pdf 结尾的 link。
我已经可以获得所有需要的 link。现在我想访问每个 link 并在其中搜索 PDF 文件。
Imports HtmlAgilityPack
' Console program that loads a page with HtmlAgilityPack and prints link hrefs.
Module Module1
' Entry point: collects the hrefs of the product-list anchors on the main page,
' then queries that same document for anchors under ul.dl-items and prints them.
Sub Main()
Dim mainUrl As String = "xxx"
Dim htmlDoc As HtmlDocument = New HtmlWeb().Load(mainUrl) ' Load the web page into an HtmlDocument
Dim listLinks As New List(Of String)
Dim srcs As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes("//ul[@class='products-list-page']//a") ' Anchors inside the products list
For Each src As HtmlNode In srcs
' Store each anchor's href in the list
listLinks.Add(src.Attributes("href").Value)
Next
' Intended next step: go through each stored link and list all .pdf links
' Copy the list into an array (arrayLinks is not read again in this block)
Dim arrayLinks() As String = listLinks.ToArray()
'Console.Read()
' NOTE(review): this query runs against htmlDoc (the page loaded from mainUrl),
' not against the pages whose URLs were stored in listLinks.
Dim scrapedsrcs As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes("//ul[@class='dl-items']//a")
For Each scrapedlink As HtmlNode In scrapedsrcs
' Show links in console
Console.WriteLine(scrapedlink.Attributes("href").Value) ' Print the href
Next
End Sub
End Module
如何实现?有人可以给我提示吗?
编辑:
首先,你没有遍历每个产品 link,也没有下载对应的 HTML 来查找 PDF 文件的下载 link。
可以这样实现:
' Visit every collected product page and print the href of each download link found on it.
' Assumes listLinks holds URLs that HtmlWeb.Load can fetch directly — TODO confirm they are absolute.
Dim web As New HtmlWeb() ' One loader reused for all pages instead of a new instance per iteration
For Each productLink As String In listLinks
    Dim prodDoc As HtmlDocument = web.Load(productLink)
    ' SelectNodes returns Nothing (not an empty collection) when no node matches.
    Dim scrapedsrcs As HtmlNodeCollection = prodDoc.DocumentNode.SelectNodes("//div[@class='dl-items']//a")
    If scrapedsrcs Is Nothing Then Continue For
    For Each scrapedlink As HtmlNode In scrapedsrcs
        ' GetAttributeValue falls back to "" for anchors without an href,
        ' where Attributes("href").Value would throw a NullReferenceException.
        Dim href As String = scrapedlink.GetAttributeValue("href", "")
        If href.Length > 0 Then Console.WriteLine($"-- {href}") ' Print the URL
    Next
Next
其次,用于下载 PDF 的 a 元素(link)包含在 div 中,而不是 ul 中。因此在选择节点时应使用:
' Selects every <a> descendant of any <div> whose class attribute is exactly 'dl-items'.
prodDoc.DocumentNode.SelectNodes("//div[@class='dl-items']//a")
或者您可以使用 * 只按 class 进行选择,而不考虑元素的类型,例如:
' Selects every <a> descendant of any element (whatever its tag) whose class attribute is exactly 'dl-items'.
prodDoc.DocumentNode.SelectNodes("//*[@class='dl-items']//a")
既然你并没有对这些 link 做其他处理,为什么不把代码写得更简洁一些呢?
例如:
Imports HtmlAgilityPack
' Console program that walks the product list page and prints the download links
' found in the 'dl-items' block of every product page.
Module Module1
    ' Entry point: loads the product list, follows each product anchor and
    ' writes the href of every download anchor to the console.
    Sub Main()
        Dim web As New HtmlWeb() ' One loader reused for every request

        ' The URL must not contain whitespace ("https: //..." is not a valid URL).
        Dim listDoc As HtmlDocument = web.Load("https://www.nordicwater.com/products/waste-water/")

        ' SelectNodes returns Nothing when no node matches, so guard before iterating.
        Dim productAnchors As HtmlNodeCollection = listDoc.DocumentNode.SelectNodes("//ul[@class='products-list-page']//a")
        If productAnchors Is Nothing Then Return

        For Each src As HtmlNode In productAnchors
            ' GetAttributeValue yields "" when the anchor has no href instead of throwing.
            Dim productUrl As String = src.GetAttributeValue("href", "")
            If productUrl.Length = 0 Then Continue For

            ' Keep the product page in its own variable rather than overwriting the list document.
            Dim productDoc As HtmlDocument = web.Load(productUrl)

            ' Select all download anchors, not only the first direct child, so every link is listed.
            Dim downloadAnchors As HtmlNodeCollection = productDoc.DocumentNode.SelectNodes("//div[@class='dl-items']//a")
            If downloadAnchors Is Nothing Then Continue For

            For Each anchor As HtmlNode In downloadAnchors
                Dim href As String = anchor.GetAttributeValue("href", "")
                If href.Length > 0 Then Console.WriteLine(href)
            Next
        Next
    End Sub
End Module