为什么我无法使用带 VBA 的 MSXML2 将 HTML 类名添加到元素集合
Why am I not able to add an HTML Classname to an Element Collection using MSXML2 with VBA
我尝试了各种帖子中的许多行之有效的方法来从网页中获取一些数据,但都没有成功。我能够在起始页面上获得链接项目的列表,但是一旦我导航到任何其他页面,我就会用下面的代码绘制一个空白。
当我 运行 代码时,我在 Cats 中没有得到任何结果。
Sub Main()
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim Cats As MSHTML.IHTMLElementCollection
Dim Cat As MSHTML.IHTMLElement
Dim NextHref As String
Dim NextURL As String
XMLReq.Open "GET", URL, False
XMLReq.send
If XMLReq.Status <> 200 Then
MsgBox "Problem"
Exit Sub
End If
HTMLDoc.body.innerHTML = XMLReq.responseText
Set XMLReq = Nothing
Set Cats = HTMLDoc.getElementsByClassName("ng-tns-c329-5 product-grid--tile ng-star-inserted")
Debug.Print Cats.Length 'Returns 0
'For Each Cat In Cats
' NextHref = Cat.getAttribute("href")
' NextURL = URL & Mid(NextHref, InStr(NextHref, ":") + 2)
' ListItemsInCats Cat.innerText, NextURL
'Next Cat
End Sub
Expanded Element structure
Collased structure
感谢您的帮助。
您尝试抓取的网站存在以下问题:
在 XMLHTTP 请求方法中 - 产品详细信息是从 Fetch/XHR 中提取的动态内容,而 XMLHTTP 没有 运行,XMLHTTP 只为您提供 HTML 文档,因为它没有任何脚本 运行ning.
在 Internet Explorer 方法中 - 在实际加载产品详细信息之前网页被视为准备就绪,因此 Busy
和 ReadyState
的常规循环检查是不够的。
下面的代码使用 Internet Explorer 并解决上面提到的问题,我已经进行了一些检查(我认为这并不完美,但它在我的测试中到目前为止有效)将等到第一个产品在继续拉取产品详细信息之前加载:
Private Sub GetBakeryProducts()
Const URL As String = "https://www.woolworths.com.au/shop/browse/bakery"
Dim ieObj As InternetExplorer
Set ieObj = New InternetExplorer
ieObj.navigate URL
ieObj.Visible = True
Do While ieObj.Busy Or ieObj.readyState <> READYSTATE_COMPLETE
DoEvents
Loop
Do While ieObj.document.getElementsByClassName("productCarousel-header").Length = 0
DoEvents
Loop
Dim ieDoc As MSHTML.HTMLDocument
Set ieDoc = ieObj.document
Dim productList As Object
Set productList = ieDoc.getElementsByClassName("product-grid--tile")
'==== Test if the website has finish loading the 1st product details
On Error Resume Next
Dim testStatus As String
Do
Err.Clear
testStatus = productList(0).getElementsByClassName("shelfProductTile-descriptionLink")(0).innerText
Loop Until Err.Number = 0
'====
Dim outputArr() As String
ReDim outputArr(1 To productList.Length, 1 To 2) As String
Dim outputIndex As Long
Dim i As Long
For i = 0 To productList.Length - 1
If productList(i).getElementsByClassName("shelfProductTile-descriptionLink").Length <> 0 Then
If Err.Number <> 0 Then
Err.Clear
Exit For
End If
Dim productName As String
Dim productPrice As String
productName = productList(i).getElementsByClassName("shelfProductTile-descriptionLink")(0).innerText
productPrice = Replace(productList(i).getElementsByClassName("price")(0).innerText, vbNewLine, vbNullString)
outputIndex = outputIndex + 1
outputArr(outputIndex, 1) = productName
outputArr(outputIndex, 2) = productPrice
End If
Next i
ReDim Preserve outputArr(1 To outputIndex, 1 To 2) As String
ieObj.Quit
Set ieObj = Nothing
ThisWorkbook.Sheets("Sheet1").Range("A1").Resize(outputIndex, UBound(outputArr, 2)).Value = outputArr
End Sub
运行 这将从网站提取数据并将输出从单元格 A1
开始粘贴到 Sheet1
,请根据需要更改工作表名称和范围。
我尝试了各种帖子中的许多行之有效的方法来从网页中获取一些数据,但都没有成功。我能够在起始页面上获得链接项目的列表,但是一旦我导航到任何其他页面,我就会用下面的代码绘制一个空白。
当我 运行 代码时,我在 Cats 中没有得到任何结果。
Sub Main()
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim Cats As MSHTML.IHTMLElementCollection
Dim Cat As MSHTML.IHTMLElement
Dim NextHref As String
Dim NextURL As String
XMLReq.Open "GET", URL, False
XMLReq.send
If XMLReq.Status <> 200 Then
MsgBox "Problem"
Exit Sub
End If
HTMLDoc.body.innerHTML = XMLReq.responseText
Set XMLReq = Nothing
Set Cats = HTMLDoc.getElementsByClassName("ng-tns-c329-5 product-grid--tile ng-star-inserted")
Debug.Print Cats.Length 'Returns 0
'For Each Cat In Cats
' NextHref = Cat.getAttribute("href")
' NextURL = URL & Mid(NextHref, InStr(NextHref, ":") + 2)
' ListItemsInCats Cat.innerText, NextURL
'Next Cat
End Sub
Expanded Element structure
Collased structure
感谢您的帮助。
您尝试抓取的网站存在以下问题:
在 XMLHTTP 请求方法中 - 产品详细信息是从 Fetch/XHR 中提取的动态内容,而 XMLHTTP 没有 运行,XMLHTTP 只为您提供 HTML 文档,因为它没有任何脚本 运行ning.
在 Internet Explorer 方法中 - 在实际加载产品详细信息之前网页被视为准备就绪,因此 Busy
和 ReadyState
的常规循环检查是不够的。
下面的代码使用 Internet Explorer 并解决上面提到的问题,我已经进行了一些检查(我认为这并不完美,但它在我的测试中到目前为止有效)将等到第一个产品在继续拉取产品详细信息之前加载:
Private Sub GetBakeryProducts()
Const URL As String = "https://www.woolworths.com.au/shop/browse/bakery"
Dim ieObj As InternetExplorer
Set ieObj = New InternetExplorer
ieObj.navigate URL
ieObj.Visible = True
Do While ieObj.Busy Or ieObj.readyState <> READYSTATE_COMPLETE
DoEvents
Loop
Do While ieObj.document.getElementsByClassName("productCarousel-header").Length = 0
DoEvents
Loop
Dim ieDoc As MSHTML.HTMLDocument
Set ieDoc = ieObj.document
Dim productList As Object
Set productList = ieDoc.getElementsByClassName("product-grid--tile")
'==== Test if the website has finish loading the 1st product details
On Error Resume Next
Dim testStatus As String
Do
Err.Clear
testStatus = productList(0).getElementsByClassName("shelfProductTile-descriptionLink")(0).innerText
Loop Until Err.Number = 0
'====
Dim outputArr() As String
ReDim outputArr(1 To productList.Length, 1 To 2) As String
Dim outputIndex As Long
Dim i As Long
For i = 0 To productList.Length - 1
If productList(i).getElementsByClassName("shelfProductTile-descriptionLink").Length <> 0 Then
If Err.Number <> 0 Then
Err.Clear
Exit For
End If
Dim productName As String
Dim productPrice As String
productName = productList(i).getElementsByClassName("shelfProductTile-descriptionLink")(0).innerText
productPrice = Replace(productList(i).getElementsByClassName("price")(0).innerText, vbNewLine, vbNullString)
outputIndex = outputIndex + 1
outputArr(outputIndex, 1) = productName
outputArr(outputIndex, 2) = productPrice
End If
Next i
ReDim Preserve outputArr(1 To outputIndex, 1 To 2) As String
ieObj.Quit
Set ieObj = Nothing
ThisWorkbook.Sheets("Sheet1").Range("A1").Resize(outputIndex, UBound(outputArr, 2)).Value = outputArr
End Sub
运行 这将从网站提取数据并将输出从单元格 A1
开始粘贴到 Sheet1
,请根据需要更改工作表名称和范围。