vba,getElementsByClassName,HTMLSource的双引号没了
vba, getElementsByClassName, HTMLSource's double quotation marks are gone
我用 vba 抓取一些网站是为了好玩,我使用 VBA 作为工具。我使用 XMLHTTP 和 HTMLDocument(因为它比 internetExplorer.Application 更快)。
' Question listing, reproduced as posted. It issues a synchronous GET for
' postURL, loads the response text into an HTMLDocument, then loops over the
' elements returned by getElementsByClassName("post_glass post_micro_glass")
' and writes values from each one to the worksheet.
' The author reports run-time error 438 at the line marked below.
Public Sub XMLhtmlDocumentHTMLSourceScraper()
Dim XMLHTTPReq As Object
Dim htmlDoc As HTMLDocument
Dim postURL As String
' NOTE: i, varTemp, vr and varTemp2 are used below but never declared in this listing.
postURL = "http://foodffs.tumblr.com/archive/2015/11"
Set XMLHTTPReq = New MSXML2.XMLHTTP
With XMLHTTPReq
' Third argument False requests a synchronous call, so .Send blocks until the response arrives.
.Open "GET", postURL, False
.Send
End With
Set htmlDoc = New HTMLDocument
With htmlDoc
' Parse the downloaded markup by assigning it to the document body.
.body.innerHTML = XMLHTTPReq.responseText
End With
' i is the zero-based index of the current post; output goes to row i + 1.
i = 0
Set varTemp = htmlDoc.getElementsByClassName("post_glass post_micro_glass")
For Each vr In varTemp
' *1 - diagnostic: dump this element's markup into cell A1 (overwritten on every pass).
Cells(1, 1) = vr.outerHTML
' NOTE(review): the argument is an attribute-style string rather than a bare tag
' name such as "span" - presumably it matches nothing; confirm.
Set varTemp2 = vr.getElementsByTagName("SPAN class=post_date")
Cells(i + 1, 3) = varTemp2.Item(0).innerText
' The author reports that the next line raises run-time error 438.
Set varTemp2 = vr.getElementsByClassName("hover_inner")
' NOTE(review): no item is selected from varTemp2 here, unlike the .Item(0) call above.
Cells(i + 1, 4) = varTemp2.innerText
i = i + 1
Next vr
End Sub
我是借助 *1 那一行来排查这个问题的,
Cells(1, 1) 中显示的内容如下:
<DIV class="post_glass post_micro_glass" title=""><A class=hover title="" href="http://foodffs.tumblr.com/post/134291668251/sugar-free-low-carb-coffee-ricotta-mousse-really" target=_blank>
<DIV class=hover_inner><SPAN class=post_date>...............
是的,所有 class 属性值的双引号都丢失了,只有第一个(最外层 DIV 的)class 还带着双引号。
真不知道为什么会出现这种情况
//我可以通过 getElementsByTagName("span") 进行语法分析。但我更喜欢"class"标签.....
getElementsByClassName 并不是元素自身可用的方法,只有父级 HTMLDocument 才提供它。如果您想用它来定位某个 DIV 元素内部的元素,就需要用该 DIV 元素的 .outerHTML 创建一个子 HTMLDocument。
' Downloads the Tumblr archive page and appends one row per post tile to
' Sheet1: column C = post date, column D = post notes, column E = hover text.
'
' Requires references to the Microsoft XML library (MSXML2) and the
' Microsoft HTML Object Library (MSHTML).
Public Sub XMLhtmlDocumentHTMLSourceScraper()
    ' Explicit Set ... New is used instead of "Dim ... As New" so the objects
    ' are created once, at a known point, rather than on first touch.
    Dim xmlHTTPReq As MSXML2.XMLHTTP
    Dim htmlDOC As HTMLDocument, divSUBDOC As HTMLDocument
    Dim postTiles As Object, tile As Object
    Dim tileSpans As Object, hoverEls As Object
    Dim iDIV As Long, iSPN As Long, nr As Long
    Dim postURL As String

    postURL = "http://foodffs.tumblr.com/archive/2015/11"

    Set xmlHTTPReq = New MSXML2.XMLHTTP
    With xmlHTTPReq
        .Open "GET", postURL, False    'False = synchronous request
        .Send
    End With

    Set htmlDOC = New HTMLDocument
    htmlDOC.body.innerHTML = xmlHTTPReq.responseText
    Set divSUBDOC = New HTMLDocument

    ' Fetch the collection once instead of re-querying the document on every
    ' loop test and every element access.
    Set postTiles = htmlDOC.getElementsByClassName("post_glass post_micro_glass")

    ' First free row below the existing data in column C. Every cell access is
    ' qualified with Sheet1 so the row lookup and the writes hit the same sheet.
    With Sheet1
        nr = .Cells(.Rows.Count, 3).End(xlUp).Offset(1, 0).Row
    End With

    For iDIV = 0 To postTiles.Length - 1
        Set tile = postTiles.Item(iDIV)

        'method 1 - run through multiples in a collection
        Set tileSpans = tile.getElementsByTagName("span")
        For iSPN = 0 To tileSpans.Length - 1
            With tileSpans.Item(iSPN)
                Select Case LCase$(.className)
                    Case "post_date"
                        Sheet1.Cells(nr, 3).Value = .innerText
                    Case "post_notes"
                        Sheet1.Cells(nr, 4).Value = .innerText
                    Case Else
                        'do nothing
                End Select
            End With
        Next iSPN

        'method 2 - create a sub-HTML doc to facilitate getting els by classname
        divSUBDOC.body.innerHTML = tile.outerHTML   'only the HTML from this DIV
        Set hoverEls = divSUBDOC.getElementsByClassName("hover_inner")
        If hoverEls.Length > 0 Then                 'there is at least 1; use the first
            Sheet1.Cells(nr, 5).Value = hoverEls.Item(0).innerText
        End If

        ' One row per tile, even when a tile has no post_date span; otherwise
        ' the next tile would overwrite this row.
        nr = nr + 1
    Next iDIV
End Sub
虽然其他 .getElementsByXXXX 方法可以很容易地取得另一个元素内部的集合,但 getElementsByClassName 方法只能作用于它所认为的“整个 HTMLDocument”——哪怕这个“整个文档”其实只是您用来“骗”它的子文档。
这是另一种方法。它与原始代码非常相似,但使用 querySelectorAll 来选取相关的 span 元素。此方法的一个要点是:必须将 vr 声明为特定的元素类型,而不是 IHTMLElement 或通用的 Object:
Option Explicit
' Downloads the Tumblr archive page and writes, for each post tile, the post
' date to column C and the hover text to column D, one row per tile starting
' at row 1. Cells is unqualified, so output goes to whichever sheet that
' resolves to (the active sheet when this lives in a standard module).
'
' Requires references to "Microsoft XML, v6.0" and the
' "Microsoft HTML Object Library".
Public Sub XMLhtmlDocumentHTMLSourceScraper()
    ' Changed from generic Object to specific type - not
    ' strictly necessary to do this
    Dim XMLHTTPReq As MSXML2.XMLHTTP60
    Dim htmlDoc As HTMLDocument

    ' These declarations weren't included in the original code.
    ' i is a Long rather than an Integer: it is used as a worksheet row index.
    Dim i As Long
    Dim varTemp As Object

    ' IMPORTANT: vr must be declared as a specific element type and not
    ' as an IHTMLElement or generic Object
    Dim vr As HTMLDivElement
    Dim varTemp2 As Object
    Dim postURL As String

    postURL = "http://foodffs.tumblr.com/archive/2015/11"

    ' Changed from XMLHTTP to XMLHTTP60 as XMLHTTP is equivalent
    ' to the older XMLHTTP30
    Set XMLHTTPReq = New MSXML2.XMLHTTP60
    With XMLHTTPReq
        .Open "GET", postURL, False    'False = synchronous request
        .Send
    End With

    Set htmlDoc = New HTMLDocument
    htmlDoc.body.innerHTML = XMLHTTPReq.responseText

    i = 0
    Set varTemp = htmlDoc.getElementsByClassName("post_glass post_micro_glass")

    For Each vr In varTemp
        ' *1 - diagnostic carried over from the question: dumps the current
        ' tile's markup into A1 (overwritten on every pass).
        Cells(1, 1) = vr.outerHTML

        ' Guard each lookup: a tile without the element would otherwise raise
        ' a run-time error on .Item(0).innerText and abort the whole scrape.
        Set varTemp2 = vr.querySelectorAll("span.post_date")
        If varTemp2.Length > 0 Then
            Cells(i + 1, 3) = varTemp2.Item(0).innerText
        End If

        ' incorporating correction from Jeeped's comment (#56349646):
        ' read the first item of the collection, not the collection itself
        Set varTemp2 = vr.getElementsByClassName("hover_inner")
        If varTemp2.Length > 0 Then
            Cells(i + 1, 4) = varTemp2.Item(0).innerText
        End If

        i = i + 1
    Next vr
End Sub
备注:
- 如 here 所述,XMLHTTP 等同于 XMLHTTP30
- 关于为何需要声明为特定的元素类型,this question 中已有探讨;不过,与 getElementsByClassName 不同,querySelectorAll 在任何版本的 IHTMLElement 接口中都不存在
我用 vba 抓取一些网站是为了好玩,我使用 VBA 作为工具。我使用 XMLHTTP 和 HTMLDocument(因为它比 internetExplorer.Application 更快)。
' Question listing, reproduced as posted. It issues a synchronous GET for
' postURL, loads the response text into an HTMLDocument, then loops over the
' elements returned by getElementsByClassName("post_glass post_micro_glass")
' and writes values from each one to the worksheet.
' The author reports run-time error 438 at the line marked below.
Public Sub XMLhtmlDocumentHTMLSourceScraper()
Dim XMLHTTPReq As Object
Dim htmlDoc As HTMLDocument
Dim postURL As String
' NOTE: i, varTemp, vr and varTemp2 are used below but never declared in this listing.
postURL = "http://foodffs.tumblr.com/archive/2015/11"
Set XMLHTTPReq = New MSXML2.XMLHTTP
With XMLHTTPReq
' Third argument False requests a synchronous call, so .Send blocks until the response arrives.
.Open "GET", postURL, False
.Send
End With
Set htmlDoc = New HTMLDocument
With htmlDoc
' Parse the downloaded markup by assigning it to the document body.
.body.innerHTML = XMLHTTPReq.responseText
End With
' i is the zero-based index of the current post; output goes to row i + 1.
i = 0
Set varTemp = htmlDoc.getElementsByClassName("post_glass post_micro_glass")
For Each vr In varTemp
' *1 - diagnostic: dump this element's markup into cell A1 (overwritten on every pass).
Cells(1, 1) = vr.outerHTML
' NOTE(review): the argument is an attribute-style string rather than a bare tag
' name such as "span" - presumably it matches nothing; confirm.
Set varTemp2 = vr.getElementsByTagName("SPAN class=post_date")
Cells(i + 1, 3) = varTemp2.Item(0).innerText
' The author reports that the next line raises run-time error 438.
Set varTemp2 = vr.getElementsByClassName("hover_inner")
' NOTE(review): no item is selected from varTemp2 here, unlike the .Item(0) call above.
Cells(i + 1, 4) = varTemp2.innerText
i = i + 1
Next vr
End Sub
我是借助 *1 那一行来排查这个问题的,Cells(1, 1) 中显示的内容如下:
<DIV class="post_glass post_micro_glass" title=""><A class=hover title="" href="http://foodffs.tumblr.com/post/134291668251/sugar-free-low-carb-coffee-ricotta-mousse-really" target=_blank>
<DIV class=hover_inner><SPAN class=post_date>...............
是的,所有 class 属性值的双引号都丢失了,只有第一个(最外层 DIV 的)class 还带着双引号。真不知道为什么会出现这种情况。
//我可以通过 getElementsByTagName("span") 进行语法分析。但我更喜欢"class"标签.....
getElementsByClassName 并不是元素自身可用的方法,只有父级 HTMLDocument 才提供它。如果您想用它来定位某个 DIV 元素内部的元素,就需要用该 DIV 元素的 .outerHTML 创建一个子 HTMLDocument。
' Downloads the Tumblr archive page and appends one row per post tile to
' Sheet1: column C = post date, column D = post notes, column E = hover text.
'
' Requires references to the Microsoft XML library (MSXML2) and the
' Microsoft HTML Object Library (MSHTML).
Public Sub XMLhtmlDocumentHTMLSourceScraper()
    ' Explicit Set ... New is used instead of "Dim ... As New" so the objects
    ' are created once, at a known point, rather than on first touch.
    Dim xmlHTTPReq As MSXML2.XMLHTTP
    Dim htmlDOC As HTMLDocument, divSUBDOC As HTMLDocument
    Dim postTiles As Object, tile As Object
    Dim tileSpans As Object, hoverEls As Object
    Dim iDIV As Long, iSPN As Long, nr As Long
    Dim postURL As String

    postURL = "http://foodffs.tumblr.com/archive/2015/11"

    Set xmlHTTPReq = New MSXML2.XMLHTTP
    With xmlHTTPReq
        .Open "GET", postURL, False    'False = synchronous request
        .Send
    End With

    Set htmlDOC = New HTMLDocument
    htmlDOC.body.innerHTML = xmlHTTPReq.responseText
    Set divSUBDOC = New HTMLDocument

    ' Fetch the collection once instead of re-querying the document on every
    ' loop test and every element access.
    Set postTiles = htmlDOC.getElementsByClassName("post_glass post_micro_glass")

    ' First free row below the existing data in column C. Every cell access is
    ' qualified with Sheet1 so the row lookup and the writes hit the same sheet.
    With Sheet1
        nr = .Cells(.Rows.Count, 3).End(xlUp).Offset(1, 0).Row
    End With

    For iDIV = 0 To postTiles.Length - 1
        Set tile = postTiles.Item(iDIV)

        'method 1 - run through multiples in a collection
        Set tileSpans = tile.getElementsByTagName("span")
        For iSPN = 0 To tileSpans.Length - 1
            With tileSpans.Item(iSPN)
                Select Case LCase$(.className)
                    Case "post_date"
                        Sheet1.Cells(nr, 3).Value = .innerText
                    Case "post_notes"
                        Sheet1.Cells(nr, 4).Value = .innerText
                    Case Else
                        'do nothing
                End Select
            End With
        Next iSPN

        'method 2 - create a sub-HTML doc to facilitate getting els by classname
        divSUBDOC.body.innerHTML = tile.outerHTML   'only the HTML from this DIV
        Set hoverEls = divSUBDOC.getElementsByClassName("hover_inner")
        If hoverEls.Length > 0 Then                 'there is at least 1; use the first
            Sheet1.Cells(nr, 5).Value = hoverEls.Item(0).innerText
        End If

        ' One row per tile, even when a tile has no post_date span; otherwise
        ' the next tile would overwrite this row.
        nr = nr + 1
    Next iDIV
End Sub
虽然其他 .getElementsByXXXX 方法可以很容易地取得另一个元素内部的集合,但 getElementsByClassName 方法只能作用于它所认为的“整个 HTMLDocument”——哪怕这个“整个文档”其实只是您用来“骗”它的子文档。
这是另一种方法。它与原始代码非常相似,但使用 querySelectorAll 来选取相关的 span 元素。此方法的一个要点是:必须将 vr 声明为特定的元素类型,而不是 IHTMLElement 或通用的 Object:
Option Explicit
' Downloads the Tumblr archive page and writes, for each post tile, the post
' date to column C and the hover text to column D, one row per tile starting
' at row 1. Cells is unqualified, so output goes to whichever sheet that
' resolves to (the active sheet when this lives in a standard module).
'
' Requires references to "Microsoft XML, v6.0" and the
' "Microsoft HTML Object Library".
Public Sub XMLhtmlDocumentHTMLSourceScraper()
    ' Changed from generic Object to specific type - not
    ' strictly necessary to do this
    Dim XMLHTTPReq As MSXML2.XMLHTTP60
    Dim htmlDoc As HTMLDocument

    ' These declarations weren't included in the original code.
    ' i is a Long rather than an Integer: it is used as a worksheet row index.
    Dim i As Long
    Dim varTemp As Object

    ' IMPORTANT: vr must be declared as a specific element type and not
    ' as an IHTMLElement or generic Object
    Dim vr As HTMLDivElement
    Dim varTemp2 As Object
    Dim postURL As String

    postURL = "http://foodffs.tumblr.com/archive/2015/11"

    ' Changed from XMLHTTP to XMLHTTP60 as XMLHTTP is equivalent
    ' to the older XMLHTTP30
    Set XMLHTTPReq = New MSXML2.XMLHTTP60
    With XMLHTTPReq
        .Open "GET", postURL, False    'False = synchronous request
        .Send
    End With

    Set htmlDoc = New HTMLDocument
    htmlDoc.body.innerHTML = XMLHTTPReq.responseText

    i = 0
    Set varTemp = htmlDoc.getElementsByClassName("post_glass post_micro_glass")

    For Each vr In varTemp
        ' *1 - diagnostic carried over from the question: dumps the current
        ' tile's markup into A1 (overwritten on every pass).
        Cells(1, 1) = vr.outerHTML

        ' Guard each lookup: a tile without the element would otherwise raise
        ' a run-time error on .Item(0).innerText and abort the whole scrape.
        Set varTemp2 = vr.querySelectorAll("span.post_date")
        If varTemp2.Length > 0 Then
            Cells(i + 1, 3) = varTemp2.Item(0).innerText
        End If

        ' incorporating correction from Jeeped's comment (#56349646):
        ' read the first item of the collection, not the collection itself
        Set varTemp2 = vr.getElementsByClassName("hover_inner")
        If varTemp2.Length > 0 Then
            Cells(i + 1, 4) = varTemp2.Item(0).innerText
        End If

        i = i + 1
    Next vr
End Sub
备注:
- 如 here 所述,XMLHTTP 等同于 XMLHTTP30
- 关于为何需要声明为特定的元素类型,this question 中已有探讨;不过,与 getElementsByClassName 不同,querySelectorAll 在任何版本的 IHTMLElement 接口中都不存在