Excel VBA 网络抓取 - getElementsbyTagName.Item 数字不是常数
Excel VBA Web Scrape - getElementsbyTagName.Item Number isn't Constant
我正在尝试使用下面的 VBA 代码从网上抓取县、人口和房屋价值中值。这行得通,但我遇到了以下问题:项目编号(在下面的块引号中)不同,这意味着对于某些邮政编码,我检索到不正确的数据点,请参见下面的电子表格图像。
我研究了可变项目编号,但唯一接近的情况是检索 collection,然后通过 class 名称挑选元素。但是,我正在使用的 HTML 没有 class 名称,请参见下文:
是否可以先找到字符串"County: "然后检索TagName的innerText td?我也考虑过返回整个 table,但数据托管在两个不同的 table 中。如您所见,我有一些想法,但似乎找不到一个很好的例子,所以我有点迷失在它的代码方面。
Sub ZipCodeScrape()
Set ZipCodeRange = Range("C2", Range("C2").End(xlDown))
Dim IE As Object
Set IE = New InternetExplorer
Dim url As String
url = "https://www.unitedstateszipcodes.org/"
Dim County As String
Dim Population As String
Dim MedianHomeVal As String
Dim HTMLdoc As HTMLDocument
For Each cell In ZipCodeRange
IE.navigate (url & cell.Value)
'Allows IE to load
While IE.readyState <> 4
DoEvents
Wend
Set HTMLdoc = IE.document
County = HTMLdoc.getElementsByTagName("td").Item(2).innerText
Population = HTMLdoc.getElementsByTagName("td").Item(6).innerText
MedianHomeVal = HTMLdoc.getElementsByTagName("td").Item(12).innerText
cell.Offset(0, 1) = County
cell.Offset(0, 2) = Population
cell.Offset(0, 3) = MedianHomeVal
Next cell
End Sub
也许遍历 <th>
个元素,然后得到 next sibling 节点,它应该是所需的 td 节点?我没有测试过这个。
For each header in HTMLdoc.getElementsByTagName("th")
If header.InnerText = "County: " then
county = header.NextSibling.InnerText
Exit For
End If
Next
试试这个。它将为您获取预期的输出(县、人口和房屋价值中值)。我已经设置了几个从您上传的图片中获取的搜索选项。顺便说一句,应该使用邮政编码进行搜索,否则 population
和 median home value
不会出现在该网页中。
Sub ZipCodeScrape()
Dim IE As New InternetExplorer, html As HTMLDocument
Dim search_input As Variant, posts As Object, post As Object, elem As Object
With IE
.Visible = True
.navigate "https://www.unitedstateszipcodes.org/"
Do Until .readyState = READYSTATE_COMPLETE: Loop
Set html = .document
End With
Application.Wait Now + TimeValue("00:00:03")
For Each search_input In [{"32937","33056","33312","33844","34698"}]
html.getElementById("q").Value = search_input
html.getElementsByClassName("btn btn-danger")(0).Click
Application.Wait Now + TimeValue("00:00:05")
For Each posts In html.getElementsByTagName("th")
If InStr(posts.innerText, "County:") > 0 Then Row = Row + 1: Cells(Row, 1) = posts.NextSibling.innerText: Exit For
Next posts
For Each post In html.getElementsByTagName("th")
If InStr(post.innerText, "Population") > 0 Then Cells(Row, 2) = post.ParentNode.getElementsByTagName("td")(0).innerText: Exit For
Next post
For Each elem In html.getElementsByTagName("th")
If InStr(elem.innerText, "Median Home Value") > 0 Then Cells(Row, 3) = elem.ParentNode.getElementsByTagName("td")(0).innerText: Exit For
Next elem
Next search_input
IE.Quit
End Sub
我正在尝试使用下面的 VBA 代码从网上抓取县、人口和房屋价值中值。这行得通,但我遇到了以下问题:项目编号(在下面的块引号中)不同,这意味着对于某些邮政编码,我检索到不正确的数据点,请参见下面的电子表格图像。
我研究了可变项目编号,但唯一接近的情况是检索 collection,然后通过 class 名称挑选元素。但是,我正在使用的 HTML 没有 class 名称,请参见下文:
是否可以先找到字符串"County: "然后检索TagName的innerText td?我也考虑过返回整个 table,但数据托管在两个不同的 table 中。如您所见,我有一些想法,但似乎找不到一个很好的例子,所以我有点迷失在它的代码方面。
Sub ZipCodeScrape()
Set ZipCodeRange = Range("C2", Range("C2").End(xlDown))
Dim IE As Object
Set IE = New InternetExplorer
Dim url As String
url = "https://www.unitedstateszipcodes.org/"
Dim County As String
Dim Population As String
Dim MedianHomeVal As String
Dim HTMLdoc As HTMLDocument
For Each cell In ZipCodeRange
IE.navigate (url & cell.Value)
'Allows IE to load
While IE.readyState <> 4
DoEvents
Wend
Set HTMLdoc = IE.document
County = HTMLdoc.getElementsByTagName("td").Item(2).innerText
Population = HTMLdoc.getElementsByTagName("td").Item(6).innerText
MedianHomeVal = HTMLdoc.getElementsByTagName("td").Item(12).innerText
cell.Offset(0, 1) = County
cell.Offset(0, 2) = Population
cell.Offset(0, 3) = MedianHomeVal
Next cell
End Sub
也许遍历 <th>
个元素,然后得到 next sibling 节点,它应该是所需的 td 节点?我没有测试过这个。
For each header in HTMLdoc.getElementsByTagName("th")
If header.InnerText = "County: " then
county = header.NextSibling.InnerText
Exit For
End If
Next
试试这个。它将为您获取预期的输出(县、人口和房屋价值中值)。我已经设置了几个从您上传的图片中获取的搜索选项。顺便说一句,应该使用邮政编码进行搜索,否则 population
和 median home value
不会出现在该网页中。
Sub ZipCodeScrape()
Dim IE As New InternetExplorer, html As HTMLDocument
Dim search_input As Variant, posts As Object, post As Object, elem As Object
With IE
.Visible = True
.navigate "https://www.unitedstateszipcodes.org/"
Do Until .readyState = READYSTATE_COMPLETE: Loop
Set html = .document
End With
Application.Wait Now + TimeValue("00:00:03")
For Each search_input In [{"32937","33056","33312","33844","34698"}]
html.getElementById("q").Value = search_input
html.getElementsByClassName("btn btn-danger")(0).Click
Application.Wait Now + TimeValue("00:00:05")
For Each posts In html.getElementsByTagName("th")
If InStr(posts.innerText, "County:") > 0 Then Row = Row + 1: Cells(Row, 1) = posts.NextSibling.innerText: Exit For
Next posts
For Each post In html.getElementsByTagName("th")
If InStr(post.innerText, "Population") > 0 Then Cells(Row, 2) = post.ParentNode.getElementsByTagName("td")(0).innerText: Exit For
Next post
For Each elem In html.getElementsByTagName("th")
If InStr(elem.innerText, "Median Home Value") > 0 Then Cells(Row, 3) = elem.ParentNode.getElementsByTagName("td")(0).innerText: Exit For
Next elem
Next search_input
IE.Quit
End Sub