Excel VBA 网络抓取 - getElementsbyTagName.Item 数字不是常数

Excel VBA Web Scrape - getElementsbyTagName.Item Number isn't Constant

我正在尝试使用下面的 VBA 代码从网上抓取县、人口和房屋价值中值。这行得通,但我遇到了以下问题:项目编号(在下面的块引号中)不同,这意味着对于某些邮政编码,我检索到不正确的数据点,请参见下面的电子表格图像。

我研究了可变项目编号,但唯一接近的情况是检索 collection,然后通过 class 名称挑选元素。但是,我正在使用的 HTML 没有 class 名称,请参见下文:

是否可以先找到字符串"County: "然后检索TagName的innerText td?我也考虑过返回整个 table,但数据托管在两个不同的 table 中。如您所见,我有一些想法,但似乎找不到一个很好的例子,所以我有点迷失在它的代码方面。

Sub ZipCodeScrape()

Set ZipCodeRange = Range("C2", Range("C2").End(xlDown))

Dim IE As Object
Set IE = New InternetExplorer

Dim url As String
url = "https://www.unitedstateszipcodes.org/"

Dim County As String
Dim Population As String
Dim MedianHomeVal As String
Dim HTMLdoc As HTMLDocument

For Each cell In ZipCodeRange

    IE.navigate (url & cell.Value)

    'Allows IE to load
    While IE.readyState <> 4
        DoEvents
    Wend

    Set HTMLdoc = IE.document

    County = HTMLdoc.getElementsByTagName("td").Item(2).innerText  
    Population = HTMLdoc.getElementsByTagName("td").Item(6).innerText  
    MedianHomeVal = HTMLdoc.getElementsByTagName("td").Item(12).innerText

    cell.Offset(0, 1) = County
    cell.Offset(0, 2) = Population
    cell.Offset(0, 3) = MedianHomeVal

Next cell

End Sub

也许遍历 <th> 个元素,然后得到 next sibling 节点,它应该是所需的 td 节点?我没有测试过这个。

For each header in HTMLdoc.getElementsByTagName("th")
    If header.InnerText = "County: " then
        county = header.NextSibling.InnerText
        Exit For
    End If
Next

试试这个。它将为您获取预期的输出(县、人口和房屋价值中值)。我已经设置了几个从您上传的图片中获取的搜索选项。顺便说一句,应该使用邮政编码进行搜索,否则 populationmedian home value 不会出现在该网页中。

Sub ZipCodeScrape()
    Dim IE As New InternetExplorer, html As HTMLDocument
    Dim search_input As Variant, posts As Object, post As Object, elem As Object

    With IE
        .Visible = True
        .navigate "https://www.unitedstateszipcodes.org/"
        Do Until .readyState = READYSTATE_COMPLETE: Loop
        Set html = .document
    End With

    Application.Wait Now + TimeValue("00:00:03")

    For Each search_input In [{"32937","33056","33312","33844","34698"}]

        html.getElementById("q").Value = search_input
        html.getElementsByClassName("btn btn-danger")(0).Click
        Application.Wait Now + TimeValue("00:00:05")

        For Each posts In html.getElementsByTagName("th")
            If InStr(posts.innerText, "County:") > 0 Then Row = Row + 1: Cells(Row, 1) = posts.NextSibling.innerText: Exit For
        Next posts
        For Each post In html.getElementsByTagName("th")
            If InStr(post.innerText, "Population") > 0 Then Cells(Row, 2) = post.ParentNode.getElementsByTagName("td")(0).innerText: Exit For
        Next post
        For Each elem In html.getElementsByTagName("th")
            If InStr(elem.innerText, "Median Home Value") > 0 Then Cells(Row, 3) = elem.ParentNode.getElementsByTagName("td")(0).innerText: Exit For
        Next elem
    Next search_input
    IE.Quit
End Sub