使用 VBA 抓取

Scraping using VBA

我正在尝试从政府网站上提取一个数字,我已经进行了大量的谷歌搜索,但我有点迷失了想法,我的代码在 returns 下是一个数字,但这不是我想要的数字得到,我不完全确定为什么。

我想从 'Cases by Area (Whole Pandemic)' table 'Upper tier LA' 部分和 'Southend on Sea' 案例编号中减去数字。

https://coronavirus.data.gov.uk/details/cases

我从网上某个地方偷了这段代码,并试图用我在网站 F12 部分找到的 class 号码进行复制。

Sub ExtractLastValue()

Set objIE = CreateObject("InternetExplorer.Application")

objIE.Top = 0
objIE.Left = 0
objIE.Width = 800
objIE.Height = 600

objIE.Visible = True

objIE.Navigate ("https://coronavirus.data.gov.uk/details/cases")

Do
DoEvents
Loop Until objIE.readystate = 4

MsgBox objIE.document.getElementsByClassName("sc-bYEvPH khGBIg govuk-table__cell govuk-table__cell--numeric ")(0).innerText

Set objIE = Nothing

End Sub


数据来自官方 API 和 returns 当您单击上层面板时该页面上的 json 动态响应。


查看并使用 API 指南 这里:

https://coronavirus.data.gov.uk/details/developers-guide


您可以按照 API 文档中的指导发出直接的 xhr 请求,然后使用 json 解析器来处理响应。对于您的请求,它将类似于以下内容:

https://coronavirus.data.gov.uk/api/v1/data?filters=areaName=Southend-on-Sea&areaType=utla&latestBy=cumCasesByPublishDate&structure=
{"date":"date", "areaName":"areaName","cumCasesByPublishDate":"cumCasesByPublishDate",
"cumCasesByPublishDateRate":"cumCasesByPublishDateRate"}

XHR:

使用 jsonconverter.bas 作为 json 解析器的工作示例

Option Explicit

Public Sub GetCovidNumbers()
    Dim http As Object, json As Object

    Set http = CreateObject("MSXML2.XMLHTTP")

    With http
        .Open "GET", "https://coronavirus.data.gov.uk/api/v1/data?filters=areaName=Southend-on-Sea&areaType=utla&latestBy=cumCasesByPublishDate&structure={""date"":""date"",""areaName"":""areaName"",""cumCasesByPublishDate"":""cumCasesByPublishDate"",""cumCasesByPublishDateRate"":""cumCasesByPublishDateRate""}", False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        Set json = JsonConverter.ParseJson(.responseText)("data")(1)
    End With
    With ActiveSheet
        Dim arr()
        arr = json.Keys
        .Cells(1, 1).Resize(1, UBound(arr) + 1) = arr
        arr = json.Items
        .Cells(2, 1).Resize(1, UBound(arr) + 1) = arr
    End With
End Sub

Json 库(在上述解决方案中使用):

我用jsonconverter.bas。从 here 下载原始代码并添加到名为 JsonConverter 的标准模块。然后您需要转到 VBE > 工具 > 参考 > 添加对 Microsoft 脚本运行时的参考。从复制的代码中删除顶部的属性行。


Internet Explorer:

你可以做一个更慢、更复杂的 Internet Explorer 解决方案,你需要 select utla 选项,然后 select 来自 table 所需的值:

Option Explicit

Public Sub GetCovidNumbers()
    'Tools references Microsoft Internet Controls and Microsoft HTML Object Library
    
    Dim ie As SHDocVw.InternetExplorer, t As Date, ele As Object
    Const MAX_WAIT_SEC As Long = 10
    
    Set ie = New SHDocVw.InternetExplorer
    
    With ie
        .Visible = True
        .Navigate2 "https://coronavirus.data.gov.uk/details/cases"
        While .Busy Or .ReadyState <> READYSTATE_COMPLETE: DoEvents: Wend
        
        t = Timer 'timed loop for element to be present to click on (to get utla)
        Do
            On Error Resume Next
            Set ele = .Document.querySelector("#card-cases_by_area_whole_pandemic [aria-label='Upper tier LA']")
            On Error GoTo 0
            If Timer - t > MAX_WAIT_SEC Then Exit Do
        Loop While ele Is Nothing

        If ele Is Nothing Then Exit Sub
        
        ele.Click
        
        While .Busy Or .ReadyState <> READYSTATE_COMPLETE: DoEvents: Wend
        
        Dim table As MSHTML.HTMLTable, datetime As String, result()
        
        Set table = .Document.querySelector("table[download='cumCasesByPublishDate,cumCasesByPublishDateRate']")
        datetime = .Document.querySelector("time").getAttribute("datetime")
        result = GetDataForUtla("Southend-on-Sea", datetime, table)
        
        With ActiveSheet
            .Cells(1, 1).Resize(1, 4) = Array("Datetime", "Area", "Cases", "Rate per 100,000 population")
            .Cells(2, 1).Resize(1, UBound(result) + 1) = result
        End With
        .Quit
    End With
    
End Sub

Public Function GetDataForUtla(ByVal utla As String, ByVal datetime As String, ByVal table As MSHTML.HTMLTable) As Variant
    Dim row As MSHTML.HTMLTableRow, i As Long

    For Each row In table.Rows
        
        If InStr(row.outerHTML, utla) > 0 Then
            Dim arr(4)
            arr(0) = datetime
            For i = 0 To 2
                arr(i + 1) = row.Children(i).innerText
            Next
            GetDataForUtla = arr
            Exit Function
        End If
    Next
    GetDataForUtla = Array("Not found")
End Function

参考文献:

  1. https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors
  2. https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector