如何使用 VBA 和选择器使本网站的内容达到 Excel?

How to get the content of this website to Excel with VBA and selectors?

我的网站问题:

尽管数据会定期更改,但数据的结构始终保持不变。我尝试将内容(仅带有 headers 的最后两列:AktenzeichenAufgehoben)传输到 excel 3 列(ID-Number、日期、时间) 通过按日期和时间拆分 Aufgehoben 的值。

我的问题是“Bundesland”和“Amtsgericht”列中的值(即使我不需要那些)与其他列的出现频率不同数据并弄乱 html-structure 中的所有 trs 和 tds,所以我不明白如何使用选择器!有任何想法吗?谢谢。

我的...嗯...代码:

Sub GetData()

    Const URL = "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML"
    Dim html As New HTMLDocument
    Dim elmt As Object
    Dim x As long

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .send
        html.body.innerHTML = .responseText
    End With
                   
     For x = 0 to ????.Length - 1
     Set elmt = html.querySelectorAll("???")
       ActiveSheet.Cells(y + 2, 2) = elmt.Item(?).innerText  'Aktenzeichen
       ActiveSheet.Cells(y + 2, 3) = elmt.Item(?).innerText  'Date
       ActiveSheet.Cells(y + 2, 4) = elmt.Item(?).innerText  'Time
     Next

End Sub

我可以向您保证,可能有比这更好的答案,但以下代码有效:

Sub getStuff()

' Declare variables
Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim Table, Row, Data, Point As Variant
Dim i, x, j As Integer

' Make Request
With XMLPage
    .Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
    .send
    HTMLDoc.body.innerHTML = .responseText
End With

' Set counters
i = 1
x = 0
j = 1

' Parse data into worksheet
For Each Table In HTMLDoc.getElementsByTagName("tr")
    For Each Row In Table.getElementsByTagName("tr")
        For Each Data In Row.getElementsByTagName("td")
            ' Parse headers in first run
            If i = 1 Then
                Cells(i, j).Value = Data.innerText
            Else
                x = i
                ' Split the data points
                For Each Point In Split(Data.innerText, Chr(13))
                    Cells(i, j).Value = Point
                    i = i + 1
                Next Point
                If j <> 3 Then
                    i = x
                End If
            End If
            j = j + 1
        Next Data
    i = i + 1
    j = 1
    Next Row
Next Table

' Remove empty rows
Columns("C:C").Select
Selection.SpecialCells(xlCellTypeBlanks).Select
Selection.EntireRow.Delete

End Sub

使用数组,仅在末尾触摸 sheet 以减少 I/O,转换为使用 css 选择器并在这些选择器中进行过滤,使用类型化函数并减少代码复杂度。


结果:

超过 10,000 次运行的差异(单个请求、多个解析和将数据放入 sheet)。

在 10,000 次运行中进行这些更改的中位数差异快 5.4 倍。

与上述优化没有其他区别。 ScreenUpdating 两者均已关闭。


VBA:

Option Explicit

Public r As Long

Public Sub GetContents()
    
    Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
    
    Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
    
    With http
        .Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
        .send
        html.body.innerHTML = .responseText
    End With
    
    Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
    
    Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
    Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
    
    Dim headers() As Variant, results() As Variant
    
    headers = Array("ID-Number", "Date", "Time")
    ReDim results(1 To 1000, 1 To UBound(headers) + 1)
    
    With colOne
        
        For i = 0 To colOne.Length - 1
    
            UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText

        Next
        
    End With

    results = Application.Transpose(results)
    ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
    results = Application.Transpose(results)
    
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
End Sub

Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String)
    Dim arrCol1() As String, arrCol2And3() As String
    Dim i As Long, datetime() As String
    
    arrCol1 = Split(col1, Chr$(10))
    arrCol2And3 = Split(col2And3, vbCrLf)
    
    For i = LBound(arrCol1) To UBound(arrCol1)
        r = r + 1
        results(r, 1) = Trim$(arrCol1(i))
        datetime = Split(arrCol2And3(i), Chr$(32))
        results(r, 2) = datetime(0): results(r, 3) = datetime(1)
    Next
End Sub

将 r 作为参数传递给签名(而不是 Public):

Option Explicit

Public Sub GetContents()
    
    Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
    
    Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
    
    With http
        .Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
        .send
        html.body.innerHTML = .responseText
    End With
    
    Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
    
    'Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
    Set colOne = html.querySelectorAll("td td + td[style*='150']")
    'Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
    Set colTwoAndThree = html.querySelectorAll("td td + td[style*='150'] + td")
    
    Dim headers() As Variant, results() As Variant
    
    headers = Array("ID-Number", "Date", "Time")
    ReDim results(1 To 1000, 1 To UBound(headers) + 1)
    
    With colOne
        
        For i = 0 To colOne.Length - 1
    
            UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText, r

        Next
        
    End With

    results = Application.Transpose(results)
    ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
    results = Application.Transpose(results)
    
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
End Sub

Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String, ByRef r As Long)
    Dim arrCol1() As String, arrCol2And3() As String
    Dim i As Long, datetime() As String
    
    arrCol1 = Split(col1, Chr$(10))
    arrCol2And3 = Split(col2And3, vbCrLf)
    
    For i = LBound(arrCol1) To UBound(arrCol1)
        r = r + 1
        results(r, 1) = Trim$(arrCol1(i))
        datetime = Split(arrCol2And3(i), Chr$(32))
        results(r, 2) = datetime(0): results(r, 3) = datetime(1)
    Next
End Sub