如何使用 VBA 和选择器使本网站的内容达到 Excel?
How to get the content of this website to Excel with VBA and selectors?
我的网站问题:
尽管数据会定期更改,但数据的结构始终保持不变。我尝试将内容(仅带有 headers 的最后两列:Aktenzeichen 和 Aufgehoben)传输到 excel 3 列(ID-Number、日期、时间) 通过按日期和时间拆分 Aufgehoben 的值。
我的问题是“Bundesland”和“Amtsgericht”列中的值(即使我不需要那些)与其他列的出现频率不同数据并弄乱 html-structure 中的所有 trs 和 tds,所以我不明白如何使用选择器!有任何想法吗?谢谢。
我的...嗯...代码:
Sub GetData()
Const URL = "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML"
Dim html As New HTMLDocument
Dim elmt As Object
Dim x As long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.send
html.body.innerHTML = .responseText
End With
For x = 0 to ????.Length - 1
Set elmt = html.querySelectorAll("???")
ActiveSheet.Cells(y + 2, 2) = elmt.Item(?).innerText 'Aktenzeichen
ActiveSheet.Cells(y + 2, 3) = elmt.Item(?).innerText 'Date
ActiveSheet.Cells(y + 2, 4) = elmt.Item(?).innerText 'Time
Next
End Sub
我可以向您保证,可能有比这更好的答案,但以下代码有效:
Sub getStuff()
' Declare variables
Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim Table, Row, Data, Point As Variant
Dim i, x, j As Integer
' Make Request
With XMLPage
.Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
.send
HTMLDoc.body.innerHTML = .responseText
End With
' Set counters
i = 1
x = 0
j = 1
' Parse data into worksheet
For Each Table In HTMLDoc.getElementsByTagName("tr")
For Each Row In Table.getElementsByTagName("tr")
For Each Data In Row.getElementsByTagName("td")
' Parse headers in first run
If i = 1 Then
Cells(i, j).Value = Data.innerText
Else
x = i
' Split the data points
For Each Point In Split(Data.innerText, Chr(13))
Cells(i, j).Value = Point
i = i + 1
Next Point
If j <> 3 Then
i = x
End If
End If
j = j + 1
Next Data
i = i + 1
j = 1
Next Row
Next Table
' Remove empty rows
Columns("C:C").Select
Selection.SpecialCells(xlCellTypeBlanks).Select
Selection.EntireRow.Delete
End Sub
使用数组,仅在末尾触摸 sheet 以减少 I/O,转换为使用 css 选择器并在这些选择器中进行过滤,使用类型化函数并减少代码复杂度。
结果:
超过 10,000 次运行的差异(单个请求、多个解析和将数据放入 sheet)。
在 10,000 次运行中进行这些更改的中位数差异快 5.4 倍。
与上述优化没有其他区别。 ScreenUpdating
两者均已关闭。
VBA:
Option Explicit
Public r As Long
Public Sub GetContents()
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
With http
.Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
.send
html.body.innerHTML = .responseText
End With
Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
Dim headers() As Variant, results() As Variant
headers = Array("ID-Number", "Date", "Time")
ReDim results(1 To 1000, 1 To UBound(headers) + 1)
With colOne
For i = 0 To colOne.Length - 1
UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText
Next
End With
results = Application.Transpose(results)
ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
results = Application.Transpose(results)
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String)
Dim arrCol1() As String, arrCol2And3() As String
Dim i As Long, datetime() As String
arrCol1 = Split(col1, Chr$(10))
arrCol2And3 = Split(col2And3, vbCrLf)
For i = LBound(arrCol1) To UBound(arrCol1)
r = r + 1
results(r, 1) = Trim$(arrCol1(i))
datetime = Split(arrCol2And3(i), Chr$(32))
results(r, 2) = datetime(0): results(r, 3) = datetime(1)
Next
End Sub
将 r 作为参数传递给签名(而不是 Public):
Option Explicit
Public Sub GetContents()
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
With http
.Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
.send
html.body.innerHTML = .responseText
End With
Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
'Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
Set colOne = html.querySelectorAll("td td + td[style*='150']")
'Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
Set colTwoAndThree = html.querySelectorAll("td td + td[style*='150'] + td")
Dim headers() As Variant, results() As Variant
headers = Array("ID-Number", "Date", "Time")
ReDim results(1 To 1000, 1 To UBound(headers) + 1)
With colOne
For i = 0 To colOne.Length - 1
UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText, r
Next
End With
results = Application.Transpose(results)
ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
results = Application.Transpose(results)
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String, ByRef r As Long)
Dim arrCol1() As String, arrCol2And3() As String
Dim i As Long, datetime() As String
arrCol1 = Split(col1, Chr$(10))
arrCol2And3 = Split(col2And3, vbCrLf)
For i = LBound(arrCol1) To UBound(arrCol1)
r = r + 1
results(r, 1) = Trim$(arrCol1(i))
datetime = Split(arrCol2And3(i), Chr$(32))
results(r, 2) = datetime(0): results(r, 3) = datetime(1)
Next
End Sub
我的网站问题:
尽管数据会定期更改,但数据的结构始终保持不变。我尝试将内容(仅带有 headers 的最后两列:Aktenzeichen 和 Aufgehoben)传输到 excel 3 列(ID-Number、日期、时间) 通过按日期和时间拆分 Aufgehoben 的值。
我的问题是“Bundesland”和“Amtsgericht”列中的值(即使我不需要那些)与其他列的出现频率不同数据并弄乱 html-structure 中的所有 trs 和 tds,所以我不明白如何使用选择器!有任何想法吗?谢谢。
我的...嗯...代码:
Sub GetData()
Const URL = "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML"
Dim html As New HTMLDocument
Dim elmt As Object
Dim x As long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.send
html.body.innerHTML = .responseText
End With
For x = 0 to ????.Length - 1
Set elmt = html.querySelectorAll("???")
ActiveSheet.Cells(y + 2, 2) = elmt.Item(?).innerText 'Aktenzeichen
ActiveSheet.Cells(y + 2, 3) = elmt.Item(?).innerText 'Date
ActiveSheet.Cells(y + 2, 4) = elmt.Item(?).innerText 'Time
Next
End Sub
我可以向您保证,可能有比这更好的答案,但以下代码有效:
Sub getStuff()
' Declare variables
Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim Table, Row, Data, Point As Variant
Dim i, x, j As Integer
' Make Request
With XMLPage
.Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
.send
HTMLDoc.body.innerHTML = .responseText
End With
' Set counters
i = 1
x = 0
j = 1
' Parse data into worksheet
For Each Table In HTMLDoc.getElementsByTagName("tr")
For Each Row In Table.getElementsByTagName("tr")
For Each Data In Row.getElementsByTagName("td")
' Parse headers in first run
If i = 1 Then
Cells(i, j).Value = Data.innerText
Else
x = i
' Split the data points
For Each Point In Split(Data.innerText, Chr(13))
Cells(i, j).Value = Point
i = i + 1
Next Point
If j <> 3 Then
i = x
End If
End If
j = j + 1
Next Data
i = i + 1
j = 1
Next Row
Next Table
' Remove empty rows
Columns("C:C").Select
Selection.SpecialCells(xlCellTypeBlanks).Select
Selection.EntireRow.Delete
End Sub
使用数组,仅在末尾触摸 sheet 以减少 I/O,转换为使用 css 选择器并在这些选择器中进行过滤,使用类型化函数并减少代码复杂度。
结果:
超过 10,000 次运行的差异(单个请求、多个解析和将数据放入 sheet)。
在 10,000 次运行中进行这些更改的中位数差异快 5.4 倍。
与上述优化没有其他区别。 ScreenUpdating
两者均已关闭。
VBA:
Option Explicit
Public r As Long
Public Sub GetContents()
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
With http
.Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
.send
html.body.innerHTML = .responseText
End With
Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
Dim headers() As Variant, results() As Variant
headers = Array("ID-Number", "Date", "Time")
ReDim results(1 To 1000, 1 To UBound(headers) + 1)
With colOne
For i = 0 To colOne.Length - 1
UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText
Next
End With
results = Application.Transpose(results)
ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
results = Application.Transpose(results)
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String)
Dim arrCol1() As String, arrCol2And3() As String
Dim i As Long, datetime() As String
arrCol1 = Split(col1, Chr$(10))
arrCol2And3 = Split(col2And3, vbCrLf)
For i = LBound(arrCol1) To UBound(arrCol1)
r = r + 1
results(r, 1) = Trim$(arrCol1(i))
datetime = Split(arrCol2And3(i), Chr$(32))
results(r, 2) = datetime(0): results(r, 3) = datetime(1)
Next
End Sub
将 r 作为参数传递给签名(而不是 Public):
Option Explicit
Public Sub GetContents()
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
With http
.Open "GET", "https://www.zvg.com/appl/aufgehoben.prg?act=getHTML", False
.send
html.body.innerHTML = .responseText
End With
Dim colOne As MSHTML.IHTMLDOMChildrenCollection, colTwoAndThree As MSHTML.IHTMLDOMChildrenCollection, i As Long
'Set colOne = html.querySelectorAll("td + td > table td + td:nth-child(2)")
Set colOne = html.querySelectorAll("td td + td[style*='150']")
'Set colTwoAndThree = html.querySelectorAll("td + td > table td + td:nth-child(3)")
Set colTwoAndThree = html.querySelectorAll("td td + td[style*='150'] + td")
Dim headers() As Variant, results() As Variant
headers = Array("ID-Number", "Date", "Time")
ReDim results(1 To 1000, 1 To UBound(headers) + 1)
With colOne
For i = 0 To colOne.Length - 1
UpdateResults results, colOne.Item(i).innerText, colTwoAndThree.Item(i).innerText, r
Next
End With
results = Application.Transpose(results)
ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
results = Application.Transpose(results)
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Sub UpdateResults(ByRef results As Variant, ByVal col1 As String, ByVal col2And3 As String, ByRef r As Long)
Dim arrCol1() As String, arrCol2And3() As String
Dim i As Long, datetime() As String
arrCol1 = Split(col1, Chr$(10))
arrCol2And3 = Split(col2And3, vbCrLf)
For i = LBound(arrCol1) To UBound(arrCol1)
r = r + 1
results(r, 1) = Trim$(arrCol1(i))
datetime = Split(arrCol2And3(i), Chr$(32))
results(r, 2) = datetime(0): results(r, 3) = datetime(1)
Next
End Sub