Excel - VBA 网页抓取 - getElementsByTagName
Excel - VBA Web Scraping - getElementsByTagName
我是运行这段代码,它完美地展示了我想要获得的这些田径表演:
Sub WebScraping()
Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim Records As MSHTML.IHTMLElementCollection
Dim Record As MSHTML.IHTMLElement
Dim HTMLIms As MSHTML.IHTMLElementCollection
Dim HTMLIm As MSHTML.IHTMLElement
Dim URL As String
Dim RowNum As Integer: RowNum = 1
Dim NumPage As Integer
Sheets("Sheet1").Range("a1:z10000").ClearContents
For NumPage = 1 To 4
URL = "https://www.worldathletics.org/records/toplists/sprints/100-metres/outdoor/men/senior/2020?page=" & NumPage & ""
XMLPage.Open "Get", URL, False
XMLPage.setRequestHeader "Content-Type", "text/xml"
XMLPage.send
HTMLDoc.body.innerHTML = XMLPage.responseText
Set Records = HTMLDoc.getElementById("toplists").getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
For Each Record In Records
Set HTMLIms = Record.getElementsByTagName("td")
For Each HTMLIm In HTMLIms
Sheets("Sheet1").Cells(RowNum, 1).Value = HTMLIms.Item(0).innerText
Sheets("Sheet1").Cells(RowNum, 2).Value = HTMLIms.Item(1).innerText
Sheets("Sheet1").Cells(RowNum, 3).Value = HTMLIms.Item(2).innerText
Sheets("Sheet1").Cells(RowNum, 4).Value = HTMLIms.Item(3).innerText
Sheets("Sheet1").Cells(RowNum, 5).Value = HTMLIms.Item(4).innerText
Sheets("Sheet1").Cells(RowNum, 6).Value = HTMLIms.Item(5).innerText
Sheets("Sheet1").Cells(RowNum, 7).Value = HTMLIms.Item(6).innerText
Sheets("Sheet1").Cells(RowNum, 9).Value = HTMLIms.Item(8).innerText
Sheets("Sheet1").Cells(RowNum, 10).Value = HTMLIms.Item(9).innerText
Sheets("Sheet1").Cells(RowNum, 11).Value = HTMLIms.Item(10).innerText
Next HTMLIm
RowNum = RowNum + 1
Next Record
Next NumPage
End Sub
但是当我想插入一个代码来获取运动员的 id 时遇到了问题(这也可以单独使用):
'Athletes' codes
RowNum = 1
Set HTMLIms = HTMLDoc.getElementsByTagName("a")
For Each HTMLIm In HTMLIms
If Left(HTMLIm.getAttribute("href"), 24) = "about:/athletes/athlete=" Then
Sheets("Sheet1").Cells(RowNum, 12).Value = Right(HTMLIm.getAttribute("href"), Len(HTMLIm.getAttribute("href")) - _
(InStr(HTMLIm.getAttribute("href"), "=")))
RowNum = RowNum + 1
End If
Next HTMLIm
有人能帮我先把第二个代码插入进去吗?
提前致谢。
试试这个一次性包含 ID 号。为了让您在第 3 列中的不同数字旁边获得 +
或 -
符号,我使用了一个小技巧。您可以合并其余部分,因为我在此处粘贴了相关部分。
部分合并代码:
Sub WebScraping()
Const Url As String = "https://www.worldathletics.org/records/toplists/sprints/100-metres/outdoor/men/senior/2020?page="
Dim XMLPage As New XMLHTTP60, HTMLDoc As New HTMLDocument
Dim Record As Object, NumPage As Integer
Dim RowNum As Integer, Ws As Worksheet, I&
Set Ws = ThisWorkbook.Worksheets("Sheet1")
For NumPage = 1 To 2
XMLPage.Open "Get", Url & NumPage, False
XMLPage.setRequestHeader "Content-Type", "text/xml"
XMLPage.send
HTMLDoc.body.innerHTML = XMLPage.responseText
For Each Record In HTMLDoc.getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
RowNum = RowNum + 1: Ws.Cells(RowNum, 1).Value = Record.getElementsByTagName("td").Item(0).innerText
Ws.Cells(RowNum, 2).Value = Record.getElementsByTagName("td").Item(1).innerText
Ws.Cells(RowNum, 3).Value = "'" & Record.getElementsByTagName("td").Item(2).innerText
Ws.Cells(RowNum, 4).Value = Record.getElementsByTagName("td").Item(3).innerText
On Error Resume Next
Ws.Cells(RowNum, 5).Value = Split(Record.getElementsByTagName("td").Item(3).getElementsByTagName("a")(0).getAttribute("href"), "=")(1)
On Error GoTo 0
Next Record
Next NumPage
End Sub
我是运行这段代码,它完美地展示了我想要获得的这些田径表演:
Sub WebScraping()
Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim Records As MSHTML.IHTMLElementCollection
Dim Record As MSHTML.IHTMLElement
Dim HTMLIms As MSHTML.IHTMLElementCollection
Dim HTMLIm As MSHTML.IHTMLElement
Dim URL As String
Dim RowNum As Integer: RowNum = 1
Dim NumPage As Integer
Sheets("Sheet1").Range("a1:z10000").ClearContents
For NumPage = 1 To 4
URL = "https://www.worldathletics.org/records/toplists/sprints/100-metres/outdoor/men/senior/2020?page=" & NumPage & ""
XMLPage.Open "Get", URL, False
XMLPage.setRequestHeader "Content-Type", "text/xml"
XMLPage.send
HTMLDoc.body.innerHTML = XMLPage.responseText
Set Records = HTMLDoc.getElementById("toplists").getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
For Each Record In Records
Set HTMLIms = Record.getElementsByTagName("td")
For Each HTMLIm In HTMLIms
Sheets("Sheet1").Cells(RowNum, 1).Value = HTMLIms.Item(0).innerText
Sheets("Sheet1").Cells(RowNum, 2).Value = HTMLIms.Item(1).innerText
Sheets("Sheet1").Cells(RowNum, 3).Value = HTMLIms.Item(2).innerText
Sheets("Sheet1").Cells(RowNum, 4).Value = HTMLIms.Item(3).innerText
Sheets("Sheet1").Cells(RowNum, 5).Value = HTMLIms.Item(4).innerText
Sheets("Sheet1").Cells(RowNum, 6).Value = HTMLIms.Item(5).innerText
Sheets("Sheet1").Cells(RowNum, 7).Value = HTMLIms.Item(6).innerText
Sheets("Sheet1").Cells(RowNum, 9).Value = HTMLIms.Item(8).innerText
Sheets("Sheet1").Cells(RowNum, 10).Value = HTMLIms.Item(9).innerText
Sheets("Sheet1").Cells(RowNum, 11).Value = HTMLIms.Item(10).innerText
Next HTMLIm
RowNum = RowNum + 1
Next Record
Next NumPage
End Sub
但是当我想插入一个代码来获取运动员的 id 时遇到了问题(这也可以单独使用):
'Athletes' codes
RowNum = 1
Set HTMLIms = HTMLDoc.getElementsByTagName("a")
For Each HTMLIm In HTMLIms
If Left(HTMLIm.getAttribute("href"), 24) = "about:/athletes/athlete=" Then
Sheets("Sheet1").Cells(RowNum, 12).Value = Right(HTMLIm.getAttribute("href"), Len(HTMLIm.getAttribute("href")) - _
(InStr(HTMLIm.getAttribute("href"), "=")))
RowNum = RowNum + 1
End If
Next HTMLIm
有人能帮我先把第二个代码插入进去吗?
提前致谢。
试试这个一次性包含 ID 号。为了让您在第 3 列中的不同数字旁边获得 +
或 -
符号,我使用了一个小技巧。您可以合并其余部分,因为我在此处粘贴了相关部分。
部分合并代码:
Sub WebScraping()
Const Url As String = "https://www.worldathletics.org/records/toplists/sprints/100-metres/outdoor/men/senior/2020?page="
Dim XMLPage As New XMLHTTP60, HTMLDoc As New HTMLDocument
Dim Record As Object, NumPage As Integer
Dim RowNum As Integer, Ws As Worksheet, I&
Set Ws = ThisWorkbook.Worksheets("Sheet1")
For NumPage = 1 To 2
XMLPage.Open "Get", Url & NumPage, False
XMLPage.setRequestHeader "Content-Type", "text/xml"
XMLPage.send
HTMLDoc.body.innerHTML = XMLPage.responseText
For Each Record In HTMLDoc.getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
RowNum = RowNum + 1: Ws.Cells(RowNum, 1).Value = Record.getElementsByTagName("td").Item(0).innerText
Ws.Cells(RowNum, 2).Value = Record.getElementsByTagName("td").Item(1).innerText
Ws.Cells(RowNum, 3).Value = "'" & Record.getElementsByTagName("td").Item(2).innerText
Ws.Cells(RowNum, 4).Value = Record.getElementsByTagName("td").Item(3).innerText
On Error Resume Next
Ws.Cells(RowNum, 5).Value = Split(Record.getElementsByTagName("td").Item(3).getElementsByTagName("a")(0).getAttribute("href"), "=")(1)
On Error GoTo 0
Next Record
Next NumPage
End Sub