网页抓取 - 标签问题

Web Scraping - Problems with tags

我是网络数据抓取的新手,也在使用 For...Next。我正在尝试从网站获取数据(所有页面),但代码似乎是错误的,因为我收到错误 91。这是代码:

Dim ie As Object

Sub connect()
Set ie = CreateObject("INTERNETEXPLORER.APPLICATION")
ie.NAVIGATE "https://www.worldathletics.org/world-rankings/100m/men"
ie.Visible = True
End Sub

Sub id_tr_td_for()

Range("a1:z10000").ClearContents

For i = 0 To 10
For j = 0 To 5
Cells(i + 1, j + 1) = ie.document.getElementById("toplists").getElementsByTagName("tr")(i).getElementsByTagName("td")(j).innerText
Next
Next
End Sub

有人可以帮我解决这个问题并告诉我谁可以列出所有页面吗?

谢谢。

我不确定错误来自哪里,我也知道了。

下面的代码应该会有帮助,它将指定页面的 table 的内容打印到调试 window.

以下代码应将 selected 页面的所有数据复制到 sheet1

您需要在 VBA 编辑器中添加几个引用才能使用它。 (工具菜单,参考,然后找到并 select 它们)Microsoft HTML Object LibraryMicrosoft Internet Controls

Const MaxPage = 2 ' set to 26 (or however many there are) - at 2 for testing purposes
Dim Browser As InternetExplorer

Sub Start()
Dim Page As Integer: Page = 1 ' start at page 1
Dim PageDocument As IHTMLDocument
Dim RecordRow As IHTMLElementCollection
Dim RecordItem As IHTMLElement

Dim Sheet As Worksheet: Set Sheet = ThisWorkbook.Worksheets("Sheet1") ' output sheet
If Browser Is Nothing Then
    Set Browser = New InternetExplorer
End If
Dim oRow As Integer: oRow = 2 ' begin output at row 2 (account for header)
Dim Record As Integer
For Page = 1 To MaxPage
    LoadPage Page
        For Record = 0 To 99 ' zero index, 100 items (1-99)
        Set PageDocument = Browser.Document
        Set RecordRow = PageDocument.getElementById("toplists").getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")(Record).getElementsByTagName("td")
        Sheet.Cells(oRow, 1).Value = Trim(RecordRow(0).innerText)
        Sheet.Cells(oRow, 2).Value = Trim(RecordRow(1).innerText)
        Sheet.Cells(oRow, 3).Value = Trim(RecordRow(2).innerText)
        Sheet.Cells(oRow, 4).Value = Trim(RecordRow(3).innerText)
        Sheet.Cells(oRow, 5).Value = Trim(RecordRow(4).innerText)
        Sheet.Cells(oRow, 6).Value = Trim(RecordRow(5).innerText)
        oRow = oRow + 1
    Next Record
Next Page
Browser.Quit
End Sub

Sub LoadPage(ByVal PageNumber As Integer)
Debug.Print "Navigating to Page #" & CStr(PageNumber)
Browser.navigate "https://www.worldathletics.org/world-rankings/100m/men?page=" & CStr(PageNumber)
While Browser.readyState <> 4 Or Browser.Busy: DoEvents: Wend
Debug.Print "Navigation Complete"
End Sub

更新代码

Index Out-of-Bound错误可能是硬编码索引导致的,如果一个页面没有99条记录就会失败,如果一条记录没有5个字段就会失败。下面的代码取消了索引,只是擦除它找到的每一行和单元格。您不应该得到索引错误,但输出可能参差不齐。

进一步更新

462 错误是由 Browser.Quit 引起的。这会关闭浏览器但不会设置对 Nothing 的引用,因此当您再次 运行 代码时,它会尝试使用不存在的浏览器。最后明确将其设置为空可以解决此问题。

竞争对手一栏没有link,整行有一个数据-url,由别的东西处理。但是 URL 可以很容易地访问。

Sub NewStart()
Dim PageDocument As IHTMLDocument

Dim Records As IHTMLElementCollection
Dim Record As IHTMLElement
Dim RecordItems As IHTMLElementCollection
Dim RecordItem As IHTMLElement

Dim OutputRow As Integer: OutputRow = 2
Dim OutputColumn As Integer

Dim Page As Integer

Dim Sheet As Worksheet: Set Sheet = ThisWorkbook.Worksheets("Sheet1")
If Browser Is Nothing Then
    Set Browser = New InternetExplorer
    Browser.Visible = True
End If
For Page = 1 To MaxPage
    LoadPage Page
    Set PageDocument = Browser.Document
    Set Records = PageDocument.getElementById("toplists").getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
    For Each Record In Records
        Set RecordItems = Record.getElementsByTagName("td")
        OutputColumn = 1
        For Each RecordItem In RecordItems
            Sheet.Cells(OutputRow, OutputColumn).Value = Trim(RecordItem.innerText)
            OutputColumn = OutputColumn + 1
        Next RecordItem
        Sheet.Cells(OutputRow, OutputColumn).Value = "http://worldathletics.org/" & Record.getAttribute("data-athlete-url") ' This will add the link after the last column
        OutputRow = OutputRow + 1
    Next Record
Next Page
Browser.Quit
Set Browser = Nothing ' This will fix the 462 error
End Sub