解析 HTML getElementsByTagName 未返回所有单元格
Parsing HTML getElementsByTagName not returning all cells
我有一些代码可用于从网页中抓取数据,但网页已更改,无法再使用。该代码应该对内部交易 table 进行计算,但是 getelementsbytagname("td")
不再 returns 所有单元格。
我猜是因为它是一个嵌入页面或其他东西的页面,但我无法解决它,我对 html 不是很熟悉。示例网页是 gurufocus.com/stock/lmb/insider
。
我的代码如下:
Sub getStatements()
Dim wb As Object
Dim doc As Object
Dim incomeStmtURLs As Variant
Dim sURL As String
Dim allCells As IHTMLElementCollection
Dim aCell As HTMLTableCell
Dim i As Integer
Dim loginBoxData As String
Application.DisplayAlerts = False
Call ToggleEvents(False)
incomeStmtURLs = Range("Sheet1!h1:h2").Value
For i = 1 To UBound(incomeStmtURLs)
Set wb = CreateObject("internetExplorer.Application")
sURL = incomeStmtURLs(i, 1)
wb.navigate sURL
wb.Visible = False
While wb.Busy
Application.Wait Now + #12:00:01 AM#
DoEvents
Wend
'HTML document
Set doc = wb.document
On Error GoTo err_clear
' gets all cell and looks for date format,
' goes from new transaction to old so once gets to older than a year it exits for loop
' checks nextSibling from date is a buy and if so does calculations, by taking further value sin row
' for priceThisTime have to get rid of $ symbol for calculation
Set allCells = doc.getElementsByTagName("td")
For Each aCell In allCells
MsgBox (aCell.innerText)
If aCell.innerText Like "####-##-##" = True Then
If CDate(aCell.innerText) >= Date - 365 Then
If aCell.NextSibling.innerText = "Buy" Then
buys = buys + 1
sharesThisTime = CDec(aCell.NextSibling.NextSibling.innerText)
priceThisTime = aCell.NextSibling.NextSibling.NextSibling.NextSibling.innerText
totalPrice = totalPrice + (sharesThisTime * CDec(Right(priceThisTime, Len(priceThisTime) - 1)))
shareCount = shareCount + sharesThisTime
End If
Else
Exit For
End If
End If
Next aCell
Sheet6.Cells(i + 1, 2) = buys
If (shareCount <> 0) Then
Sheet6.Cells(i + 1, 3).Value = totalPrice / shareCount
End If
buys = 0
totalPrice = 0
shareCount = 0
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Next i
Call ToggleEvents(True)
End Sub
以下内容专门针对 table 并检索所有 td 元素。我认为您的逻辑可能适用于列号,但无论如何(以防万一我也将 table 设置为变量)。
我将每页的结果设置为 100,但您可以注释掉该行
Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls
Public Sub GetData()
Dim ie As New InternetExplorer, lastDropDrownItemIndex As Long, dropDown As Object, t As Date
Const MAX_WAIT_SEC As Long = 10
With ie
.Visible = True
.Navigate2 "https://www.gurufocus.com/stock/lmb/insider"
While .Busy Or .readyState < 4: DoEvents: Wend
t = Timer
Do
Set dropDown = .document.querySelectorAll(".el-dropdown-menu__item")
lastDropDrownItemIndex = dropDown.Length - 1
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While lastDropDrownItemIndex < 1
If dropDown.Length = 0 Then Exit Sub
dropDown.item(lastDropDrownItemIndex).Click 'comment me out if don't want 100 results per page
Dim tds As Object, table As Object
Set tds = .document.getElementsByClassName("data-table")(0).getElementsByTagName("td")
Set table = .document.getElementsByClassName("data-table")
Stop
.Quit
End With
End Sub
我有一些代码可用于从网页中抓取数据,但网页已更改,无法再使用。该代码应该对内部交易 table 进行计算,但是 getelementsbytagname("td")
不再 returns 所有单元格。
我猜是因为它是一个嵌入页面或其他东西的页面,但我无法解决它,我对 html 不是很熟悉。示例网页是 gurufocus.com/stock/lmb/insider
。
我的代码如下:
Sub getStatements()
Dim wb As Object
Dim doc As Object
Dim incomeStmtURLs As Variant
Dim sURL As String
Dim allCells As IHTMLElementCollection
Dim aCell As HTMLTableCell
Dim i As Integer
Dim loginBoxData As String
Application.DisplayAlerts = False
Call ToggleEvents(False)
incomeStmtURLs = Range("Sheet1!h1:h2").Value
For i = 1 To UBound(incomeStmtURLs)
Set wb = CreateObject("internetExplorer.Application")
sURL = incomeStmtURLs(i, 1)
wb.navigate sURL
wb.Visible = False
While wb.Busy
Application.Wait Now + #12:00:01 AM#
DoEvents
Wend
'HTML document
Set doc = wb.document
On Error GoTo err_clear
' gets all cell and looks for date format,
' goes from new transaction to old so once gets to older than a year it exits for loop
' checks nextSibling from date is a buy and if so does calculations, by taking further value sin row
' for priceThisTime have to get rid of $ symbol for calculation
Set allCells = doc.getElementsByTagName("td")
For Each aCell In allCells
MsgBox (aCell.innerText)
If aCell.innerText Like "####-##-##" = True Then
If CDate(aCell.innerText) >= Date - 365 Then
If aCell.NextSibling.innerText = "Buy" Then
buys = buys + 1
sharesThisTime = CDec(aCell.NextSibling.NextSibling.innerText)
priceThisTime = aCell.NextSibling.NextSibling.NextSibling.NextSibling.innerText
totalPrice = totalPrice + (sharesThisTime * CDec(Right(priceThisTime, Len(priceThisTime) - 1)))
shareCount = shareCount + sharesThisTime
End If
Else
Exit For
End If
End If
Next aCell
Sheet6.Cells(i + 1, 2) = buys
If (shareCount <> 0) Then
Sheet6.Cells(i + 1, 3).Value = totalPrice / shareCount
End If
buys = 0
totalPrice = 0
shareCount = 0
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Next i
Call ToggleEvents(True)
End Sub
以下内容专门针对 table 并检索所有 td 元素。我认为您的逻辑可能适用于列号,但无论如何(以防万一我也将 table 设置为变量)。
我将每页的结果设置为 100,但您可以注释掉该行
Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls
Public Sub GetData()
Dim ie As New InternetExplorer, lastDropDrownItemIndex As Long, dropDown As Object, t As Date
Const MAX_WAIT_SEC As Long = 10
With ie
.Visible = True
.Navigate2 "https://www.gurufocus.com/stock/lmb/insider"
While .Busy Or .readyState < 4: DoEvents: Wend
t = Timer
Do
Set dropDown = .document.querySelectorAll(".el-dropdown-menu__item")
lastDropDrownItemIndex = dropDown.Length - 1
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While lastDropDrownItemIndex < 1
If dropDown.Length = 0 Then Exit Sub
dropDown.item(lastDropDrownItemIndex).Click 'comment me out if don't want 100 results per page
Dim tds As Object, table As Object
Set tds = .document.getElementsByClassName("data-table")(0).getElementsByTagName("td")
Set table = .document.getElementsByClassName("data-table")
Stop
.Quit
End With
End Sub