Excel VBA Web 抓取表没有响应; MSXML2.ServerXMLhttp.6.0 方法
Excel VBA Web Scraping Tables Not Responding; MSXML2.ServerXMLhttp.6.0 Method
我使用 Excel VBA 构建了一个网络抓取工具,它执行以下操作:
- 从名为 "CIK_Links" 的 sheet 中的 link 列表中一次读取一个 link。
- 它转到 link,读取它的响应文本,如果在该响应文本中它找到一个超级link,其内部HTML 读取,“(所有基金列表和 Classes/Contracts for",然后它将 link 保存到一个变量中并创建另一个 MSXML2.ServerXMLhttp.6.0 对象。
- 创建对象后,它会在响应文本中找到第 3 个 table,循环并找到那个 table 的特定元素,然后将这些值输出到 Excel sheet 调用了 "Parsed_Tables"。
- 代码然后转到 "CIK_Links" sheet 上的下一个 link 并重复步骤 1-3。注意:sheet中有大约640,000个link,但我运行ning循环一次只有几千个。是的,我已经尝试 运行 一次将其设置为 10、20、100,但问题仍然存在。
我遇到的问题是,一旦我点击 运行,我就会收到消息 "Excel is not responding",但代码仍在后台 运行。考虑到我要求它执行的操作,该代码运行完美且速度非常快,但显然我需要进一步优化它以防止它过载 Excel。找到一些方法来避免在每次迭代时将解析的 HTML 写入 Excel 会很有帮助,但是,我不知道如何以我需要的格式写入数据而不做所以。数组解决方案会很棒,但是在将数组中的数据写入 Excel 之前,必须对数组中的数据进行大量操作,甚至可能 subsetting/slicing 数组。我需要帮助,因为我已经用尽了我所有的知识,并且在构建这个应用程序的过程中我做了很多研究。我什至愿意使用 python 和 beautifulsoup 库等其他技术,我只是不知道如何将 table 数据以我需要的格式输出到 csv 文件.提前致谢!
这是文件: TrustTable_Parse.xlsb
免责声明: 我有一个 B.S。在数学方面,我通过用每种语言实施我自己的许多项目,自学了如何使用 VBA、SQL 和 R 进行编码。重点是,如果我的代码看起来很奇怪,或者您认为我做事效率低下,那是因为我已经多年没有编码了,而且我不知道更好,大声笑。
下面是我的代码:
Option Explicit
Sub Final_Parse_TrustTables()
Dim HTML As New HTMLDocument
Dim http As Object
Dim links As Object
Dim Url, Trst As String
Dim link As HTMLHtmlElement
Dim i As Long
Dim http2 As Object
Dim HTML2 As New HTMLDocument
Dim tbl As Object
Dim ele As HTMLHtmlElement
Dim wb As Workbook
Dim ws, ws_2 As Worksheet
'sets ScreenUpdating to false _
turns off event triggers, ect.
OptimizeCode_Begin
Set wb = ThisWorkbook
Set ws = wb.Sheets("CIK_Links")
'Creates this object to see if Trust table exists
Set http = CreateObject("MSXML2.ServerXMLhttp.6.0")
'Loops through the list of URL's _
in the 'CIK_Links' Sheet
For i = 2 To 3000
'List of URL's
Url = ws.Range("C1").Cells(i, 1).Value2
'Gets webpage to check _
if Trust table exists
On Error Resume Next
http.Open "GET", Url, False
http.send
'Runs code If the website sent a valid response to our request _
for FIRST http object
If Err.Number = 0 Then
If http.Status = 200 Then
'If the website sent a valid response to our request _
for SECOND http object "http2"
If Err.Number = 0 Then
If http2.Status = 200 Then
HTML.body.innerHTML = http.responseText
Set links = HTML.getElementsByTagName("a")
'Determines if there is a trust table and if so _
then it creates the http2 object and gets the _
trust table responsetext
Trst = "(List all Funds and Classes/Contracts for"
For Each link In links
'Link is returned in responsetext with "about:/" at _
the beginning instead of https://www.sec.gov/, so I _
used this to replace the "about:/"
If InStr(link.innerHTML, Trst) > 0 Then
link = Replace(link, "about:/", "https://www.sec.gov/")
Debug.Print link
'Creates this object to go to trust table webpage
Set http2 = CreateObject("MSXML2.ServerXMLhttp.6.0")
'Gets webpage to parse _
trust table
On Error Resume Next
http2.Open "GET", link, False
http2.send
HTML2.body.innerHTML = http2.responseText
'If there exists a Trust, then this refers to the _
3rd table on the trust table webpage; _
note ("table")(3)
On Error Resume Next
Set tbl = HTML2.getElementsByTagName("table")(3)
Set ws_2 = wb.Sheets("Parsed_Tables")
With ws_2
For Each ele In tbl.getElementsByTagName("tr")
'First finds rows with Class/Con numbers
If InStr(ele.innerText, "C00") Then
'Pulls Class/Con Numbers, note children(2)
'output to col E sheet
.Cells(Rows.Count, "E"). _
End(xlUp).Offset(1, 0).Value2 = ele.Children(2).innerText
'Outputs Share Class, children(3)
'Output to col F sheet
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, 0).Value2 = ele.Children(3).innerText
'Not not all Funds have Ticker _
so this keeps the module from _
asking for object to be set
On Error Resume Next
'Outputs Ticker to excel
'Reads the last value in Col F and offsets Ticker to _
to show directly in adjacent cel in Col G
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(0, 1).Value2 = ele.Children(4).innerText
'Pulls SIC number
ElseIf InStr(ele.innerText, "S00") Then
'Offsets from col F to be placed in col C
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, -3).Value2 = ele.Children(1).innerText
'Pulls Fund Name
'Offsets from col F to col D
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, -2).Value2 = ele.Children(2).innerText
'Pulls CIK number
ElseIf InStr(ele.Children(0).innerText, "000") Then
'Offset from col F to col A
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, -5).Value2 = ele.Children(0).innerText
'Pulls Trust Name
'Offsets from col F to col B
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, -4).Value2 = ele.Children(1).innerText
End If
'Counts the number of iterations of the loop _
and places it in the lower left corner of the _
workbook
Application.StatusBar = "Current Iteration: " & i
Next
End With
End If
Next
End If
Else
MsgBox "Error loading webpage", vbExclamation, "Alert!!!"
Exit Sub
End If
On Error GoTo 0
End If
Else
MsgBox "Error loading webpage", vbExclamation, "Alert!!!"
Exit Sub
End If
On Error GoTo 0
If i Mod 1000 = 0 Then
ActiveWorkbook.Save
Application.Wait (Now + TimeValue("0:00:03"))
End If
Next i
'sets everything back to normal after running code
OptimizeCode_End
End Sub
以下是 CIK_Links Sheet 中列出的 link 的示例:
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=3&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=11&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=13&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=14&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=17&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=18&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2110&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2135&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2145&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2663&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2664&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2691&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2768&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=3521&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=3794&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=4123&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=4405&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=4568&owner=include&count=02
我认为您的代码不会 运行 除非至少有一个 On Error Resume Next 隐藏了一些 运行time 错误。例如,在实例化 http2 对象之前,您有 If http2.Status = 200 Then
。
下面是一个绝对可以改进的方法,但使用 class 来保存 xmlhttp 对象并提供检索所需信息的方法。您想要的 table 布局使得解析实际网页特别复杂。你可能希望坚持下去。我选择按原样使用 table 结构。也许,这至少可以为您提供一个框架。您可以将自定义优化子调用添加到其中。
待办事项:
看看是否可以对一个超大的结果数组进行估计,该数组可以容纳所有结果而不是一个数组数组,以便可以在 go 中完成写出。如果我有时间我会做这个修改。
ClassclsHTTP
Option Explicit
Private http As Object
Const SEARCH_TERM As String = "(List all Funds and Classes/Contracts"
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal Url As String, Optional ByVal search As Boolean = False) As String
Dim sResponse As String
searchTermFound = False
With http
.Open "GET", Url, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
If InStr(sResponse, SEARCH_TERM) > 0 Then searchTermFound = True
GetString = sResponse
End With
End Function
Public Function GetLink(ByVal html As HTMLDocument) As String
Dim i As Long, nodeList As Object
Set nodeList = html.querySelectorAll("a")
GetLink = vbNullString
For i = 0 To nodeList.Length - 1
If InStr(nodeList.item(i).innerText, SEARCH_TERM) > 0 Then
GetLink = Replace$(nodeList.item(i).href, "about:/", "https://www.sec.gov/")
Exit For
End If
Next
End Function
Public Function GetInfo(ByVal html As HTMLDocument) As Variant
Dim CIK As String, table As HTMLTable, tables As Object, tRows As Object
Dim arr(), tr As Object, td As Object, r As Long, c As Long
Set tables = html.querySelectorAll("table")
If tables.Length > 3 Then
CIK = "'" & html.querySelector(".search").innerText
Set table = tables.item(3)
Set tRows = table.getElementsByTagName("tr")
ReDim arr(1 To tRows.Length, 1 To 6)
Dim numColumns As Long, numBlanks As Long
For Each tr In tRows
numColumns = tr.getElementsByTagName("td").Length
r = r + 1: c = 2: numBlanks = 0
If r > 4 Then
arr(r - 4, 1) = CIK
For Each td In tr.getElementsByTagName("td")
If td.innerText = vbNullString Then numBlanks = numBlanks + 1
arr(r - 4, c) = td.innerText
c = c + 1
Next td
If numBlanks = numColumns Then Exit For
End If
Next
Else
ReDim arr(1, 1)
GetInfo = arr
Exit Function
End If
arr = Application.Transpose(arr)
ReDim Preserve arr(1 To 6, 1 To r - 4)
arr = Application.Transpose(arr)
GetInfo = arr
End Function
标准模块1
Option Explicit
Public searchTermFound As Boolean
Public Sub GetInfo()
Dim wsLinks As Worksheet, links(), link As Long, http As clsHTTP
Dim lastRow As Long, html As HTMLDocument, newURL As String
Set wsLinks = ThisWorkbook.Worksheets("CIK_Links")
Set http = New clsHTTP
Set html = New HTMLDocument
With wsLinks
lastRow = GetLastRow(wsLinks, 3)
If lastRow = 2 Then
ReDim links(1, 1)
links(1, 1) = .Range("C2").Value
Else
links = .Range("C2:C" & lastRow).Value
End If
End With
Dim results(), arr(), i As Long, j As Long
ReDim results(1 To UBound(links, 1))
For link = LBound(links, 1) To UBound(links, 1)
If InStr(links(link, 1), "https://www.sec.gov") > 0 Then
html.body.innerHTML = http.GetString(links(link, 1), True)
If searchTermFound Then
newURL = http.GetLink(html)
html.body.innerHTML = http.GetString(newURL, False)
arr = http.GetInfo(html)
If UBound(arr, 1) > 1 Then
i = i + 1
results(i) = arr
End If
End If
End If
Next
Dim wsOut As Worksheet
Set wsOut = ThisWorkbook.Worksheets("Parsed_Tables")
For j = 1 To i
arr = results(j)
With wsOut
.Cells(GetLastRow(wsOut, 1), 1).Resize(UBound(arr, 1), UBound(arr, 2)) = arr
End With
Next
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
With ws
GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
End With
End Function
我使用 Excel VBA 构建了一个网络抓取工具,它执行以下操作:
- 从名为 "CIK_Links" 的 sheet 中的 link 列表中一次读取一个 link。
- 它转到 link,读取它的响应文本,如果在该响应文本中它找到一个超级link,其内部HTML 读取,“(所有基金列表和 Classes/Contracts for",然后它将 link 保存到一个变量中并创建另一个 MSXML2.ServerXMLhttp.6.0 对象。
- 创建对象后,它会在响应文本中找到第 3 个 table,循环并找到那个 table 的特定元素,然后将这些值输出到 Excel sheet 调用了 "Parsed_Tables"。
- 代码然后转到 "CIK_Links" sheet 上的下一个 link 并重复步骤 1-3。注意:sheet中有大约640,000个link,但我运行ning循环一次只有几千个。是的,我已经尝试 运行 一次将其设置为 10、20、100,但问题仍然存在。
我遇到的问题是,一旦我点击 运行,我就会收到消息 "Excel is not responding",但代码仍在后台 运行。考虑到我要求它执行的操作,该代码运行完美且速度非常快,但显然我需要进一步优化它以防止它过载 Excel。找到一些方法来避免在每次迭代时将解析的 HTML 写入 Excel 会很有帮助,但是,我不知道如何以我需要的格式写入数据而不做所以。数组解决方案会很棒,但是在将数组中的数据写入 Excel 之前,必须对数组中的数据进行大量操作,甚至可能 subsetting/slicing 数组。我需要帮助,因为我已经用尽了我所有的知识,并且在构建这个应用程序的过程中我做了很多研究。我什至愿意使用 python 和 beautifulsoup 库等其他技术,我只是不知道如何将 table 数据以我需要的格式输出到 csv 文件.提前致谢!
这是文件: TrustTable_Parse.xlsb
免责声明: 我有一个 B.S。在数学方面,我通过用每种语言实施我自己的许多项目,自学了如何使用 VBA、SQL 和 R 进行编码。重点是,如果我的代码看起来很奇怪,或者您认为我做事效率低下,那是因为我已经多年没有编码了,而且我不知道更好,大声笑。
下面是我的代码:
Option Explicit
Sub Final_Parse_TrustTables()
Dim HTML As New HTMLDocument
Dim http As Object
Dim links As Object
Dim Url, Trst As String
Dim link As HTMLHtmlElement
Dim i As Long
Dim http2 As Object
Dim HTML2 As New HTMLDocument
Dim tbl As Object
Dim ele As HTMLHtmlElement
Dim wb As Workbook
Dim ws, ws_2 As Worksheet
'sets ScreenUpdating to false _
turns off event triggers, ect.
OptimizeCode_Begin
Set wb = ThisWorkbook
Set ws = wb.Sheets("CIK_Links")
'Creates this object to see if Trust table exists
Set http = CreateObject("MSXML2.ServerXMLhttp.6.0")
'Loops through the list of URL's _
in the 'CIK_Links' Sheet
For i = 2 To 3000
'List of URL's
Url = ws.Range("C1").Cells(i, 1).Value2
'Gets webpage to check _
if Trust table exists
On Error Resume Next
http.Open "GET", Url, False
http.send
'Runs code If the website sent a valid response to our request _
for FIRST http object
If Err.Number = 0 Then
If http.Status = 200 Then
'If the website sent a valid response to our request _
for SECOND http object "http2"
If Err.Number = 0 Then
If http2.Status = 200 Then
HTML.body.innerHTML = http.responseText
Set links = HTML.getElementsByTagName("a")
'Determines if there is a trust table and if so _
then it creates the http2 object and gets the _
trust table responsetext
Trst = "(List all Funds and Classes/Contracts for"
For Each link In links
'Link is returned in responsetext with "about:/" at _
the beginning instead of https://www.sec.gov/, so I _
used this to replace the "about:/"
If InStr(link.innerHTML, Trst) > 0 Then
link = Replace(link, "about:/", "https://www.sec.gov/")
Debug.Print link
'Creates this object to go to trust table webpage
Set http2 = CreateObject("MSXML2.ServerXMLhttp.6.0")
'Gets webpage to parse _
trust table
On Error Resume Next
http2.Open "GET", link, False
http2.send
HTML2.body.innerHTML = http2.responseText
'If there exists a Trust, then this refers to the _
3rd table on the trust table webpage; _
note ("table")(3)
On Error Resume Next
Set tbl = HTML2.getElementsByTagName("table")(3)
Set ws_2 = wb.Sheets("Parsed_Tables")
With ws_2
For Each ele In tbl.getElementsByTagName("tr")
'First finds rows with Class/Con numbers
If InStr(ele.innerText, "C00") Then
'Pulls Class/Con Numbers, note children(2)
'output to col E sheet
.Cells(Rows.Count, "E"). _
End(xlUp).Offset(1, 0).Value2 = ele.Children(2).innerText
'Outputs Share Class, children(3)
'Output to col F sheet
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, 0).Value2 = ele.Children(3).innerText
'Not not all Funds have Ticker _
so this keeps the module from _
asking for object to be set
On Error Resume Next
'Outputs Ticker to excel
'Reads the last value in Col F and offsets Ticker to _
to show directly in adjacent cel in Col G
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(0, 1).Value2 = ele.Children(4).innerText
'Pulls SIC number
ElseIf InStr(ele.innerText, "S00") Then
'Offsets from col F to be placed in col C
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, -3).Value2 = ele.Children(1).innerText
'Pulls Fund Name
'Offsets from col F to col D
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, -2).Value2 = ele.Children(2).innerText
'Pulls CIK number
ElseIf InStr(ele.Children(0).innerText, "000") Then
'Offset from col F to col A
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, -5).Value2 = ele.Children(0).innerText
'Pulls Trust Name
'Offsets from col F to col B
.Cells(Rows.Count, "F"). _
End(xlUp).Offset(1, -4).Value2 = ele.Children(1).innerText
End If
'Counts the number of iterations of the loop _
and places it in the lower left corner of the _
workbook
Application.StatusBar = "Current Iteration: " & i
Next
End With
End If
Next
End If
Else
MsgBox "Error loading webpage", vbExclamation, "Alert!!!"
Exit Sub
End If
On Error GoTo 0
End If
Else
MsgBox "Error loading webpage", vbExclamation, "Alert!!!"
Exit Sub
End If
On Error GoTo 0
If i Mod 1000 = 0 Then
ActiveWorkbook.Save
Application.Wait (Now + TimeValue("0:00:03"))
End If
Next i
'sets everything back to normal after running code
OptimizeCode_End
End Sub
以下是 CIK_Links Sheet 中列出的 link 的示例:
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=3&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=11&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=13&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=14&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=17&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=18&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2110&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2135&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2145&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2663&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2664&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2691&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2768&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=3521&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=3794&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=4123&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=4405&owner=include&count=02
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=4568&owner=include&count=02
我认为您的代码不会 运行 除非至少有一个 On Error Resume Next 隐藏了一些 运行time 错误。例如,在实例化 http2 对象之前,您有 If http2.Status = 200 Then
。
下面是一个绝对可以改进的方法,但使用 class 来保存 xmlhttp 对象并提供检索所需信息的方法。您想要的 table 布局使得解析实际网页特别复杂。你可能希望坚持下去。我选择按原样使用 table 结构。也许,这至少可以为您提供一个框架。您可以将自定义优化子调用添加到其中。
待办事项:
看看是否可以对一个超大的结果数组进行估计,该数组可以容纳所有结果而不是一个数组数组,以便可以在 go 中完成写出。如果我有时间我会做这个修改。
ClassclsHTTP
Option Explicit
Private http As Object
Const SEARCH_TERM As String = "(List all Funds and Classes/Contracts"
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal Url As String, Optional ByVal search As Boolean = False) As String
Dim sResponse As String
searchTermFound = False
With http
.Open "GET", Url, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
If InStr(sResponse, SEARCH_TERM) > 0 Then searchTermFound = True
GetString = sResponse
End With
End Function
Public Function GetLink(ByVal html As HTMLDocument) As String
Dim i As Long, nodeList As Object
Set nodeList = html.querySelectorAll("a")
GetLink = vbNullString
For i = 0 To nodeList.Length - 1
If InStr(nodeList.item(i).innerText, SEARCH_TERM) > 0 Then
GetLink = Replace$(nodeList.item(i).href, "about:/", "https://www.sec.gov/")
Exit For
End If
Next
End Function
Public Function GetInfo(ByVal html As HTMLDocument) As Variant
Dim CIK As String, table As HTMLTable, tables As Object, tRows As Object
Dim arr(), tr As Object, td As Object, r As Long, c As Long
Set tables = html.querySelectorAll("table")
If tables.Length > 3 Then
CIK = "'" & html.querySelector(".search").innerText
Set table = tables.item(3)
Set tRows = table.getElementsByTagName("tr")
ReDim arr(1 To tRows.Length, 1 To 6)
Dim numColumns As Long, numBlanks As Long
For Each tr In tRows
numColumns = tr.getElementsByTagName("td").Length
r = r + 1: c = 2: numBlanks = 0
If r > 4 Then
arr(r - 4, 1) = CIK
For Each td In tr.getElementsByTagName("td")
If td.innerText = vbNullString Then numBlanks = numBlanks + 1
arr(r - 4, c) = td.innerText
c = c + 1
Next td
If numBlanks = numColumns Then Exit For
End If
Next
Else
ReDim arr(1, 1)
GetInfo = arr
Exit Function
End If
arr = Application.Transpose(arr)
ReDim Preserve arr(1 To 6, 1 To r - 4)
arr = Application.Transpose(arr)
GetInfo = arr
End Function
标准模块1
Option Explicit
Public searchTermFound As Boolean
Public Sub GetInfo()
Dim wsLinks As Worksheet, links(), link As Long, http As clsHTTP
Dim lastRow As Long, html As HTMLDocument, newURL As String
Set wsLinks = ThisWorkbook.Worksheets("CIK_Links")
Set http = New clsHTTP
Set html = New HTMLDocument
With wsLinks
lastRow = GetLastRow(wsLinks, 3)
If lastRow = 2 Then
ReDim links(1, 1)
links(1, 1) = .Range("C2").Value
Else
links = .Range("C2:C" & lastRow).Value
End If
End With
Dim results(), arr(), i As Long, j As Long
ReDim results(1 To UBound(links, 1))
For link = LBound(links, 1) To UBound(links, 1)
If InStr(links(link, 1), "https://www.sec.gov") > 0 Then
html.body.innerHTML = http.GetString(links(link, 1), True)
If searchTermFound Then
newURL = http.GetLink(html)
html.body.innerHTML = http.GetString(newURL, False)
arr = http.GetInfo(html)
If UBound(arr, 1) > 1 Then
i = i + 1
results(i) = arr
End If
End If
End If
Next
Dim wsOut As Worksheet
Set wsOut = ThisWorkbook.Worksheets("Parsed_Tables")
For j = 1 To i
arr = results(j)
With wsOut
.Cells(GetLastRow(wsOut, 1), 1).Resize(UBound(arr, 1), UBound(arr, 2)) = arr
End With
Next
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
With ws
GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
End With
End Function