所有数据都没有从网站导入到 excel
All data not getting imported from website to excel
我想将餐厅名称、phone 号码和网站等餐厅数据导入到 excel,但不幸的是我只得到一页(第一页),但是我想要任何范围内的数据在每个页面的单独工作表中定义第 1 页到第 3 页或第 2 页到第 5 页。样本输出文件附在我暂时得到的输出中。
Sub Webscraping()
'Declaration
Dim ie As InternetExplorer
Dim ht As HTMLDocument
'Initialization
Set ie = New InternetExplorer
ie.Visible = True
'Open a url
ie.navigate ("https://www.yellowpages.com/atlanta-ga/restaurants")
'Set ht = ie.document
'MsgBox ht.getElementsByClassName("ot_lrp_bname_free_center")
'Alternative Approach for wait
Do Until ie.readyState = READYSTATE_COMPLETE
DoEvents
Loop
'Initialize the document
Set ht = ie.document
'Set elems = ht.getElementsByClassName("list-title")
Set elems = ht.getElementsByClassName("business-name")
'Set elems = ht.getElementsByClassName("website-lrp icon-link ot_lrp_website_text_free_center")
i = 1
For Each elem In elems
Sheet1.Cells(i, 1).Value = elem.innerText
i = i + 1
'Debug.Print (elem.innerText)
Next
Set elems = ht.getElementsByClassName("phone primary")
i = 1
For Each elem In elems
Sheet1.Cells(i, 2).Value = elem.innerText
i = i + 1
'Debug.Print (elem.innerText)
Next
Set elems = ht.getElementsByClassName("links")
i = 1
For Each elem In elems
Set link = elem.ChildNodes.Item(0)
Sheet1.Cells(i, 3).Value = link.href
i = i + 1
Next
'Set internetdata = ie.document
'Set div_result = internetdata.getElementById("ctl00_gvMain_ctl03_hlTitle")
'Set header_links = div_result.getElementsByTagName("a")
'For Each h In header_links
'Set link = h.ChildNodes.Item(0)
'Worksheets("Stocks").Cells(Range("L" & Rows.Count).End(xlUp).Row + 1, 12) = link.href
End Sub
这是已经完成但难以获得所需输出的工作
使用 VBA 执行此操作的唯一方法是检查是否存在 "Next" 按钮并单击它,如果它存在:
这是它的HTML:
<a class="next ajax-page" href="/atlanta-ga/restaurants?page=2" data-page="2" data-analytics="{"click_id":132}" data-remote="true" data-impressed="1">Next</a>
这不是 "science fiction" 可以用 VBA 完成的,但是,有商业 RPA 解决方案,它们提供 "out of the box" 完全适合此任务的功能 - UiPath、AutomationAnywhere、BluePrism。 Python 的 "beautiful soup" 也会做得很好。
页面连接到 url 的末尾。我会在给定页面范围内循环使用 xhr 问题请求,并用正则表达式输出包含所需信息的 json(它在脚本标记之一中)。这种方法非常快,并且超过了正则表达式的使用。我也 re-use 可能的对象。
我使用 jsonconverter.bas 来处理 json 并解析出所需的信息(json 中有更多信息,包括评论)。下载 .bas 并添加到项目中名为 JsonConverter 的模块后,您需要转到 VBE > 工具 > 参考 > 添加对 Microsoft 脚本运行时的参考。
辅助函数用于测试要写出的页面是否已经存在或需要创建,以及将json结果写入数组并一次性将数组转储到sheet(效率增益)。保留结构,以便在需要更多信息时很容易扩展检索到的信息,例如评论。
可能需要做一些工作来确保不存在的页面能够正常工作。我目前只是简单地使用响应的状态码来过滤掉这些。
备注:
作为完整性检查,我会使用 InternetExplorer 转到第 1 页并提取总结果数。我会将其除以每页结果(目前为 30)来计算总页数。这会给我 lbound 和 ubound 值(可能页面的最小值和最大值)。然后切换到 xmlhttp 来实际检索。请参阅末尾的附加辅助函数。
代码:
Option Explicit
Public Sub GetRestuarantInfo()
Dim s As String, re As Object, p As String, page As Long, r As String, json As Object
Const START_PAGE As Long = 2
Const END_PAGE As Long = 4
Const RESULTS_PER_PAGE As Long = 30
p = "\[{""@context"".*?\]"
Set re = CreateObject("VBScript.RegExp")
Application.ScreenUpdating = False
With CreateObject("MSXML2.XMLHTTP")
For page = START_PAGE To END_PAGE
.Open "GET", "https://www.yellowpages.com/atlanta-ga/restaurants?page=" & page, False
.send
If .Status = 200 Then
s = .responseText
r = GetValue(re, s, p)
If r <> "Not Found" Then
Set json = JsonConverter.ParseJson(r)
WriteOutResults page, RESULTS_PER_PAGE, json
End If
End If
Next
End With
Application.ScreenUpdating = True
End Sub
Public Sub WriteOutResults(ByVal page As Long, ByVal RESULTS_PER_PAGE As Long, ByVal json As Object)
Dim sheetName As String, results(), r As Long, headers(), ws As Worksheet
ReDim results(1 To RESULTS_PER_PAGE, 1 To 3)
sheetName = "page" & page
headers = Array("Name", "Website", "Tel")
If Not WorksheetExists(sheetName) Then
Set ws = ThisWorkbook.Worksheets.Add
ws.Name = sheetName
Else
ThisWorkbook.Worksheets(sheetName).Cells.ClearContents
End If
With ws
Dim review As Object
For Each review In json 'collection of dictionaries
r = r + 1
results(r, 1) = review("name")
results(r, 2) = review("url")
results(r, 3) = review("telephone")
Next
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Function GetValue(ByVal re As Object, inputString As String, ByVal pattern As String) As String
'https://regex101.com/r/M9oRON/1
With re
.Global = True
.MultiLine = True
.IgnoreCase = False
.pattern = pattern
If .Test(inputString) Then
GetValue = .Execute(inputString)(0)
Else
GetValue = "Not found"
End If
End With
End Function
Public Function WorksheetExists(ByVal sName As String) As Boolean '@Rory
WorksheetExists = Evaluate("ISREF('" & sName & "'!A1)")
End Function
return 页数
的辅助函数
'VBE > Tools > References: Microsoft Internet Controls
Public Function GetNumberOfPages(ByVal RESULTS_PER_PAGE As Long) As Variant
Dim ie As Object, totalResults As Long
On Error GoTo errhand
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = False
.Navigate2 "https://www.yellowpages.com/atlanta-ga/restaurants?page=1"
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
totalResults = Replace$(Replace$(.querySelector(".pagination p").innerText, "We found", vbNullString), "results", vbNullString)
GetNumberOfPages = totalResults / RESULTS_PER_PAGE
ie.Quit
Exit Function
End With
End With
errhand:
If Err.Number <> 0 Then
GetNumberOfPages = CVErr(xlErrNA)
End If
End Function
正则表达式解释:
试试看 here.
我想将餐厅名称、phone 号码和网站等餐厅数据导入到 excel,但不幸的是我只得到一页(第一页),但是我想要任何范围内的数据在每个页面的单独工作表中定义第 1 页到第 3 页或第 2 页到第 5 页。样本输出文件附在我暂时得到的输出中。
Sub Webscraping()
'Declaration
Dim ie As InternetExplorer
Dim ht As HTMLDocument
'Initialization
Set ie = New InternetExplorer
ie.Visible = True
'Open a url
ie.navigate ("https://www.yellowpages.com/atlanta-ga/restaurants")
'Set ht = ie.document
'MsgBox ht.getElementsByClassName("ot_lrp_bname_free_center")
'Alternative Approach for wait
Do Until ie.readyState = READYSTATE_COMPLETE
DoEvents
Loop
'Initialize the document
Set ht = ie.document
'Set elems = ht.getElementsByClassName("list-title")
Set elems = ht.getElementsByClassName("business-name")
'Set elems = ht.getElementsByClassName("website-lrp icon-link ot_lrp_website_text_free_center")
i = 1
For Each elem In elems
Sheet1.Cells(i, 1).Value = elem.innerText
i = i + 1
'Debug.Print (elem.innerText)
Next
Set elems = ht.getElementsByClassName("phone primary")
i = 1
For Each elem In elems
Sheet1.Cells(i, 2).Value = elem.innerText
i = i + 1
'Debug.Print (elem.innerText)
Next
Set elems = ht.getElementsByClassName("links")
i = 1
For Each elem In elems
Set link = elem.ChildNodes.Item(0)
Sheet1.Cells(i, 3).Value = link.href
i = i + 1
Next
'Set internetdata = ie.document
'Set div_result = internetdata.getElementById("ctl00_gvMain_ctl03_hlTitle")
'Set header_links = div_result.getElementsByTagName("a")
'For Each h In header_links
'Set link = h.ChildNodes.Item(0)
'Worksheets("Stocks").Cells(Range("L" & Rows.Count).End(xlUp).Row + 1, 12) = link.href
End Sub
这是已经完成但难以获得所需输出的工作
使用 VBA 执行此操作的唯一方法是检查是否存在 "Next" 按钮并单击它,如果它存在:
这是它的HTML:
<a class="next ajax-page" href="/atlanta-ga/restaurants?page=2" data-page="2" data-analytics="{"click_id":132}" data-remote="true" data-impressed="1">Next</a>
这不是 "science fiction" 可以用 VBA 完成的,但是,有商业 RPA 解决方案,它们提供 "out of the box" 完全适合此任务的功能 - UiPath、AutomationAnywhere、BluePrism。 Python 的 "beautiful soup" 也会做得很好。
页面连接到 url 的末尾。我会在给定页面范围内循环使用 xhr 问题请求,并用正则表达式输出包含所需信息的 json(它在脚本标记之一中)。这种方法非常快,并且超过了正则表达式的使用。我也 re-use 可能的对象。
我使用 jsonconverter.bas 来处理 json 并解析出所需的信息(json 中有更多信息,包括评论)。下载 .bas 并添加到项目中名为 JsonConverter 的模块后,您需要转到 VBE > 工具 > 参考 > 添加对 Microsoft 脚本运行时的参考。
辅助函数用于测试要写出的页面是否已经存在或需要创建,以及将json结果写入数组并一次性将数组转储到sheet(效率增益)。保留结构,以便在需要更多信息时很容易扩展检索到的信息,例如评论。
可能需要做一些工作来确保不存在的页面能够正常工作。我目前只是简单地使用响应的状态码来过滤掉这些。
备注:
作为完整性检查,我会使用 InternetExplorer 转到第 1 页并提取总结果数。我会将其除以每页结果(目前为 30)来计算总页数。这会给我 lbound 和 ubound 值(可能页面的最小值和最大值)。然后切换到 xmlhttp 来实际检索。请参阅末尾的附加辅助函数。
代码:
Option Explicit
Public Sub GetRestuarantInfo()
Dim s As String, re As Object, p As String, page As Long, r As String, json As Object
Const START_PAGE As Long = 2
Const END_PAGE As Long = 4
Const RESULTS_PER_PAGE As Long = 30
p = "\[{""@context"".*?\]"
Set re = CreateObject("VBScript.RegExp")
Application.ScreenUpdating = False
With CreateObject("MSXML2.XMLHTTP")
For page = START_PAGE To END_PAGE
.Open "GET", "https://www.yellowpages.com/atlanta-ga/restaurants?page=" & page, False
.send
If .Status = 200 Then
s = .responseText
r = GetValue(re, s, p)
If r <> "Not Found" Then
Set json = JsonConverter.ParseJson(r)
WriteOutResults page, RESULTS_PER_PAGE, json
End If
End If
Next
End With
Application.ScreenUpdating = True
End Sub
Public Sub WriteOutResults(ByVal page As Long, ByVal RESULTS_PER_PAGE As Long, ByVal json As Object)
Dim sheetName As String, results(), r As Long, headers(), ws As Worksheet
ReDim results(1 To RESULTS_PER_PAGE, 1 To 3)
sheetName = "page" & page
headers = Array("Name", "Website", "Tel")
If Not WorksheetExists(sheetName) Then
Set ws = ThisWorkbook.Worksheets.Add
ws.Name = sheetName
Else
ThisWorkbook.Worksheets(sheetName).Cells.ClearContents
End If
With ws
Dim review As Object
For Each review In json 'collection of dictionaries
r = r + 1
results(r, 1) = review("name")
results(r, 2) = review("url")
results(r, 3) = review("telephone")
Next
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Function GetValue(ByVal re As Object, inputString As String, ByVal pattern As String) As String
'https://regex101.com/r/M9oRON/1
With re
.Global = True
.MultiLine = True
.IgnoreCase = False
.pattern = pattern
If .Test(inputString) Then
GetValue = .Execute(inputString)(0)
Else
GetValue = "Not found"
End If
End With
End Function
Public Function WorksheetExists(ByVal sName As String) As Boolean '@Rory
WorksheetExists = Evaluate("ISREF('" & sName & "'!A1)")
End Function
return 页数
的辅助函数'VBE > Tools > References: Microsoft Internet Controls
Public Function GetNumberOfPages(ByVal RESULTS_PER_PAGE As Long) As Variant
Dim ie As Object, totalResults As Long
On Error GoTo errhand
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = False
.Navigate2 "https://www.yellowpages.com/atlanta-ga/restaurants?page=1"
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
totalResults = Replace$(Replace$(.querySelector(".pagination p").innerText, "We found", vbNullString), "results", vbNullString)
GetNumberOfPages = totalResults / RESULTS_PER_PAGE
ie.Quit
Exit Function
End With
End With
errhand:
If Err.Number <> 0 Then
GetNumberOfPages = CVErr(xlErrNA)
End If
End Function
正则表达式解释:
试试看 here.