网页抓取的内循环设计
Inner Loop design for webscraping
我想将餐厅名称、phone 号码、网站和地址等餐厅数据导入到 excel,但不幸的是,我得到的是赞助结果,也没有得到网站和完整地址,因为它在当我们点击酒店名称时在内页。我在平台的一些帮助下创建了一个代码,但它没有帮助。请纠正我的代码中的问题。网站:https://www.yelp.com/searchcflt=restaurants&find_loc=San%20Francisco%2C%20CA&start=
这是我的代码:
Sub GetInfo()
Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
Dim Http As New XMLHTTP60, Html As New HTMLDocument, Htmldoc As New HTMLDocument, page&, I&
For page = 0 To 1 ' this is where you change the last number for the pages to traverse
With Http
.Open "GET", URL & page * 30, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("[class*='searchResult']")
For I = 0 To .Length - 1
Htmldoc.body.innerHTML = .Item(I).outerHTML
On Error Resume Next
r = r + 1: Cells(r, 1) = Htmldoc.querySelector("[class*='heading--h3'] > a").innerText
Cells(r, 2) = Htmldoc.querySelector("[class*='container'] > [class*='display--inline-block']").innerText
' Cells(r, 3) = Htmldoc.querySelector("[class*='container'] > address").innerText
'Cells(r, 4) = Htmldoc.querySelector("[class*='container'] > address").NextSibling.innerText
'Inner loop creation
Cells(r, 5) = Htmldoc.querySelector("[class*='container'] > website").href ' Extract from window after clicking on hotel name
Cells(r, 6) = Htmldoc.querySelector("[class*='container'] > fulladdress").innerText ' Extract from window after clicking on hotel name
On Error GoTo 0
Next I
End With
Next page
End Sub
这是一种让您从其内页解析结果的方法。我无法再访问该网页以进一步帮助您。但是,试一试。我想它会起作用:
Sub GetInfo()
Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
Const base$ = "https://www.yelp.com"
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim oTitle$, oPhone As Object, Htmldoc As New HTMLDocument
Dim R&, newUrl$, I&, oWeb As Object, page&, oAddress As Object
[A1:D1] = [{"Name","Phone","Address","Website"}]
For page = 1 To 3 'this is where you change the last number for this script to traverse
With Http
.Open "GET", URL & page * 30, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("[class*='searchResult'] [class*='heading--h3'] > a")
For I = 0 To .Length - 1
If Not InStr(.item(I).getAttribute("href"), "/adredir?") > 0 Then
oTitle = .item(I).innerText
newUrl = Replace(.item(I).getAttribute("href"), "about:", base)
With Http
.Open "GET", newUrl, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Htmldoc.body.innerHTML = .responseText
End With
R = R + 1: Cells(R + 1, 1) = oTitle
Set oPhone = Htmldoc.querySelector(".biz-phone")
If Not oPhone Is Nothing Then
Cells(R + 1, 2) = oPhone.innerText
End If
Set oAddress = Htmldoc.querySelector(".map-box-address")
If Not oAddress Is Nothing Then
Cells(R + 1, 3) = WorksheetFunction.Clean(oAddress.innerText)
End If
Set oWeb = Htmldoc.querySelector(".biz-website > a")
If Not oWeb Is Nothing Then
Cells(R + 1, 4) = oWeb.innerText
End If
End If
Next I
End With
Next page
End Sub
顺便说一句,广告已经被踢掉了。
您可以使用免费的 API 从 business_search 端点获取前 50 名。在查询字符串中传递排序参数以获得最高评价。
使用 json 解析器,例如 jsonconverter.bas 来处理响应。在名为 JsonConverter 的标准模块中安装来自 link 的代码后,转到 VBE > 工具 > 参考 > 添加对 Microsoft 脚本运行时的参考。
API指令是here. You need to set up a test app, which requires some basic user info, and verify your email. You will then receive an API key for authentication在授权header中传递,如下图
还有其他信息 returned,您可以根据需要进行解析。
Option Explicit
Public Sub GetTopRestuarants()
Dim json As Object, headers(), r As Long, c As Long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://api.yelp.com/v3/businesses/search?term=restuarant&location=san-francisco&limit=50&sort_by=rating", False
.setRequestHeader "Authorization", "Bearer yourAPIkey"
.send
Set json = JsonConverter.ParseJson(.responseText)("businesses")
headers = Array("Restaurant name", "phone", "website", "address")
Dim results(), item As Object
ReDim results(1 To json.Count, 1 To UBound(headers) + 1)
For Each item In json
r = r + 1
results(r, 1) = item("name")
results(r, 2) = item("phone")
results(r, 3) = item("url")
Dim subItem As Variant, address As String
address = vbNullString
For Each subItem In item("location")("display_address")
address = address & Chr$(32) & subItem
Next
results(r, 4) = Trim$(address)
Next
End With
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
示例前 20 名,共 50 名 return编辑:
买者自负
请注意,指定 sort_by 是对 Yelp 的搜索的一个建议(并非严格执行),它最多考虑 return 的多个输入参数相关结果。例如,评级排序并不是严格按照评级值排序,而是根据考虑评级数量的调整评级值排序,类似于贝叶斯平均。这是为了防止通过一次评论向企业倾斜结果。
我想将餐厅名称、phone 号码、网站和地址等餐厅数据导入到 excel,但不幸的是,我得到的是赞助结果,也没有得到网站和完整地址,因为它在当我们点击酒店名称时在内页。我在平台的一些帮助下创建了一个代码,但它没有帮助。请纠正我的代码中的问题。网站:https://www.yelp.com/searchcflt=restaurants&find_loc=San%20Francisco%2C%20CA&start=
这是我的代码:
Sub GetInfo()
Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
Dim Http As New XMLHTTP60, Html As New HTMLDocument, Htmldoc As New HTMLDocument, page&, I&
For page = 0 To 1 ' this is where you change the last number for the pages to traverse
With Http
.Open "GET", URL & page * 30, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("[class*='searchResult']")
For I = 0 To .Length - 1
Htmldoc.body.innerHTML = .Item(I).outerHTML
On Error Resume Next
r = r + 1: Cells(r, 1) = Htmldoc.querySelector("[class*='heading--h3'] > a").innerText
Cells(r, 2) = Htmldoc.querySelector("[class*='container'] > [class*='display--inline-block']").innerText
' Cells(r, 3) = Htmldoc.querySelector("[class*='container'] > address").innerText
'Cells(r, 4) = Htmldoc.querySelector("[class*='container'] > address").NextSibling.innerText
'Inner loop creation
Cells(r, 5) = Htmldoc.querySelector("[class*='container'] > website").href ' Extract from window after clicking on hotel name
Cells(r, 6) = Htmldoc.querySelector("[class*='container'] > fulladdress").innerText ' Extract from window after clicking on hotel name
On Error GoTo 0
Next I
End With
Next page
End Sub
这是一种让您从其内页解析结果的方法。我无法再访问该网页以进一步帮助您。但是,试一试。我想它会起作用:
Sub GetInfo()
Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
Const base$ = "https://www.yelp.com"
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim oTitle$, oPhone As Object, Htmldoc As New HTMLDocument
Dim R&, newUrl$, I&, oWeb As Object, page&, oAddress As Object
[A1:D1] = [{"Name","Phone","Address","Website"}]
For page = 1 To 3 'this is where you change the last number for this script to traverse
With Http
.Open "GET", URL & page * 30, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("[class*='searchResult'] [class*='heading--h3'] > a")
For I = 0 To .Length - 1
If Not InStr(.item(I).getAttribute("href"), "/adredir?") > 0 Then
oTitle = .item(I).innerText
newUrl = Replace(.item(I).getAttribute("href"), "about:", base)
With Http
.Open "GET", newUrl, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Htmldoc.body.innerHTML = .responseText
End With
R = R + 1: Cells(R + 1, 1) = oTitle
Set oPhone = Htmldoc.querySelector(".biz-phone")
If Not oPhone Is Nothing Then
Cells(R + 1, 2) = oPhone.innerText
End If
Set oAddress = Htmldoc.querySelector(".map-box-address")
If Not oAddress Is Nothing Then
Cells(R + 1, 3) = WorksheetFunction.Clean(oAddress.innerText)
End If
Set oWeb = Htmldoc.querySelector(".biz-website > a")
If Not oWeb Is Nothing Then
Cells(R + 1, 4) = oWeb.innerText
End If
End If
Next I
End With
Next page
End Sub
顺便说一句,广告已经被踢掉了。
您可以使用免费的 API 从 business_search 端点获取前 50 名。在查询字符串中传递排序参数以获得最高评价。
使用 json 解析器,例如 jsonconverter.bas 来处理响应。在名为 JsonConverter 的标准模块中安装来自 link 的代码后,转到 VBE > 工具 > 参考 > 添加对 Microsoft 脚本运行时的参考。
API指令是here. You need to set up a test app, which requires some basic user info, and verify your email. You will then receive an API key for authentication在授权header中传递,如下图
还有其他信息 returned,您可以根据需要进行解析。
Option Explicit
Public Sub GetTopRestuarants()
Dim json As Object, headers(), r As Long, c As Long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://api.yelp.com/v3/businesses/search?term=restuarant&location=san-francisco&limit=50&sort_by=rating", False
.setRequestHeader "Authorization", "Bearer yourAPIkey"
.send
Set json = JsonConverter.ParseJson(.responseText)("businesses")
headers = Array("Restaurant name", "phone", "website", "address")
Dim results(), item As Object
ReDim results(1 To json.Count, 1 To UBound(headers) + 1)
For Each item In json
r = r + 1
results(r, 1) = item("name")
results(r, 2) = item("phone")
results(r, 3) = item("url")
Dim subItem As Variant, address As String
address = vbNullString
For Each subItem In item("location")("display_address")
address = address & Chr$(32) & subItem
Next
results(r, 4) = Trim$(address)
Next
End With
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
示例前 20 名,共 50 名 return编辑:
买者自负
请注意,指定 sort_by 是对 Yelp 的搜索的一个建议(并非严格执行),它最多考虑 return 的多个输入参数相关结果。例如,评级排序并不是严格按照评级值排序,而是根据考虑评级数量的调整评级值排序,类似于贝叶斯平均。这是为了防止通过一次评论向企业倾斜结果。