使用 MSXML2.XMLHTTP 提取包装器值
Wrapper value extraction using MSXML2.XMLHTTP
我目前正在使用 MSXML2.XMLHTTP 从网页中提取数据。使用我的代码,除 class 为 rvw-cnt-tx 的数据之外,
所有数据都已成功提取。我想从以下 url 中提取评论数的值 43。
url="https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759?boutiqueId=555784&merchantId=4171"
网页html:
<a href="/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759/yorumlar?boutiqueId=555784&merchantId=4171&v=11-12-yas" class="rvw-cnt-tx">43 Reviews </a>
我的代码
' Fetch the page with a synchronous GET (third argument False = not asynchronous).
Set http = CreateObject("MSXML2.XMLHTTP")
http.Open "GET", url, False
http.Send
' Load the raw response text into the document body so it can be queried as DOM.
' NOTE(review): html is not created in this snippet - presumably an HTMLDocument set up earlier; confirm.
html.body.innerHTML = http.ResponseText
html1 = html.body.innerHTML
brand = html.body.innerText
Dim reviews As String
' Text of the first element matched by the class string "breadcrumb full-width".
cat = html.getElementsByClassName("breadcrumb full-width")(0).innerText
' NOTE(review): the accompanying explanation says the review count is retrieved dynamically,
' so this element may be missing from ResponseText and the (0) lookup may fail - confirm.
reviews = html.getElementsByClassName("rvw-cnt-tx")(0).innerText
该值是动态加载的,因此不在初始响应中。但是,您可以将 /yorumlar
拼接到当前 url 路径的末尾以到达评论页面,该值在那里是静态存在的。我使用正则表达式提取文本中表示评论数的数字部分。
此处使用 html.querySelector(".title h3")
是为了将正则表达式的搜索范围限制在包含该值的节点的文本内。
Option Explicit
' Prints the product's review count to the Immediate window.
'
' Synchronously requests the product's /yorumlar page, loads the response into
' an MSHTML document, and extracts the first run of digits/commas from the text
' of the ".title h3" node. If the request fails, the node is missing, or no
' number is found, a diagnostic message is printed instead of raising a
' runtime error.
'
' Requires: Tools > References > Microsoft HTML Object Library
Public Sub GetReviewCount()
    Const REVIEWS_URL As String = "https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759/yorumlar"

    Dim re As Object, html As MSHTML.HTMLDocument, xhr As Object
    Dim titleNode As Object, matches As Object

    Set re = CreateObject("VBScript.RegExp")
    Set xhr = CreateObject("MSXML2.XMLHTTP")
    Set html = New MSHTML.HTMLDocument

    ' One or more digits/commas, e.g. "43" or "1,234".
    re.Pattern = "([0-9,]+)"

    With xhr
        .Open "GET", REVIEWS_URL, False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
    End With

    ' Do not parse an error page as if it were the reviews page.
    If xhr.Status <> 200 Then
        Debug.Print "Request failed: " & xhr.Status & " " & xhr.statusText
        Exit Sub
    End If

    html.body.innerHTML = xhr.responseText

    ' querySelector returns Nothing when the selector matches no element.
    Set titleNode = html.querySelector(".title h3")
    If titleNode Is Nothing Then
        Debug.Print "Review heading (.title h3) not found"
        Exit Sub
    End If

    ' Restrict the regex to the heading text; guard against an empty match set.
    Set matches = re.Execute(titleNode.innerText)
    If matches.Count = 0 Then
        Debug.Print "No review count found in: " & titleNode.innerText
        Exit Sub
    End If

    Debug.Print matches(0).SubMatches(0)
End Sub
要正确获取 cat 变量:
Option Explicit
Public Sub GetCat()
    ' Requires: Tools > References > Microsoft HTML Object Library
    '
    ' Downloads the product page and prints the text of its breadcrumb items,
    ' joined with " > ", to the Immediate window.
    Dim request As Object
    Dim doc As MSHTML.HTMLDocument
    Dim crumbs As Object
    Dim trail As String
    Dim idx As Long

    Set request = CreateObject("MSXML2.XMLHTTP")
    Set doc = New MSHTML.HTMLDocument

    request.Open "GET", "https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759?boutiqueId=555784&merchantId=4171", False
    request.setRequestHeader "User-Agent", "Mozilla/5.0"
    request.send
    doc.body.innerHTML = request.responseText

    Set crumbs = doc.querySelectorAll(".breadcrumb .breadcrumb-item")

    ' Prefix every item except the first with the separator.
    For idx = 0 To crumbs.Length - 1
        If idx > 0 Then trail = trail & " > "
        trail = trail & crumbs.Item(idx).innerText
    Next idx

    Debug.Print trail
End Sub
我目前正在使用 MSXML2.XMLHTTP 从网页中提取数据。使用我的代码,除 class 为 rvw-cnt-tx 的数据之外,
所有数据都已成功提取。我想从以下 url 中提取评论数的值 43。
url="https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759?boutiqueId=555784&merchantId=4171"
网页html:
<a href="/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759/yorumlar?boutiqueId=555784&merchantId=4171&v=11-12-yas" class="rvw-cnt-tx">43 Reviews </a>
我的代码
' Fetch the page with a synchronous GET (third argument False = not asynchronous).
Set http = CreateObject("MSXML2.XMLHTTP")
http.Open "GET", url, False
http.Send
' Load the raw response text into the document body so it can be queried as DOM.
' NOTE(review): html is not created in this snippet - presumably an HTMLDocument set up earlier; confirm.
html.body.innerHTML = http.ResponseText
html1 = html.body.innerHTML
brand = html.body.innerText
Dim reviews As String
' Text of the first element matched by the class string "breadcrumb full-width".
cat = html.getElementsByClassName("breadcrumb full-width")(0).innerText
' NOTE(review): the accompanying explanation says the review count is retrieved dynamically,
' so this element may be missing from ResponseText and the (0) lookup may fail - confirm.
reviews = html.getElementsByClassName("rvw-cnt-tx")(0).innerText
该值是动态加载的,因此不在初始响应中。但是,您可以将 /yorumlar
拼接到当前 url 路径的末尾以到达评论页面,该值在那里是静态存在的。我使用正则表达式提取文本中表示评论数的数字部分。
此处使用 html.querySelector(".title h3")
是为了将正则表达式的搜索范围限制在包含该值的节点的文本内。
Option Explicit
' Prints the product's review count to the Immediate window.
'
' Synchronously requests the product's /yorumlar page, loads the response into
' an MSHTML document, and extracts the first run of digits/commas from the text
' of the ".title h3" node. If the request fails, the node is missing, or no
' number is found, a diagnostic message is printed instead of raising a
' runtime error.
'
' Requires: Tools > References > Microsoft HTML Object Library
Public Sub GetReviewCount()
    Const REVIEWS_URL As String = "https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759/yorumlar"

    Dim re As Object, html As MSHTML.HTMLDocument, xhr As Object
    Dim titleNode As Object, matches As Object

    Set re = CreateObject("VBScript.RegExp")
    Set xhr = CreateObject("MSXML2.XMLHTTP")
    Set html = New MSHTML.HTMLDocument

    ' One or more digits/commas, e.g. "43" or "1,234".
    re.Pattern = "([0-9,]+)"

    With xhr
        .Open "GET", REVIEWS_URL, False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
    End With

    ' Do not parse an error page as if it were the reviews page.
    If xhr.Status <> 200 Then
        Debug.Print "Request failed: " & xhr.Status & " " & xhr.statusText
        Exit Sub
    End If

    html.body.innerHTML = xhr.responseText

    ' querySelector returns Nothing when the selector matches no element.
    Set titleNode = html.querySelector(".title h3")
    If titleNode Is Nothing Then
        Debug.Print "Review heading (.title h3) not found"
        Exit Sub
    End If

    ' Restrict the regex to the heading text; guard against an empty match set.
    Set matches = re.Execute(titleNode.innerText)
    If matches.Count = 0 Then
        Debug.Print "No review count found in: " & titleNode.innerText
        Exit Sub
    End If

    Debug.Print matches(0).SubMatches(0)
End Sub
要正确获取 cat 变量:
Option Explicit
Public Sub GetCat()
    ' Requires: Tools > References > Microsoft HTML Object Library
    '
    ' Downloads the product page and prints the text of its breadcrumb items,
    ' joined with " > ", to the Immediate window.
    Dim request As Object
    Dim doc As MSHTML.HTMLDocument
    Dim crumbs As Object
    Dim trail As String
    Dim idx As Long

    Set request = CreateObject("MSXML2.XMLHTTP")
    Set doc = New MSHTML.HTMLDocument

    request.Open "GET", "https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759?boutiqueId=555784&merchantId=4171", False
    request.setRequestHeader "User-Agent", "Mozilla/5.0"
    request.send
    doc.body.innerHTML = request.responseText

    Set crumbs = doc.querySelectorAll(".breadcrumb .breadcrumb-item")

    ' Prefix every item except the first with the separator.
    For idx = 0 To crumbs.Length - 1
        If idx > 0 Then trail = trail & " > "
        trail = trail & crumbs.Item(idx).innerText
    Next idx

    Debug.Print trail
End Sub