通过 Msxml2.ServerXMLHTTP.6.0 在 Excel VBA 上设置网络 table 抓取工具

Setting a web table scraper by Msxml2.ServerXMLHTTP.6.0 on Excel VBA

我需要做一个网络数据抓取工具。

  1. 我需要登录网站:用户名,密码,点击登录按钮
  2. 单击第二个按钮
  3. 等待页面加载,这里是有问题的 Table。 table是通话记录,动态添加新内容,所以一直在刷新。
  4. 我想从 table 内容中排除一个表单并限制粘贴到 Excel 的行数。

我用 InternetExplorer.Application 代码使它工作,但我需要切换到 MSXML2.XMLHTTP 代码,因为它很慢。

正在工作 InternetExplorer.Application 版本:

 Sub extractTablesData()
 'we define the essential variables

 Dim IE As Object, obj As Object
 Dim r As Integer, c As Integer, t As Integer
 Dim elemCollection As Object


 'add the "Microsoft Internet Controls" reference in your VBA Project indirectly
 Set IE = CreateObject("InternetExplorer.Application")

 With IE
 .Silent = True
 .Visible = True
 .navigate ("https://www.clickphone.ro")

 ' we ensure that the web page downloads completely before we fill the form automatically
 While IE.readyState <> 4
 DoEvents
 Wend
Application.Wait Now + TimeValue("00:00:03")
Set HTMLDoc = IE.document
 HTMLDoc.all.user.Value = "user or email" 'Enter your email/user id here
 HTMLDoc.all.pass.Value = "xXXxXXXxxXXXxx" 'Enter your password here
 'Login Button Click               
 With IE.document

    Set elems = .getElementsByTagName("a")
    For Each e In elems

        If (e.getAttribute("class") = "orange_button") Then
            e.Click
            Exit For
        End If

    Next e

End With

 'Needed Table page Button Click https://www.clickphone.ro/account/istoric_apel_in.html
 While IE.readyState <> 4
 DoEvents
 Wend
Set iedoc = IE.document

Set elems = iedoc.getElementsByClassName("black")(12)
    elems.Click

 ' again ensuring that the web page loads completely before we start scraping data
 While IE.readyState <> 4
 DoEvents
 Wend
 Application.Wait Now + TimeValue("00:00:05")
 Set iedoc = IE.document

'Clearing any unnecessary or old data in Sheet1

 ThisWorkbook.Sheets("Sheet1").Range("A1:K1000").ClearContents

'Scrapping Data and past to Sheet1
 Set elemCollection = IE.document.getElementsByTagName("table")

    For t = 0 To (elemCollection.Length - 1)
        For r = 0 To (elemCollection(t).Rows.Length - 1)
            For c = 0 To (elemCollection(t).Rows(r).Cells.Length - 1)
                ThisWorkbook.Worksheets(1).Cells(r + 1, c + 1) = elemCollection(t).Rows(r).Cells(c).innerText
            Next c
        Next r
    Next t

 End With

 ' cleaning up memory
 Set IE = Nothing

 End Sub

这是我对 MSXMLHTTP 的尝试:

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub Web_Table_Option_One()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.clickphone.ro/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send "userName=USER&password=XXXXxxxxXxxxxXXX"
.Open "GET", "https://www.clickphone.ro/account/istoric_apel_in.html", False 
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
 For lngTable = 0 To objTable.Length - 1
        For lngRow = 0 To objTable(lngTable).Rows.Length - 1
            For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
            Next lngCol
        Next lngRow
        ActRw = ActRw + objTable(lngTable).Rows.Length + 1
    Next lngTable
End Sub

HTML源代码:

对于用户,通过,登录按钮:

<form action="/login.html" id="toploginform" name="toploginform" method="post">
                                                                      <script>
                                            function processLoginForm(){
                                                with (document.toploginform) {
                                                    if (user.value=="Email"){alert('Email/Parola incorecte!'); return false}
                                                    document.getElementById('toploginform').submit();
                                                }
                                            }
                                        </script>

                                                                            <fieldset>
                                            <input name="userlogin" type="hidden" id="userlogin" value="true" />
                                            <span class="text">
                                            <input name="user" type="text" onFocus="if(this.value=='Email'){this.value=''}" onBlur="if(this.value==''){this.value='Email'}" value="Email">
                                            </span> <span class="text">
                                            <input name="pass" type="password" onFocus="if(this.value=='Password'){this.value=''}" onBlur="if(this.value==''){this.value='Password'}" value="Password">
                                            </span> 
                                            <input name="authcode" type="hidden" id="authcode" value="false" />
                                            <span><a href="#" class="orange_button" onClick="return processLoginForm()">Login</a></span>
                                             <span class="links"><a href="/login~parola.html">Am uitat parola</a><br/>
                                                <input class="css-checkbox" id="checkbox2" type="checkbox" name="rememberpass" value="da" />
                                                <label for="checkbox2" name="checkbox2_lbl" class="css-label lite-orange-check">Retin datele?</label>
                                        </span>
                                        </fieldset>                         
                                                                </form>

Table 页面按钮:

<br />&nbsp;<img src="/images/sageata_orange.gif" width="7" height="8" />&nbsp;<a class="black" href="/account/istoric_apel_in.html">Apeluri primite</a>

Table源代码:

<table class="TabelDate" cellspacing="0">
  <thead>
    <tr>
      <th width="130">Data</th>
      <th>Sursa</th>
      <th>Destinatie</th>
      <th>Durata</th>
      <th class="ultima">Status</th>
    </tr>
  </thead>
  <tr class="u">    <td class="prima">19-03-2017 17:31:16</td><td><font color="green"><form name="form24-1489937476.41719" method="post" action="">0720145931 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0720145931</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0720145931.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0720145931" /></form></font></td><td align="center"><font color="green">0371780444</font></td><td align="center"><font color="green">00:00:07</font></td>
            <td class="ultima" align="center"><font color="green">Apel preluat</font></td></tr>  <tr class="gri">    <td class="prima">19-03-2017 17:30:48</td><td><font color="green"><form name="form24-1489937448.41715" method="post" action="">0728409617 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0728409617</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0728409617.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0728409617" /></form></font></td><td align="center"><font color="green">0371780655</font></td><td align="center"><font color="green">00:00:07</font></td>

我设法部分解决了我的问题。现在我可以使用 XmlHttp 登录并检索我需要的 table。我将 post 工作代码放在这里,这样每个人都可以使用它(我没有为这段代码取得任何功劳,我是在不同论坛的帮助下完成的)

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub CallLog()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
    For lngTable = 0 To objTable.Length - 1
        For lngRow = 0 To objTable(lngTable).Rows.Length - 1
            For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
            Next lngCol
        Next lngRow
        ActRw = ActRw + objTable(lngTable).Rows.Length + 1
    Next lngTable
End Sub

现在我只剩下两个问题了... 我怎样才能从 parent "table" 得到 children "table"(我要找的 table 在一个更大的 table 中,见下文源代码),我只想获取第一行,但不包括行中的 "form"(它是 href link) Source Code

我如何持续获取此信息(此 table 是动态的,每次有人打电话给我时它都会更新,第一行会持续更新)

我的工作代码的 2.0 版:

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub CallLog()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
ThisWorkbook.Sheets("LogClickPhone").Range("A2") = objTable(1).Rows(1).Cells(0).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("B2") = objTable(1).Rows(1).Cells(1).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("C2") = objTable(1).Rows(1).Cells(2).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("D2") = objTable(1).Rows(1).Cells(3).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("E2") = objTable(1).Rows(1).Cells(4).innerText
End Sub

我设法只获得了我需要的行,但是速度很慢,需要 38.5 秒才能完成。我想我最好使用 MSXML2.DOMDocument.6.0 结构来获取我需要的文本。但我不知道该怎么做。 题: 我如何才能使此代码自动化,使其每 60 秒左右变为 运行? 发送