使用标签导航 html table 提取内部文本 (vbs)

Navigate html table using tags to extract inner text (vbs)

我尝试获取的数据示例(查看源代码):https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=8B7B2B88CE03567735560917596FA6BD

来源看起来像这样:

<table width="100%" border="0" cellspacing="0" cellpadding="2" bgcolor="#e9edf2">
<tr>
    <td width="2%" height="20"><font size="1">&nbsp;</font></td>
    <td width="30%" valign="top"><font face="verdana" size="-1">Name:</font></td>
    <td width="48%">
        <font face="verdana" size="-1"><b>ABBOTT, HUGH ALLAN&nbsp;<small>(Primary Name)</small></b></font></td>

</tr>

我不知道如何有意导航到源代码的这一部分。我需要以某种方式告诉它在所有带有 TD 的标签中搜索 "Name:",如果它存在,请给我标签

的下一个内文
<b> 

在这种情况下是 ABBOTT,HUGH ALLAN。我需要这种类型的方法,因为随着项目位置的变化,使用 item(#) 查找特定文本并不可靠。我尝试了几种不同的方法,但到目前为止没有成功。比如"for each td in ...."类型的方法。我最终可以找到正确的项目,但它在多个记录中并不可靠。

TIA

编辑 - 这是我拥有的接近的代码:

这假定您有一个包含此 path/name 的文本文件(尽管它说电子邮件在路径中抓取,在这种情况下不是试图获取电子邮件):"C:\Emailgrab\myfloridalicense.com\Extract URL\AgentURLsRaw_Clean.txt" 包含这些链接:

https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=2BEA648A94BA20C0C989E9E0071103AF https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=AB8F78E2835A25C2D443B09DE9CDD16F https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=A6DBB6CDEE69A637B4497807A1FE45A6 https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=8B7B2B88CE03567735560917596FA6BD https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=27A84B8EF8F96AD4F09AF94774456A39

还假设您在 path/name 处有此头文件:"C:\Emailgrab\myfloridalicense.com\Extract URL\Complete.csv" 以及这些头文件:

姓氏,First/Mid姓名,地址,执照号码,执照状态(a),执照状态(b),执照到期,URL

VBS代码:

Dim URLFile 
Dim fName
set ie = createobject("internetexplorer.application")
IE.Visible = True
Set objShell = CreateObject("WScript.Shell")
Set WshShell = WScript.CreateObject("WScript.Shell") 
set fso = createobject("scripting.filesystemobject")
Set URLFile = fso.OpenTextFile("C:\Emailgrab\myfloridalicense.com\Extract URL\AgentURLsRaw_Clean.txt")

do while not URLFile.AtEndOfStream 
fName = URLFile.ReadLine()

ie.navigate fName
do until ie.readystate = 4 : wscript.sleep 10: loop 

For Each elm In IE.Document.getElementsByTagName("table")
If elm.getElementsByTagName("TABLE").Length = 16 THEN

name = elm.document.getElementsByTagName("b").item(3).innertext
address = elm.document.getElementsByTagName("b").item(5).innertext
licensenumber = elm.document.getElementsByTagName("b").item(12).innertext
licensestatus = elm.document.getElementsByTagName("b").item(13).innertext
licenseexp = elm.document.getElementsByTagName("b").item(15).innertext

myData =  name & ", " & replace(address, vbCrLf, "") & ", " & licensenumber & ", " & licensestatus & ", " & licenseexp & "," & fName & ", " & vbCrlf

set fso = createobject("scripting.filesystemobject")
set ts = fso.opentextfile("C:\Emailgrab\myfloridalicense.com\Extract URL\Complete.csv",8,true) 
ts.write myData 
ts.close 

end if
next

loop

Wscript.Echo "All Data Copied!"

我对我的方法很满意,但我不太明白为什么它适用于某些链接而不适用于其他链接。您可以在 CSV 中看到,在某些情况下,数据从许可证号开始关闭,这会关闭其余列。此外,在某些情况下,数据中有逗号,因此我添加了一个额外的 LicenseStatus 列来尝试解决这个问题。我只需要一种(最好是简单的)比使用 Item(#) 计数更可靠的方法。希望这可以帮助。谢谢!

更新

这里是基于HTTP请求和RegExp解析为Dictionary的实现,它以带有URL的txt文件作为输入,并将结果写入csv文件:

arrUrls = Split(ReadTextFile("C:\Emailgrab\myfloridalicense.com\Extract URL\AgentURLsRaw_Clean.txt", 0), vbCrLf)
sCsv = ""
For Each sUrl in arrUrls
    XmlHttpRequest "GET", sUrl, "", "", "", sRespText
    HtmlSimplify sRespText
    ParseToDict "<tr><td></td><td>([^<]*?)</td><td>([^<]*?)(?:</td>){0,1}</tr>", sRespText, oResult
    sCsv = sCsv & """" & oResult("Name:") & """" & ","
    sCsv = sCsv & """" & oResult("Main Address:") & """" & ","
    sCsv = sCsv & """" & oResult("License Number:") & """" & ","
    sCsv = sCsv & """" & oResult("Status:") & """" & ","
    sCsv = sCsv & """" & oResult("Expires:") & """" & ","
    sCsv = sCsv & """" & sUrl & """" & vbCrLf
Next
WriteTextFile sCsv, "C:\Emailgrab\myfloridalicense.com\Extract URL\Complete.csv", 0
WScript.Echo "All Data Copied!"

Function ReadTextFile(sPath, iFormat)
    With CreateObject("Scripting.FileSystemObject").OpenTextFile(sPath, 1, False, iFormat)
        ReadTextFile = ""
        If Not .AtEndOfStream Then ReadTextFile = .ReadAll
        .Close
    End With
End Function

Sub WriteTextFile(sCont, sPath, iFormat)
    With CreateObject("Scripting.FileSystemObject").OpenTextFile(sPath, 2, True, iFormat)
        .Write(sCont)
        .Close
    End With
End Sub

Sub HtmlSimplify(sCont)
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = "(<[\w\/^<]*)[\s\S]*?>"
        sCont = .Replace(sCont, ">")
        .Pattern = "(?:<font>|</font>|<b>|</b>|<small>|</small>|<br>)"
        sCont = .Replace(sCont, "")
        .Pattern = "&nbsp;"
        sCont = .Replace(sCont, " ")
        .Pattern = "[\f\n\r\t\v]"
        sCont = .Replace(sCont, "")
        .Pattern = " +"
        sCont = .Replace(sCont, " ")
        .Pattern = "> <"
        sCont = .Replace(sCont, "><")
    End With
End Sub

Sub XmlHttpRequest(sMethod, sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText)
    Dim arrHeader
    With CreateObject("Msxml2.ServerXMLHTTP.3.0")
        .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        .Open sMethod, sUrl, False
        If IsArray(arrSetHeaders) Then
            For Each arrHeader In arrSetHeaders
                .SetRequestHeader arrHeader(0), arrHeader(1)
            Next
        End If
        .Send sFormData
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With
End Sub

Sub ParseToDict(sPattern, sResponse, oDict)
    Dim oMatch, arrSMatches, sSubMatch
    Set oDict = CreateObject("Scripting.Dictionary")
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If Trim(oMatch.SubMatches(0)) <> "" Then oDict(oMatch.SubMatches(0)) = oMatch.SubMatches(1)
        Next
    End With
End Sub

每个网页解析的数据都被放入以项目名称为关键字的字典中。 csv 的内容是通过其名称而不是相对位置引用值来创建的,因此对于任何列出的 URL,所有列都在它们的位置。
此外,我还添加了双引号以避免使用 Current,Inactive(RFC 4180 点 2.6)之类的逗号拆分为单独的列值。
它不是任何网站的通用解决方案。对于另一个网站,您必须创建适当的 RegExp 模式进行解析。在这种情况下,我在 HtmlSimplify 之后将 HTML 内容保存到文件中,并检查它以找出所需的模式。更重要的是,要将这种方法用于更复杂的站点,您可能必须通过切断无关的 HTML 部分从而在几个步骤中进行解析(可能是循环的)从而缩小搜索范围。

来源

考虑这个 VBS 解析器:

' sCont contains table HTML at the moment

With CreateObject("VBScript.RegExp")
    .Global = True
    .MultiLine = True
    .IgnoreCase = True
    ' content simplification
    .Pattern = "(<[\w\/^<]*)[\s\S]*?>"
    sCont = .Replace(sCont, ">")
    .Pattern = "(?:<font>|</font>|<b>|</b>|<small>|</small>|<br>)"
    sCont = .Replace(sCont, "")
    .Pattern = "&nbsp;"
    sCont = .Replace(sCont, " ")
    .Pattern = "[\f\n\r\t\v]"
    sCont = .Replace(sCont, "")
    .Pattern = " +"
    sCont = .Replace(sCont, " ")
    .Pattern = "> <"
    sCont = .Replace(sCont, "><")
End With
ParseToArray "<tr><td></td><td>([^<]*?)</td><td>([^<]*?)(?:</td>){0,1}</tr>", sCont, arrResult

' continue processing of arrResult
WScript.Echo arrResult(0)(1) ' eg shows name
' ...

Sub ParseToArray(sPattern, sResponse, arrMatches)
    Dim oMatch, arrSMatches, sSubMatch
    arrMatches = Array()
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            arrSMatches = Array()
            For Each sSubMatch in oMatch.SubMatches
                PushItem arrSMatches, sSubMatch
            Next
            PushItem arrMatches, arrSMatches
        Next
    End With
End Sub

Sub PushItem(arrList, varItem)
    ReDim Preserve arrList(UBound(arrList) + 1)
    arrList(UBound(arrList)) = varItem
End Sub

sCont开头如下:

<table cellspacing="0" cellpadding="1" width="100%" border="0" bgcolor="#b6c9dc"><tr><td>
    <table width="100%" border="0" cellspacing="0" cellpadding="3"><tr>
        <td width="32%"><font face="verdana" size="-1"><b>Licensee Information</b></font></td>
    </tr></table>
    <table width="100%" border="0" cellspacing="0" cellpadding="2" bgcolor="#e9edf2">
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">Name:</font></td>
        <td width="48%">
            <font face="verdana" size="-1"><b>ABBOTT, HUGH ALLAN&nbsp;<small>(Primary Name)</small></b></font></td>

    </tr>
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1"></font></td>
        <td width="48%"><font face="verdana" size="-1"><b>&nbsp;<small>(DBA Name)</small></b></font></td>
    </tr>
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">Main Address:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b>318 TURKEY CREEK 
        <br>ALACHUA&nbsp;&nbsp;Florida&nbsp;&nbsp;32615</b></font>
        </td>
    </tr>

    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">County:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b>ALACHUA
    </tr>



    <tr><td>&nbsp;</td></tr>
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">License Mailing:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b>318 TURKEY CREEK

        <br>ALACHUA&nbsp;&nbsp;FL &nbsp;&nbsp;32615</b></font>

    </tr>

    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">County:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b>ALACHUA
    </tr>

    <tr><td>&nbsp;</td></tr>
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">LicenseLocation:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b> 

        <br>&nbsp;&nbsp; &nbsp;&nbsp; </b></font>

    </tr>

    </table>

</td></tr></table>

它获取子数组的数组,如下面的本地调试器屏幕截图所示:

我无法完全实施您的方法,但您回复中的一些元素确实引导我找到了可行的解决方案。如果 "License Number" 列包含许可证号以外的内容,我使用 IF 语句的组合将每个项目移动到必要数量的列上。还使用您的建议让替换功能处理逗号问题。那是我的问题的一部分,就是把事情搞砸了。

Dim URLFile 
Dim fName
set ie = createobject("internetexplorer.application")
IE.Visible = True
Set objShell = CreateObject("WScript.Shell")
Set WshShell = WScript.CreateObject("WScript.Shell") 
set fso = createobject("scripting.filesystemobject")
Set URLFile = fso.OpenTextFile("C:\Emailgrab\myfloridalicense.com\Extract URL\AgentURLsRaw_Clean.txt")

do while not URLFile.AtEndOfStream 
fName = URLFile.ReadLine()

ie.navigate fName
do until ie.readystate = 4 : wscript.sleep 10: loop 

For Each elm In IE.Document.getElementsByTagName("table")
If elm.getElementsByTagName("TABLE").Length = 16 THEN

name = elm.document.getElementsByTagName("b").item(3).innertext
address = replace(elm.document.getElementsByTagName("b").item(5).innertext,","," ")

'License Number
If InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Real Estate Broker or Sales") THEN 
licensenumber = elm.document.getElementsByTagName("b").item(14).innertext 
ELSEIf InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Broker") or InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Sales Associate") THEN
licensenumber = elm.document.getElementsByTagName("b").item(13).innertext
Else licensenumber = elm.document.getElementsByTagName("b").item(12).innertext
End If

'License Status
If InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Real Estate Broker or Sales") THEN 
licensestatus = elm.document.getElementsByTagName("b").item(15).innertext 
ELSEIf InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Broker") or InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Sales Associate") THEN
licensestatus = elm.document.getElementsByTagName("b").item(14).innertext
Else licensestatus = elm.document.getElementsByTagName("b").item(13).innertext
End If

'License Exp
If InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Real Estate Broker or Sales") THEN 
licenseexp = elm.document.getElementsByTagName("b").item(17).innertext 
ELSEIf InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Broker") or InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Sales Associate") THEN
licenseexp = elm.document.getElementsByTagName("b").item(16).innertext
Else licenseexp = elm.document.getElementsByTagName("b").item(15).innertext
End If

myData =  name & ", " & replace(address, vbCrLf, "") & ", " & replace(licensenumber, ","," ") & ", " & replace(licensestatus, ","," ") & ", " & replace(licenseexp,","," ") & "," & fName & ", " & vbCrlf

set fso = createobject("scripting.filesystemobject")
set ts = fso.opentextfile("C:\Emailgrab\myfloridalicense.com\Extract URL\Complete.csv",8,true) 
ts.write myData 
ts.close 

end if
next

loop

Wscript.Echo "All Data Copied!"