如何在 VBScript 中抓取 <h1> <h2> 并保存到文本文件
How do I scrape <h1> and <h2> and save them to a text file in VBScript
我是新手,正在参照网上的示例编写 VBScript 代码。我正在尝试从网页中获取 <h1> 和 <h2> 的文本并将其保存到文本文件中。下面是网页数据的示例,后面是我的一些失败代码。我在 PC 上运行的是 Windows 7 家庭高级版。
'这是我正在访问的网页 ===========
<body>
<div class stuff
<div id stuff
<div class="header-info">
<h1>The Girl I Love</h1>
<h2>Tony Bennet</h2>
<more div stuff
'这是我的代码==============================
'=== attach to an already running IE instance:
' Shell.Application's Windows() collection is enumerated and the first window whose
' FullName (host executable path) contains "iexplore" (case-insensitive) is kept in ie.
Set app = CreateObject("Shell.Application")
For Each window In app.Windows()
If InStr(1, window.FullName, "iexplore", vbTextCompare) > 0 Then
Set ie = window
Exit For
End If
Next
' NOTE(review): if no matching window is found, ie is never assigned and every
' ie.document line below will raise an error.
'Set up text file to write to
' OpenTextFile arguments: 2 = ForWriting, True = create the file if it is missing,
' -1 = open as Unicode.
Set fso = CreateObject("Scripting.FileSystemObject")
Set f = fso.OpenTextFile("c:\users\kp\desktop\output.txt", 2, True, -1)
'Various Code line tests - and results
' The trailing comment on each line records what that attempt produced for the author.
f.Write ie.document.body 'returns [object HTMLBodyElement]
f.Write ie.document.body.innerText 'returns all body text
' The next two calls return a collection object, so f.Write prints the object's
' default string form rather than any element text.
f.Write ie.document.getElementsByClassName("header-info") 'returns [object HTMLCollection]
' NOTE(review): getElementsByTagName takes the bare tag name ("h1"), not "<h1>" - confirm.
f.Write ie.document.getElementsByTagName("<h1>") 'returns [object HTMLCollection]
' innerText is a property of a single element, not of the collection; an individual
' item has to be selected (by index or with For Each) before reading it.
f.Write ie.document.getElementsByTagName("<h1>").innerText 'FAILS not valid
' NOTE(review): f is never closed in the code shown here.
kpmsg = "you're done"
Wscript.echo kpmsg
完整程序在这里:https://onedrive.live.com/redir?resid=E2F0CE17A268A4FA%21348
' Script-wide handles: the command-line arguments, the console's standard
' input and output streams, and a WScript.Shell object.
Set Arg = WScript.Arguments
Set Inp = WScript.StdIn
Set Outp = WScript.StdOut
Set WshShell = CreateObject("Wscript.Shell")
' HttpGet
' Fetches the URL passed as the second command-line argument (Arg(1)) with a
' synchronous GET request and writes the response text to standard output.
' Errors are not fatal: every statement runs under On Error Resume Next, and if
' Err is set afterwards a diagnostic report is appended to standard output.
Sub HttpGet
    ' User-Agent header sent with the request.
    Const BROWSER_AGENT = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618; .NET4.0C; .NET4.0E; BCD2000; BCD2000)"

    On Error Resume Next

    ' Author's note: Microsoft.XMLHTTP produced Access Denied errors after the same
    ' page had been fetched repeatedly, and MSXML2 was the suggested alternative.
    ' The alternative is kept below but disabled; Microsoft.XMLHTTP is what runs.
    ' Set File = WScript.CreateObject("MSXML2.ServerXMLHTTP.4.0")
    Set File = WScript.CreateObject("Microsoft.XMLHTTP")

    ' Third argument False = synchronous: Send returns once the response has arrived.
    File.Open "GET", Arg(1), False
    File.setRequestHeader "User-Agent", BROWSER_AGENT
    File.Send

    ' The body is written exactly as received; no line-ending conversion is done here.
    txt = File.ResponseText
    Outp.Write txt

    ' Err still holds the most recent failure from the statements above, if any.
    If Err.Number <> 0 Then
        With Outp
            .WriteLine ""
            .WriteLine "Error getting file"
            .WriteLine "=================="
            .WriteLine ""
            .WriteLine "Error " & Err.Number & "(0x" & Hex(Err.Number) & ") " & Err.Description
            .WriteLine "Source " & Err.Source
            .WriteLine ""
            .WriteLine "HTTP Error " & File.Status & " " & File.StatusText
            .WriteLine File.getAllResponseHeaders
            .WriteLine LCase(Arg(1))
        End With
    End If
End Sub
'=============================================
' RemoveHTMLTags
' Reads all of standard input as HTML, loads it into the body of a hidden
' Internet Explorer document, and writes the rendered text (innerText) to
' standard output, i.e. the input with its HTML tags stripped.
'
' Depends on names defined outside this procedure: Inp and Outp (standard
' input/output streams) and FilterPath (folder containing Filter.html, the
' placeholder page that gives IE a document to work in).
'
' Changes from the earlier version:
'   * Input is read before IE is started, and an empty input stream is treated
'     as empty text instead of letting ReadAll raise on it.
'   * The wait loop first polls the browser object itself (Busy / ReadyState)
'     so ie.document is not dereferenced before navigation has completed.
'   * The hidden IE instance is closed with Quit when done; previously every
'     call left an invisible browser process running.
Sub RemoveHTMLTags
    Dim html
    html = ""
    ' ReadAll raises "Input past end of file" when the stream is already at its end.
    If Not Inp.AtEndOfStream Then html = Inp.ReadAll

    Set ie = CreateObject("InternetExplorer.Application")
    ie.Visible = 0
    ie.Silent = 1
    ie.Navigate2 "file://" & FilterPath & "Filter.html"

    ' 4 = READYSTATE_COMPLETE on the browser object.
    Do
        WScript.Sleep 50
    Loop While ie.Busy Or ie.ReadyState <> 4

    ' Same document-level condition the original loop waited for.
    Do Until ie.document.readyState = "complete"
        WScript.Sleep 50
    Loop

    ie.document.body.innerhtml = html
    Outp.write ie.document.body.innertext

    ' Close the hidden browser so it does not linger after the script ends.
    ie.Quit
End Sub
过滤器用于命令提示符。Filter.vbs 必须使用 cscript.exe 运行。如果您只键入 filter,它将运行一个自动执行此操作的批处理文件。
filter subcommand [parameters]
过滤器仅读取和写入标准输入和标准输出。这些仅在命令提示符下可用。
filter <inputfile >outputfile
filter <inputfile | other_command
other_command | filter >outputfile
other_command | filter | other_command
使用
网络
filter web webaddress
filter ip webaddress
从网络检索文件并将其写入标准输出。
webaddress - a web address fully specified including http://
例子
获取微软主页
filter web http://www.microsoft.com
标签
filter tags
从文本中删除 HTML 标签。
例子
filter web http://www.microsoft.com | filter tags
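集合(Collection)需要用 For Each 逐项读取:For Each thing In collection : statements : Next。例如:For Each h In ie.document.getElementsByTagName("h1") : f.WriteLine h.innerText : Next(注意标签名不带尖括号)。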
我是新手,正在参照网上的示例编写 VBScript 代码。我正在尝试从网页中获取 <h1> 和 <h2> 的文本并将其保存到文本文件中。下面是网页数据的示例,后面是我的一些失败代码。我在 PC 上运行的是 Windows 7 家庭高级版。
'这是我正在访问的网页 ===========
<body>
<div class stuff
<div id stuff
<div class="header-info">
<h1>The Girl I Love</h1>
<h2>Tony Bennet</h2>
<more div stuff
'这是我的代码==============================
'=== attach to an already running IE instance:
' Shell.Application's Windows() collection is enumerated and the first window whose
' FullName (host executable path) contains "iexplore" (case-insensitive) is kept in ie.
Set app = CreateObject("Shell.Application")
For Each window In app.Windows()
If InStr(1, window.FullName, "iexplore", vbTextCompare) > 0 Then
Set ie = window
Exit For
End If
Next
' NOTE(review): if no matching window is found, ie is never assigned and every
' ie.document line below will raise an error.
'Set up text file to write to
' OpenTextFile arguments: 2 = ForWriting, True = create the file if it is missing,
' -1 = open as Unicode.
Set fso = CreateObject("Scripting.FileSystemObject")
Set f = fso.OpenTextFile("c:\users\kp\desktop\output.txt", 2, True, -1)
'Various Code line tests - and results
' The trailing comment on each line records what that attempt produced for the author.
f.Write ie.document.body 'returns [object HTMLBodyElement]
f.Write ie.document.body.innerText 'returns all body text
' The next two calls return a collection object, so f.Write prints the object's
' default string form rather than any element text.
f.Write ie.document.getElementsByClassName("header-info") 'returns [object HTMLCollection]
' NOTE(review): getElementsByTagName takes the bare tag name ("h1"), not "<h1>" - confirm.
f.Write ie.document.getElementsByTagName("<h1>") 'returns [object HTMLCollection]
' innerText is a property of a single element, not of the collection; an individual
' item has to be selected (by index or with For Each) before reading it.
f.Write ie.document.getElementsByTagName("<h1>").innerText 'FAILS not valid
' NOTE(review): f is never closed in the code shown here.
kpmsg = "you're done"
Wscript.echo kpmsg
完整程序在这里:https://onedrive.live.com/redir?resid=E2F0CE17A268A4FA%21348
' Script-wide handles: the command-line arguments, the console's standard
' input and output streams, and a WScript.Shell object.
Set Arg = WScript.Arguments
Set Inp = WScript.StdIn
Set Outp = WScript.StdOut
Set WshShell = CreateObject("Wscript.Shell")
' HttpGet
' Fetches the URL passed as the second command-line argument (Arg(1)) with a
' synchronous GET request and writes the response text to standard output.
' Errors are not fatal: every statement runs under On Error Resume Next, and if
' Err is set afterwards a diagnostic report is appended to standard output.
Sub HttpGet
    ' User-Agent header sent with the request.
    Const BROWSER_AGENT = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618; .NET4.0C; .NET4.0E; BCD2000; BCD2000)"

    On Error Resume Next

    ' Author's note: Microsoft.XMLHTTP produced Access Denied errors after the same
    ' page had been fetched repeatedly, and MSXML2 was the suggested alternative.
    ' The alternative is kept below but disabled; Microsoft.XMLHTTP is what runs.
    ' Set File = WScript.CreateObject("MSXML2.ServerXMLHTTP.4.0")
    Set File = WScript.CreateObject("Microsoft.XMLHTTP")

    ' Third argument False = synchronous: Send returns once the response has arrived.
    File.Open "GET", Arg(1), False
    File.setRequestHeader "User-Agent", BROWSER_AGENT
    File.Send

    ' The body is written exactly as received; no line-ending conversion is done here.
    txt = File.ResponseText
    Outp.Write txt

    ' Err still holds the most recent failure from the statements above, if any.
    If Err.Number <> 0 Then
        With Outp
            .WriteLine ""
            .WriteLine "Error getting file"
            .WriteLine "=================="
            .WriteLine ""
            .WriteLine "Error " & Err.Number & "(0x" & Hex(Err.Number) & ") " & Err.Description
            .WriteLine "Source " & Err.Source
            .WriteLine ""
            .WriteLine "HTTP Error " & File.Status & " " & File.StatusText
            .WriteLine File.getAllResponseHeaders
            .WriteLine LCase(Arg(1))
        End With
    End If
End Sub
'=============================================
' RemoveHTMLTags
' Reads all of standard input as HTML, loads it into the body of a hidden
' Internet Explorer document, and writes the rendered text (innerText) to
' standard output, i.e. the input with its HTML tags stripped.
'
' Depends on names defined outside this procedure: Inp and Outp (standard
' input/output streams) and FilterPath (folder containing Filter.html, the
' placeholder page that gives IE a document to work in).
'
' Changes from the earlier version:
'   * Input is read before IE is started, and an empty input stream is treated
'     as empty text instead of letting ReadAll raise on it.
'   * The wait loop first polls the browser object itself (Busy / ReadyState)
'     so ie.document is not dereferenced before navigation has completed.
'   * The hidden IE instance is closed with Quit when done; previously every
'     call left an invisible browser process running.
Sub RemoveHTMLTags
    Dim html
    html = ""
    ' ReadAll raises "Input past end of file" when the stream is already at its end.
    If Not Inp.AtEndOfStream Then html = Inp.ReadAll

    Set ie = CreateObject("InternetExplorer.Application")
    ie.Visible = 0
    ie.Silent = 1
    ie.Navigate2 "file://" & FilterPath & "Filter.html"

    ' 4 = READYSTATE_COMPLETE on the browser object.
    Do
        WScript.Sleep 50
    Loop While ie.Busy Or ie.ReadyState <> 4

    ' Same document-level condition the original loop waited for.
    Do Until ie.document.readyState = "complete"
        WScript.Sleep 50
    Loop

    ie.document.body.innerhtml = html
    Outp.write ie.document.body.innertext

    ' Close the hidden browser so it does not linger after the script ends.
    ie.Quit
End Sub
过滤器用于命令提示符。Filter.vbs 必须使用 cscript.exe 运行。如果您只键入 filter,它将运行一个自动执行此操作的批处理文件。
filter subcommand [parameters]
过滤器仅读取和写入标准输入和标准输出。这些仅在命令提示符下可用。
filter <inputfile >outputfile
filter <inputfile | other_command
other_command | filter >outputfile
other_command | filter | other_command
使用
网络
filter web webaddress
filter ip webaddress
从网络检索文件并将其写入标准输出。
webaddress - a web address fully specified including http://
例子
获取微软主页
filter web http://www.microsoft.com
标签
filter tags
从文本中删除 HTML 标签。
例子
filter web http://www.microsoft.com | filter tags
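集合(Collection)需要用 For Each 逐项读取:For Each thing In collection : statements : Next。例如:For Each h In ie.document.getElementsByTagName("h1") : f.WriteLine h.innerText : Next(注意标签名不带尖括号)。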