为什么无法访问这个公开网站?
Why is it not possible to access a public website?
这个网址无法通过直接复制粘贴 URL 的方式打开,只会显示错误。我尝试通过 MSXML2.XMLHTTP 访问,也没有成功,结果相同!
这是我的代码:
' The asker's original attempt, reproduced as posted.
' Sends a POST with an EMPTY request body to URL, loads the response text into
' an HTMLDocument, and writes the inner text of every <tr> element to column B
' of the active sheet, starting at row 2.
' Early-bound HTMLDocument: needs a project reference to the
' "Microsoft HTML Object Library".
Sub GetDataWebsite()
    Const URL = "http://Zvg-port.de/index.php"
    Dim HTML As New HTMLDocument
    Dim elmt As Object
    Dim x As Long

    With CreateObject("MSXML2.XMLHTTP")
        .Open "POST", URL, False            ' False = synchronous request
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .send                               ' no form data is posted here
        HTML.body.innerHTML = .responseText
    End With

    Set elmt = HTML.querySelectorAll("tr") 'or any class or tag or name
    For x = 0 To elmt.Length - 1
        ActiveSheet.Cells(x + 2, 2) = elmt.Item(x).innerText
    Next x
End Sub
没有任何反应!可能是什么问题呢?谢谢!
对于您的第一个问题,您需要在初始请求的 body 中添加额外的参数。
奇怪的是,如果想使用 querySelectorAll(),就必须一直直接在 html 对象上操作,而不能把任何结果赋值给派生变量。
对于你的第二个问题,结果页的 URL 必须是从搜索页面跳转过去才能访问。一些测试表明请求需要带上 Referer header。我知道无论是否使用 session,只要带上 Referer header,请求就能成功,因为我用 Python 测试过;但我还没有找出 VBA 中缺少的是什么;我目前的尝试返回的是编码异常、而且看起来被截断的内容。
目前在我看来,如果坚持使用 VBA,要确保能够跟随这些链接,最简单的方法是自动化浏览器:收集结果和结果链接,然后逐个导航到每个链接。
当前代码(回答您的第一个问题):
Option Explicit
' Posts the search form to zvg-portal.de and writes one row per result to the
' active sheet, starting at row 2:
'   column B - text of each "table a nobr" element
'   column C - absolute link built from the matching "td a" href
'   column D - text of the first <b> in the first <tr> containing "Amtsgericht"
' The absolute links are also collected in gatheredLinks for a follow-up pass
' (see the commented-out experiment at the end).
' Needs a project reference to the "Microsoft HTML Object Library".
Public Sub GetDataZvgPort()
    Const URL = "https://www.zvg-portal.de/index.php?button=Suchen"
    Dim html As MSHTML.HTMLDocument, xhr As Object
    Set html = New MSHTML.HTMLDocument
    Set xhr = CreateObject("MSXML2.ServerXMLHTTP.6.0")
    Dim headers As Variant

    With xhr
        .Open "POST", URL, False            ' False = synchronous request
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        ' The form fields travel in the request body.
        .send "land_abk=sh&ger_name=Norderstedt&order_by=2&ger_id=X1526"
        headers = .getAllResponseHeaders    ' kept for inspection while debugging
        html.body.innerHTML = .responseText
    End With

    ' NOTE: every query below is issued against html directly. The author of
    ' this snippet reports that storing querySelectorAll results in a derived
    ' variable does not work here, so only the counts (plain Longs) are cached.
    Dim resultCount As Long, linkCount As Long
    resultCount = html.querySelectorAll("table a nobr").Length
    linkCount = html.querySelectorAll("td a").Length

    ' Both lists are indexed with the same x, so never run past the shorter one.
    If linkCount < resultCount Then resultCount = linkCount

    ' No matches: ReDim with an upper bound of -1 would raise a run-time error.
    If resultCount = 0 Then Exit Sub

    ' The "Amtsgericht" row does not depend on the result index, so look it up
    ' once instead of rescanning every <tr> for every result.
    Dim j As Long, courtName As String, courtFound As Boolean
    For j = 0 To html.querySelectorAll("tr").Length - 1
        If InStr(html.querySelectorAll("tr").Item(j).innerHTML, "Amtsgericht") > 0 Then
            courtName = html.querySelectorAll("tr").Item(j).getElementsByTagName("b")(0).innerText
            courtFound = True
            Exit For
        End If
    Next

    Dim x As Long, link As String, gatheredLinks()
    ReDim gatheredLinks(resultCount - 1)
    For x = 0 To resultCount - 1
        ActiveSheet.Cells(x + 2, 2) = html.querySelectorAll("table a nobr").Item(x).innerText
        ' Relative hrefs come back prefixed with "about:" because the document
        ' was built from a string; swap that prefix for the site root.
        link = Replace$(html.querySelectorAll("td a").Item(x).href, "about:", "https://www.zvg-portal.de/")
        ActiveSheet.Cells(x + 2, 3) = link
        If courtFound Then ActiveSheet.Cells(x + 2, 4) = courtName
        gatheredLinks(x) = link
    Next x

    ' Unfinished experiment: fetch every gathered link with a Referer header.
    ' Left disabled because the responses came back oddly encoded/truncated.
    '    With xhr
    '        For x = LBound(gatheredLinks) To UBound(gatheredLinks)
    '            .Open "GET", gatheredLinks(x), False
    '            .setRequestHeader "Referer", "https://www.zvg-portal.de/index.php?button=Suchen"
    '            .setRequestHeader "Content-Type", "text/html; charset=ISO-8859-1"
    '            .setRequestHeader "User-Agent", "python-requests/2.24.0"
    '            .setRequestHeader "Accept-Encoding", "gzip, deflate"
    '            .setRequestHeader "Connection", "keep-alive"
    '            .setRequestHeader "Accept", "text/html,application/xhtml+xml,application/xml;"
    '            .send
    '            ActiveSheet.Cells(x + 2, 5) = .Status
    '            html.body.innerHTML = .responseText 'test response
    '            Dim s As String
    '            s = .responseText
    '            ActiveSheet.Cells(x + 2, 6) = s
    '            Stop
    '
    '            'do something else
    '        Next
    '    End With

    Stop    ' debug breakpoint: inspect gatheredLinks / headers before leaving
End Sub
Python(使用 session)可以成功从结果链接中获取内容:
import requests
from bs4 import BeautifulSoup as bs

# Form fields posted to the search page.
data = {'ger_name': 'Norderstedt','order_by': '2','land_abk': 'sh','ger_id': 'X1526'}
# The result pages are only requested with the search page as Referer.
headers = {'Referer': 'https://www.zvg-portal.de/index.php?button=Suchen'}

with requests.Session() as s:
    # Run the search and collect the absolute links of all results.
    r = s.post('https://www.zvg-portal.de/index.php?button=Suchen', data=data)
    print(r.status_code)
    soup = bs(r.content, 'lxml')
    links = ['https://www.zvg-portal.de/' + i['href'] for i in soup.select('td a')]

    # Add the Referer on top of the session's default headers instead of
    # replacing them, so User-Agent, Accept, etc. are still sent.
    s.headers.update(headers)

    for link in links:
        r = s.get(link)
        # print(r.status_code)
        soup = bs(r.content, 'lxml')
        print(soup.select_one('td p').text)
Session 并不是必需的,它只是为了提高效率。
不使用 session 时请求仍然有效,发送的 header 是:
{'User-Agent': 'python-requests/2.24.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Referer': 'https://www.zvg-portal.de/index.php?button=Suchen'}
这个网址无法通过直接复制粘贴 URL 的方式打开,只会显示错误。我尝试通过 MSXML2.XMLHTTP 访问,也没有成功,结果相同!
这是我的代码:
' The asker's original attempt, reproduced as posted.
' Sends a POST with an EMPTY request body to URL, loads the response text into
' an HTMLDocument, and writes the inner text of every <tr> element to column B
' of the active sheet, starting at row 2.
' Early-bound HTMLDocument: needs a project reference to the
' "Microsoft HTML Object Library".
Sub GetDataWebsite()
    Const URL = "http://Zvg-port.de/index.php"
    Dim HTML As New HTMLDocument
    Dim elmt As Object
    Dim x As Long

    With CreateObject("MSXML2.XMLHTTP")
        .Open "POST", URL, False            ' False = synchronous request
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .send                               ' no form data is posted here
        HTML.body.innerHTML = .responseText
    End With

    Set elmt = HTML.querySelectorAll("tr") 'or any class or tag or name
    For x = 0 To elmt.Length - 1
        ActiveSheet.Cells(x + 2, 2) = elmt.Item(x).innerText
    Next x
End Sub
没有任何反应!可能是什么问题呢?谢谢!
对于您的第一个问题,您需要在初始请求的 body 中添加额外的参数。
奇怪的是,如果想使用 querySelectorAll(),就必须一直直接在 html 对象上操作,而不能把任何结果赋值给派生变量。
对于你的第二个问题,结果页的 URL 必须是从搜索页面跳转过去才能访问。一些测试表明请求需要带上 Referer header。我知道无论是否使用 session,只要带上 Referer header,请求就能成功,因为我用 Python 测试过;但我还没有找出 VBA 中缺少的是什么;我目前的尝试返回的是编码异常、而且看起来被截断的内容。
目前在我看来,如果坚持使用 VBA,要确保能够跟随这些链接,最简单的方法是自动化浏览器:收集结果和结果链接,然后逐个导航到每个链接。
当前代码(回答您的第一个问题):
Option Explicit
' Posts the search form to zvg-portal.de and writes one row per result to the
' active sheet, starting at row 2:
'   column B - text of each "table a nobr" element
'   column C - absolute link built from the matching "td a" href
'   column D - text of the first <b> in the first <tr> containing "Amtsgericht"
' The absolute links are also collected in gatheredLinks for a follow-up pass
' (see the commented-out experiment at the end).
' Needs a project reference to the "Microsoft HTML Object Library".
Public Sub GetDataZvgPort()
    Const URL = "https://www.zvg-portal.de/index.php?button=Suchen"
    Dim html As MSHTML.HTMLDocument, xhr As Object
    Set html = New MSHTML.HTMLDocument
    Set xhr = CreateObject("MSXML2.ServerXMLHTTP.6.0")
    Dim headers As Variant

    With xhr
        .Open "POST", URL, False            ' False = synchronous request
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        ' The form fields travel in the request body.
        .send "land_abk=sh&ger_name=Norderstedt&order_by=2&ger_id=X1526"
        headers = .getAllResponseHeaders    ' kept for inspection while debugging
        html.body.innerHTML = .responseText
    End With

    ' NOTE: every query below is issued against html directly. The author of
    ' this snippet reports that storing querySelectorAll results in a derived
    ' variable does not work here, so only the counts (plain Longs) are cached.
    Dim resultCount As Long, linkCount As Long
    resultCount = html.querySelectorAll("table a nobr").Length
    linkCount = html.querySelectorAll("td a").Length

    ' Both lists are indexed with the same x, so never run past the shorter one.
    If linkCount < resultCount Then resultCount = linkCount

    ' No matches: ReDim with an upper bound of -1 would raise a run-time error.
    If resultCount = 0 Then Exit Sub

    ' The "Amtsgericht" row does not depend on the result index, so look it up
    ' once instead of rescanning every <tr> for every result.
    Dim j As Long, courtName As String, courtFound As Boolean
    For j = 0 To html.querySelectorAll("tr").Length - 1
        If InStr(html.querySelectorAll("tr").Item(j).innerHTML, "Amtsgericht") > 0 Then
            courtName = html.querySelectorAll("tr").Item(j).getElementsByTagName("b")(0).innerText
            courtFound = True
            Exit For
        End If
    Next

    Dim x As Long, link As String, gatheredLinks()
    ReDim gatheredLinks(resultCount - 1)
    For x = 0 To resultCount - 1
        ActiveSheet.Cells(x + 2, 2) = html.querySelectorAll("table a nobr").Item(x).innerText
        ' Relative hrefs come back prefixed with "about:" because the document
        ' was built from a string; swap that prefix for the site root.
        link = Replace$(html.querySelectorAll("td a").Item(x).href, "about:", "https://www.zvg-portal.de/")
        ActiveSheet.Cells(x + 2, 3) = link
        If courtFound Then ActiveSheet.Cells(x + 2, 4) = courtName
        gatheredLinks(x) = link
    Next x

    ' Unfinished experiment: fetch every gathered link with a Referer header.
    ' Left disabled because the responses came back oddly encoded/truncated.
    '    With xhr
    '        For x = LBound(gatheredLinks) To UBound(gatheredLinks)
    '            .Open "GET", gatheredLinks(x), False
    '            .setRequestHeader "Referer", "https://www.zvg-portal.de/index.php?button=Suchen"
    '            .setRequestHeader "Content-Type", "text/html; charset=ISO-8859-1"
    '            .setRequestHeader "User-Agent", "python-requests/2.24.0"
    '            .setRequestHeader "Accept-Encoding", "gzip, deflate"
    '            .setRequestHeader "Connection", "keep-alive"
    '            .setRequestHeader "Accept", "text/html,application/xhtml+xml,application/xml;"
    '            .send
    '            ActiveSheet.Cells(x + 2, 5) = .Status
    '            html.body.innerHTML = .responseText 'test response
    '            Dim s As String
    '            s = .responseText
    '            ActiveSheet.Cells(x + 2, 6) = s
    '            Stop
    '
    '            'do something else
    '        Next
    '    End With

    Stop    ' debug breakpoint: inspect gatheredLinks / headers before leaving
End Sub
Python(使用 session)可以成功从结果链接中获取内容:
import requests
from bs4 import BeautifulSoup as bs

# Form fields posted to the search page.
data = {'ger_name': 'Norderstedt','order_by': '2','land_abk': 'sh','ger_id': 'X1526'}
# The result pages are only requested with the search page as Referer.
headers = {'Referer': 'https://www.zvg-portal.de/index.php?button=Suchen'}

with requests.Session() as s:
    # Run the search and collect the absolute links of all results.
    r = s.post('https://www.zvg-portal.de/index.php?button=Suchen', data=data)
    print(r.status_code)
    soup = bs(r.content, 'lxml')
    links = ['https://www.zvg-portal.de/' + i['href'] for i in soup.select('td a')]

    # Add the Referer on top of the session's default headers instead of
    # replacing them, so User-Agent, Accept, etc. are still sent.
    s.headers.update(headers)

    for link in links:
        r = s.get(link)
        # print(r.status_code)
        soup = bs(r.content, 'lxml')
        print(soup.select_one('td p').text)
Session 并不是必需的,它只是为了提高效率。
不使用 session 时请求仍然有效,发送的 header 是:
{'User-Agent': 'python-requests/2.24.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Referer': 'https://www.zvg-portal.de/index.php?button=Suchen'}