Trying to crawl the Google front-page address for any place or restaurant, but no luck
I'm trying to scrape a restaurant's address from the Google front-page knowledge panel, but I get an "urllib.error.HTTPError: HTTP Error 403: Forbidden" error and the program doesn't run. I'm fairly new to Python web scraping; please help.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import re
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Google search URL for the restaurant.
url = "https://www.google.com/search?q=barbeque%20nation%20-%20noida"
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)   # fails here with HTTP Error 403: Forbidden
page = response.read()
soup = BeautifulSoup(page, 'html.parser')
the_page = soup.prettify("utf-8")

hotel_json = {}
# The knowledge panel data is embedded as JSON-LD in a <script> tag.
for line in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = json.loads(line.text.strip())
    hotel_json["name"] = details["name"]
    hotel_json["address"] = details["address"]["streetAddress"]
    break

with open(hotel_json["name"] + ".html", "wb") as file:
    file.write(the_page)
with open(hotel_json["name"] + ".json", 'w') as outfile:
    json.dump(hotel_json, outfile, indent=4)
Add a User-Agent header. Google rejects requests that arrive with urllib's default Python-urllib user agent, which is what triggers the 403:
request = urllib.request.Request(url, headers = {'User-Agent' : 'Mozilla/5.0'})
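For completeness, here is a minimal sketch of the whole flow with the header applied. It assumes the result page still embeds an application/ld+json block containing name and address fields, which Google can change or omit at any time, so treat the parsing part as illustrative rather than guaranteed:

import json
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.google.com/search?q=barbeque%20nation%20-%20noida"
# Send a browser-like User-Agent so the request is not rejected with 403.
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
soup = BeautifulSoup(response.read(), 'html.parser')

hotel_json = {}
# Look for JSON-LD structured data; the fields below are assumptions about
# what the knowledge panel exposes and may not always be present.
for script in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = json.loads(script.text.strip())
    address = details.get("address")
    if isinstance(address, dict):
        hotel_json["name"] = details.get("name")
        hotel_json["address"] = address.get("streetAddress")
        break

print(hotel_json)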