在 Python 中进行 Google 搜索时出错:503 服务不可用
Error with Google search in Python: 503 Service Unavailable
当我尝试在 Python 控制台中执行以下代码时:
from google import search

# Look for Facebook login pages; `stop=20` is forwarded to `search` as-is.
query = "site:facebook.com inurl:login"
urls = search(query, stop=20)

# Print every URL produced by the search, one per line.
for url in urls:
    print(url)
(目的是搜索登录页面)我得到了如下错误:
urllib.error.HTTPError: HTTP Error 503: Service Unavailable
但是,如果我在 Google 中手动搜索同样的内容,却可以正常工作。Google 是否可能屏蔽了我的查询?
Google 确实会试图阻止 "unexpected"(意外的)查询通过。在普通浏览器的界面中,它会显示一个验证码。它会综合考虑流量模式(使用 "smart" 查询进行过快的搜索、垃圾邮件发送者使用的已知 IP 段)以及客户端的行为。
您可以通过捕获错误来检查错误的详细信息。
import urllib.error  # needed so that `urllib.error.HTTPError` resolves below

try:
    # NOTE: `search` (from the `google` package) yields its results lazily,
    # so the HTTP requests are only issued while iterating. Consume the
    # results inside the `try` so that the HTTPError can actually be caught.
    urls = list(search("site:facebook.com inurl:login", stop=20))
except urllib.error.HTTPError as httperr:
    print(httperr.headers)  # Dump the headers to see if there's more information
    print(httperr.read())  # You can even read this error object just like a normal response file
As Cong Ma said in his answer, doing many automated searches on Google will result in Google blocking you, and you'll get error 503. The only Google API for doing searches that currently works is the Google Custom Search API. The problem with it is that it was designed to search through your own pages. There is an option to set it to search all pages (see this answer), but even then you can only make 100 searches per day. Previously there was the option to use other APIs, such as Bing and Yahoo, but neither of them is free anymore. The only free API that does internet searches is the FAROO API. But there is still one more option: doing the Google search with Selenium WebDriver. Selenium is used to imitate browser usage, and it can use the Firefox, Chrome, Edge or Safari webdrivers (it actually opens Chrome and does your search), but this is annoying because you don't actually want to see the browser. There is a solution for this: you can use PhantomJS. Download it from here. 解压后,请看下面的示例了解如何使用(我写了一个简单的类供你使用,你只需要把路径改成你的 PhantomJS 所在路径):
import time
from urllib.parse import quote_plus
from selenium import webdriver
class Browser:
    """Small wrapper around Selenium's PhantomJS driver for Google searches.

    The instance builds a Google search URL, loads it in the driver, waits a
    fixed amount of time and then scrapes the result links from the page.

    NOTE(review): `webdriver.PhantomJS` and `find_elements_by_xpath` are
    legacy Selenium APIs -- confirm the Selenium version this runs against.
    """

    def __init__(self, path, initiate=True, implicit_wait_time=10, explicit_wait_time=2):
        """Store the configuration and optionally start the driver.

        Args:
            path: Path to the PhantomJS executable.
            initiate: When true, `start()` is called immediately.
            implicit_wait_time: Value passed to the driver's `implicitly_wait`.
            explicit_wait_time: Default number of seconds to sleep after a
                page has been requested.
        """
        # Implicit vs explicit waits:
        # http://www.aptuz.com/blog/selenium-implicit-vs-explicit-waits/
        self.path = path
        self.implicit_wait_time = implicit_wait_time
        self.explicit_wait_time = explicit_wait_time
        # Always defined, so `end()` is safe even if `start()` never ran.
        self.driver = None
        if initiate:
            self.start()

    def start(self):
        """Create the PhantomJS driver and apply the implicit wait."""
        # Use the path given to this instance rather than a module-level
        # global that merely happens to share the name `path`.
        self.driver = webdriver.PhantomJS(self.path)
        self.driver.implicitly_wait(self.implicit_wait_time)

    def end(self):
        """Quit the driver if one was started; otherwise do nothing."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def go_to_url(self, url, wait_time=None):
        """Load `url` in the driver, log it, then sleep for `wait_time` seconds.

        Args:
            url: Address to load.
            wait_time: Seconds to sleep afterwards; defaults to
                `self.explicit_wait_time` when None.
        """
        if wait_time is None:
            wait_time = self.explicit_wait_time
        self.driver.get(url)
        print('[*] Fetching results from: {}'.format(url))
        time.sleep(wait_time)

    def get_search_url(self, query, page_num=0, per_page=10, lang='en'):
        """Return the Google search URL for `query`.

        Args:
            query: Raw search string; it is URL-encoded with `quote_plus`.
            page_num: Zero-based page index; the `start` offset is
                `page_num * per_page`.
            per_page: Number of results requested per page (`num`).
            lang: Interface language code, sent as the `hl` parameter.
        """
        query = quote_plus(query)
        # `hl` is Google's interface-language parameter (the previous `nl`
        # is not a documented parameter).
        return 'https://www.google.hr/search?q={}&num={}&start={}&hl={}'.format(
            query, per_page, page_num * per_page, lang)

    def scrape(self):
        """Return a list of `{'url': ..., 'title': ...}` dicts from the current page."""
        # The XPath might change in the future: it matches every link that
        # sits inside an <h3> tag with class "r".
        links = self.driver.find_elements_by_xpath("//h3[@class='r']/a[@href]")
        return [{'url': link.get_attribute('href'), 'title': link.text}
                for link in links]

    def search(self, query, page_num=0, per_page=10, lang='en', wait_time=None):
        """Run a Google search and return the scraped results.

        Args:
            query: Raw search string.
            page_num: Zero-based page index.
            per_page: Number of results requested per page.
            lang: Interface language code.
            wait_time: Seconds to sleep after loading the page; defaults to
                `self.explicit_wait_time` when None.

        Returns:
            The list produced by `scrape()` for the loaded results page.
        """
        if wait_time is None:
            wait_time = self.explicit_wait_time
        url = self.get_search_url(query, page_num, per_page, lang)
        self.go_to_url(url, wait_time)
        return self.scrape()
path = '<YOUR PATH TO PHANTOMJS>/phantomjs-2.1.1-windows/bin/phantomjs.exe'  # SET YOUR PATH TO phantomjs
br = Browser(path)
try:
    results = br.search('site:facebook.com inurl:login')
    for r in results:
        print(r)
finally:
    # Always call end() so the driver is quit even if the search raises.
    br.end()
当我尝试在 Python 控制台中执行以下代码时:
from google import search

# Look for Facebook login pages; `stop=20` is forwarded to `search` as-is.
query = "site:facebook.com inurl:login"
urls = search(query, stop=20)

# Print every URL produced by the search, one per line.
for url in urls:
    print(url)
(目的是搜索登录页面)我得到了如下错误:
urllib.error.HTTPError: HTTP Error 503: Service Unavailable
但是,如果我在 Google 中手动搜索同样的内容,却可以正常工作。Google 是否可能屏蔽了我的查询?
Google 确实会试图阻止 "unexpected"(意外的)查询通过。在普通浏览器的界面中,它会显示一个验证码。它会综合考虑流量模式(使用 "smart" 查询进行过快的搜索、垃圾邮件发送者使用的已知 IP 段)以及客户端的行为。
您可以通过捕获错误来检查错误的详细信息。
import urllib.error  # needed so that `urllib.error.HTTPError` resolves below

try:
    # NOTE: `search` (from the `google` package) yields its results lazily,
    # so the HTTP requests are only issued while iterating. Consume the
    # results inside the `try` so that the HTTPError can actually be caught.
    urls = list(search("site:facebook.com inurl:login", stop=20))
except urllib.error.HTTPError as httperr:
    print(httperr.headers)  # Dump the headers to see if there's more information
    print(httperr.read())  # You can even read this error object just like a normal response file
As Cong Ma said in his answer, doing many automated searches on Google will result in Google blocking you, and you'll get error 503. The only Google API for doing searches that currently works is the Google Custom Search API. The problem with it is that it was designed to search through your own pages. There is an option to set it to search all pages (see this answer), but even then you can only make 100 searches per day. Previously there was the option to use other APIs, such as Bing and Yahoo, but neither of them is free anymore. The only free API that does internet searches is the FAROO API. But there is still one more option: doing the Google search with Selenium WebDriver. Selenium is used to imitate browser usage, and it can use the Firefox, Chrome, Edge or Safari webdrivers (it actually opens Chrome and does your search), but this is annoying because you don't actually want to see the browser. There is a solution for this: you can use PhantomJS. Download it from here. 解压后,请看下面的示例了解如何使用(我写了一个简单的类供你使用,你只需要把路径改成你的 PhantomJS 所在路径):
import time
from urllib.parse import quote_plus
from selenium import webdriver
class Browser:
    """Small wrapper around Selenium's PhantomJS driver for Google searches.

    The instance builds a Google search URL, loads it in the driver, waits a
    fixed amount of time and then scrapes the result links from the page.

    NOTE(review): `webdriver.PhantomJS` and `find_elements_by_xpath` are
    legacy Selenium APIs -- confirm the Selenium version this runs against.
    """

    def __init__(self, path, initiate=True, implicit_wait_time=10, explicit_wait_time=2):
        """Store the configuration and optionally start the driver.

        Args:
            path: Path to the PhantomJS executable.
            initiate: When true, `start()` is called immediately.
            implicit_wait_time: Value passed to the driver's `implicitly_wait`.
            explicit_wait_time: Default number of seconds to sleep after a
                page has been requested.
        """
        # Implicit vs explicit waits:
        # http://www.aptuz.com/blog/selenium-implicit-vs-explicit-waits/
        self.path = path
        self.implicit_wait_time = implicit_wait_time
        self.explicit_wait_time = explicit_wait_time
        # Always defined, so `end()` is safe even if `start()` never ran.
        self.driver = None
        if initiate:
            self.start()

    def start(self):
        """Create the PhantomJS driver and apply the implicit wait."""
        # Use the path given to this instance rather than a module-level
        # global that merely happens to share the name `path`.
        self.driver = webdriver.PhantomJS(self.path)
        self.driver.implicitly_wait(self.implicit_wait_time)

    def end(self):
        """Quit the driver if one was started; otherwise do nothing."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def go_to_url(self, url, wait_time=None):
        """Load `url` in the driver, log it, then sleep for `wait_time` seconds.

        Args:
            url: Address to load.
            wait_time: Seconds to sleep afterwards; defaults to
                `self.explicit_wait_time` when None.
        """
        if wait_time is None:
            wait_time = self.explicit_wait_time
        self.driver.get(url)
        print('[*] Fetching results from: {}'.format(url))
        time.sleep(wait_time)

    def get_search_url(self, query, page_num=0, per_page=10, lang='en'):
        """Return the Google search URL for `query`.

        Args:
            query: Raw search string; it is URL-encoded with `quote_plus`.
            page_num: Zero-based page index; the `start` offset is
                `page_num * per_page`.
            per_page: Number of results requested per page (`num`).
            lang: Interface language code, sent as the `hl` parameter.
        """
        query = quote_plus(query)
        # `hl` is Google's interface-language parameter (the previous `nl`
        # is not a documented parameter).
        return 'https://www.google.hr/search?q={}&num={}&start={}&hl={}'.format(
            query, per_page, page_num * per_page, lang)

    def scrape(self):
        """Return a list of `{'url': ..., 'title': ...}` dicts from the current page."""
        # The XPath might change in the future: it matches every link that
        # sits inside an <h3> tag with class "r".
        links = self.driver.find_elements_by_xpath("//h3[@class='r']/a[@href]")
        return [{'url': link.get_attribute('href'), 'title': link.text}
                for link in links]

    def search(self, query, page_num=0, per_page=10, lang='en', wait_time=None):
        """Run a Google search and return the scraped results.

        Args:
            query: Raw search string.
            page_num: Zero-based page index.
            per_page: Number of results requested per page.
            lang: Interface language code.
            wait_time: Seconds to sleep after loading the page; defaults to
                `self.explicit_wait_time` when None.

        Returns:
            The list produced by `scrape()` for the loaded results page.
        """
        if wait_time is None:
            wait_time = self.explicit_wait_time
        url = self.get_search_url(query, page_num, per_page, lang)
        self.go_to_url(url, wait_time)
        return self.scrape()
path = '<YOUR PATH TO PHANTOMJS>/phantomjs-2.1.1-windows/bin/phantomjs.exe'  # SET YOUR PATH TO phantomjs
br = Browser(path)
try:
    results = br.search('site:facebook.com inurl:login')
    for r in results:
        print(r)
finally:
    # Always call end() so the driver is quit even if the search raises.
    br.end()