Scraping Google Scholar with urllib2 instead of requests
I have the simple script below, which works fine for getting a list of articles from a Google Scholar search for terms of interest.
import urllib
import urllib2
import requests
from bs4 import BeautifulSoup

SEARCH_SCHOLAR_HOST = "https://scholar.google.com"
SEARCH_SCHOLAR_URL = "/scholar"

def searchScholar(searchStr, limit=10):
    """Search Google Scholar for articles and publications containing terms of interest"""
    url = SEARCH_SCHOLAR_HOST + SEARCH_SCHOLAR_URL + "?q=" + urllib.quote_plus(searchStr) + "&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search"
    content = requests.get(url, verify=False).text
    page = BeautifulSoup(content, 'lxml')
    results = {}
    count = 0
    for entry in page.find_all("h3", attrs={"class": "gs_rt"}):
        if count < limit:
            try:
                text = entry.a.text.encode("ascii", "ignore")
                url = entry.a['href']
                results[url] = text
                count += 1
            except:
                pass
    return results

queryStr = "Albert einstein"
pubs = searchScholar(queryStr, 10)
if len(pubs) == 0:
    print "No articles found"
else:
    for pub in pubs.keys():
        print pub + ' ' + pubs[pub]
However, I want to run this script as a CGI application on a remote server without console access, so I cannot install any external Python modules. (I managed to 'install' BeautifulSoup without resorting to pip or easy_install by just copying the bs4 directory into my cgi-bin directory, but this trick does not work for requests because of its large number of dependencies.)
So, my question is: is it possible to use the built-in urllib2 or httplib Python modules instead of requests to fetch the Google Scholar page and then pass it to BeautifulSoup? It should be, because I found some code here that scrapes Google Scholar using only the standard library plus BeautifulSoup, but it is rather convoluted. I would prefer a much simpler solution that just adapts my script to use the standard library instead of requests.
Can anyone help me?
This code should be enough to perform a simple request with urllib2:
def get(url):
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/2.0 (compatible; MSIE 5.5; Windows NT)')
    return urllib2.urlopen(req).read()
If you need to do something more advanced later, it will take more code. What requests does is simplify the usage of the standard library.
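To adapt the script from the question, only the fetching line has to change; the parsing loop stays the same. A minimal sketch (Python 2, matching the question's code, and assuming the get() helper above):

import urllib
import urllib2
from bs4 import BeautifulSoup

SEARCH_SCHOLAR_HOST = "https://scholar.google.com"
SEARCH_SCHOLAR_URL = "/scholar"

def get(url):
    # Same helper as above: plain urllib2 with a User-Agent header
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/2.0 (compatible; MSIE 5.5; Windows NT)')
    return urllib2.urlopen(req).read()

def searchScholar(searchStr, limit=10):
    """As in the question, but fetched with urllib2 instead of requests."""
    url = SEARCH_SCHOLAR_HOST + SEARCH_SCHOLAR_URL + "?q=" + urllib.quote_plus(searchStr) + "&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search"
    content = get(url)  # was: requests.get(url, verify=False).text
    page = BeautifulSoup(content, 'lxml')
    results = {}
    count = 0
    for entry in page.find_all("h3", attrs={"class": "gs_rt"}):
        if count < limit:
            try:
                results[entry.a['href']] = entry.a.text.encode("ascii", "ignore")
                count += 1
            except:
                pass
    return results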
Pass a user-agent into the request headers (check what your user-agent is) along with the request parameters:
import urllib.parse
import urllib.request

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
params_values = {
    'gl': 'us',
    'hl': 'en'
}

# Encode the parameters into the query string so the request stays a GET
# (passing them as the data argument would turn it into a POST).
params = urllib.parse.urlencode(params_values)
req = urllib.request.Request('https://scholar.google.com?' + params, headers=headers)

with urllib.request.urlopen(req) as response:
    html = response.read()
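From here the HTML can be handed to BeautifulSoup just as in the question. A short sketch, assuming bs4 is available as the asker describes ('html.parser' is used here to avoid the lxml dependency):

from bs4 import BeautifulSoup

# Decode the raw bytes and parse the result titles and links
soup = BeautifulSoup(html.decode('utf-8', errors='ignore'), 'html.parser')
for entry in soup.select('h3.gs_rt a'):
    print(entry.get_text(), entry.get('href'))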
However, passing a user-agent does not guarantee that your requests won't be blocked.
To get around CAPTCHAs, or to avoid figuring out which proxies to use, you can try the Google Scholar API from SerpApi. It is a paid API with 100 free searches for testing.
That way you don't need to figure out how to bypass blocks from Google or other search engines, or maintain a parser over time.
Code and example in the online IDE that scrapes all publications from all available pages:
import os, json
from serpapi import GoogleScholarSearch
from urllib.parse import urlsplit, parse_qsl

def serpapi_scrape_all_publications(query: str):
    params = {
        "api_key": os.getenv("API_KEY"),  # your SerpApi API key
        "engine": "google_scholar",       # search engine
        "hl": "en",                       # language
        "q": query,                       # search query
        "num": "100"                      # articles per page
    }

    # Data extraction happens on the SerpApi backend.
    search = GoogleScholarSearch(params)

    publications = []
    publications_is_present = True

    while publications_is_present:
        results = search.get_dict()  # JSON -> Python dictionary

        for publication in results.get("organic_results", []):
            publications.append({
                "title": publication.get("title"),
                "link": publication.get("link"),
                "result_id": publication.get("result_id"),
                "snippet": publication.get("snippet"),
                "inline_links": publication.get("inline_links"),
                "publication_info": publication.get("publication_info")
            })

        # check for the next page and update the search if one is present
        if "next" in results.get("serpapi_pagination", {}):
            # split the next-page URL into a dict and point "search" at the new page
            search.params_dict.update(dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query)))
        else:
            publications_is_present = False

    print(json.dumps(publications, indent=2, ensure_ascii=False))

serpapi_scrape_all_publications(query="biology")
Output:
[
  {
    "title": "Fungal decomposition of wood: its biology and ecology",
    "link": null,
    "result_id": "LiWKgtH72owJ",
    "snippet": "",
    "inline_links": {
      "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=LiWKgtH72owJ",
      "cited_by": {
        "total": 1446,
        "link": "https://scholar.google.com/scholar?cites=10149701587489662254&as_sdt=400005&sciodt=0,14&hl=en&num=20",
        "cites_id": "10149701587489662254",
        "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=400005&cites=10149701587489662254&engine=google_scholar&hl=en&num=20"
      },
      "related_pages_link": "https://scholar.google.com/scholar?q=related:LiWKgtH72owJ:scholar.google.com/&scioq=biology&hl=en&num=20&as_sdt=0,14",
      "versions": {
        "total": 6,
        "link": "https://scholar.google.com/scholar?cluster=10149701587489662254&hl=en&num=20&as_sdt=0,14",
        "cluster_id": "10149701587489662254",
        "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C14&cluster=10149701587489662254&engine=google_scholar&hl=en&num=20"
      }
    },
    "publication_info": {
      "summary": "ADM Rayner, L Boddy - 1988"
    }
  }, ... other results
]
Disclaimer: I work for SerpApi.