Problems in retrieving Google Scholar results with BeautifulSoup
I'm continuing the analysis from my previous question. I have information on a specific set of working papers in a DataFrame with four columns: publication year, publication order (the order of publication within each year, useless in this case), title, and author. I want to use this DataFrame to scrape Google Scholar and retrieve the citation count for each paper.
Since some of the titles are fairly generic, in some cases the first Google Scholar result is not actually the paper I'm interested in. So, to make the search more targeted, I include both the title and the author of each paper when building the search link. I wrote the code following the approach in another thread.
Note: since real names are needed to perform this scraping, I'd rather not create a sample DataFrame. I've uploaded the .csv file to my GitHub.
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from random import randint
from time import sleep

url = 'https://raw.githubusercontent.com/nicolacaravaggio/working_paper_roma3/master/rm3_working_paper_list.csv'
df = pd.read_csv(url, error_bad_lines = False)

papers = []
for index, rows in df.iterrows():
    list_paper = rows.title + ' ' + rows.author
    papers.append(list_paper)

title_list_gs = []
citations_list_gs = []

with requests.Session() as s:
    for paper in papers:
        sleep(randint(1,3))
        url = 'https://scholar.google.com/scholar?q=' + paper + '&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'
        r = s.get(url)
        soup = bs(r.content, 'html.parser')
        title_gs = soup.select_one('h3.gs_rt a').text if soup.select_one('h3.gs_rt a') is not None else 'No title'
        title_list_gs.append(title_gs)
        citations_gs = soup.select_one('a:contains("Cited by")').text if soup.select_one('a:contains("Cited by")') is not None else 'No citation count'
        citations_list_gs.append(citations_gs)
        print('Title:', title_gs, '; Citations:', citations_gs)
However, all this script gives me is a list of:
Title: No title ; Citations: No citation count
I'm not sure whether the problem lies in my clumsy script (probably) or in Google blocking me for pulling too much from Scholar. In fact, even the script I used as a starting point in this thread does not always return the expected results. I hope someone can give me some advice. Thanks in advance.
It sounds like you are triggering Scholar's bot detection. From personal experience scraping Google Scholar, 45 seconds between requests is enough to avoid CAPTCHA and bot detection. I have had a scraper running for more than 3 days without being detected. If you do get flagged, waiting about 2 hours is enough to start again. Here is an extract from my code.
import logging
import re
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

class ScholarScrape():
    def __init__(self):
        self.page = None
        self.last_url = None
        self.last_time = time.time()
        # ConfigFile is a configuration helper from elsewhere in this codebase (not shown)
        self.min_time_between_scrape = int(ConfigFile.instance().config.get('scholar', 'bot_avoidance_time'))
        self.header = {'User-Agent': ConfigFile.instance().config.get('scholar', 'user_agent')}
        self.session = requests.Session()

    def search(self, query=None, year_lo=None, year_hi=None, title_only=False, publication_string=None, author_string=None, include_citations=True, include_patents=True):
        url = self.get_url(query, year_lo, year_hi, title_only, publication_string, author_string, include_citations, include_patents)
        while True:
            # Throttle: wait until bot_avoidance_time seconds have passed since the last request
            wait_time = self.min_time_between_scrape - (time.time() - self.last_time)
            if wait_time > 0:
                logger.info("Delaying search by {} seconds to avoid bot detection.".format(wait_time))
                time.sleep(wait_time)
            self.last_time = time.time()
            logger.info("SCHOLARSCRAPE: " + url)
            self.page = BeautifulSoup(self.session.get(url, headers=self.header).text, 'html.parser')
            self.last_url = url
            if "Our systems have detected unusual traffic from your computer network" in str(self.page):
                raise BotDetectionException("Google has blocked this computer for a short time because it has detected this scraping script.")
            return

    def get_url(self, query=None, year_lo=None, year_hi=None, title_only=False, publication_string=None, author_string=None, include_citations=True, include_patents=True):
        base_url = "https://scholar.google.com.au/scholar?"
        url = base_url + "as_q=" + urllib.parse.quote(query)
        if year_lo is not None and bool(re.match(r'.*([1-3][0-9]{3})', str(year_lo))):
            url += "&as_ylo=" + str(year_lo)
        if year_hi is not None and bool(re.match(r'.*([1-3][0-9]{3})', str(year_hi))):
            url += "&as_yhi=" + str(year_hi)
        if title_only:
            url += "&as_occt=title"  # restrict the search to article titles
        else:
            url += "&as_occt=any"
        if publication_string is not None:
            url += "&as_publication=" + urllib.parse.quote('"' + str(publication_string) + '"')
        if author_string is not None:
            url += "&as_sauthors=" + urllib.parse.quote('"' + str(author_string) + '"')
        if include_citations:
            url += "&as_vis=0"
        else:
            url += "&as_vis=1"
        if include_patents:
            url += "&as_sdt=0"
        else:
            url += "&as_sdt=1"
        return url

    def get_results_count(self):
        e = self.page.findAll("div", {"class": "gs_ab_mdw"})
        try:
            item = e[1].text.strip()
        except IndexError as ex:
            if "Our systems have detected unusual traffic from your computer network" in str(self.page):
                raise BotDetectionException("Google has blocked this computer for a short time because it has detected this scraping script.")
            else:
                raise ex
        if self.has_numbers(item):
            return self.get_results_count_from_soup_string(item)
        for item in e:
            item = item.text.strip()
            if self.has_numbers(item):
                return self.get_results_count_from_soup_string(item)
        return 0

    @staticmethod
    def get_results_count_from_soup_string(element):
        # The results bar reads either "About 1,234 results" or "1,234 results"
        if "About" in element:
            num = element.split(" ")[1].strip().replace(",", "")
        else:
            num = element.split(" ")[0].strip().replace(",", "")
        return num

    @staticmethod
    def has_numbers(input_string):
        return any(char.isdigit() for char in input_string)

class BotDetectionException(Exception):
    pass

if __name__ == "__main__":
    s = ScholarScrape()
    s.search(**{
        "query": "\"policy shaping\"",
        # "publication_string": "JMLR",
        "author_string": "gilboa",
        "year_lo": "1995",
        "year_hi": "2005",
    })
    x = s.get_results_count()
    print(x)
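Applied to the script in the question, the same throttling idea looks roughly like this. It is a minimal sketch, not my exact setup: the 45-second spacing comes from the advice above, while the User-Agent string, the urllib.parse.quote_plus encoding of the query, and the reuse of the question's papers list are my assumptions.

import urllib.parse
from time import sleep, time

import requests
from bs4 import BeautifulSoup as bs

# Assumed values: any common browser User-Agent string will do
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
MIN_DELAY = 45  # seconds between requests, per the advice above

last_request = 0.0
with requests.Session() as s:
    for paper in papers:  # papers built as in the question
        wait = MIN_DELAY - (time() - last_request)
        if wait > 0:
            sleep(wait)
        last_request = time()
        # quote_plus() encodes spaces and special characters in the query
        url = 'https://scholar.google.com/scholar?q=' + urllib.parse.quote_plus(paper) + '&hl=en'
        r = s.get(url, headers=HEADERS)
        if 'unusual traffic from your computer network' in r.text:
            print('Bot detection triggered; wait ~2 hours before retrying.')
            break
        soup = bs(r.text, 'html.parser')
        title_tag = soup.select_one('h3.gs_rt a')
        print('Title:', title_tag.text if title_tag is not None else 'No title')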
Alternatively, you can use a third-party solution such as SerpApi. It is a paid API with a free trial. We handle proxies, solve CAPTCHAs, and parse all the rich structured data for you.
Example Python code (also available in other libraries):
from serpapi import GoogleSearch

params = {
    "api_key": "secret_api_key",
    "engine": "google_scholar",
    "q": "Francesco Crespi",
    "hl": "en"
}

search = GoogleSearch(params)
results = search.get_dict()
Example JSON output:
{
  "position": 1,
  "title": "Demand and innovation in productivity growth",
  "result_id": "6vfKIRtQWRQJ",
  "link": "https://www.tandfonline.com/doi/abs/10.1080/02692170802407429",
  "snippet": "The labour productivity impact of demand and innovation is investigated in this paper combining insights from the Kaldorian and Schumpeterian traditions. After a review of studies in such traditions, a general model is proposed for explaining productivity growth in …",
  "publication_info": {
    "summary": "F Crespi, M Pianta - International Review of Applied Economics, 2008 - Taylor & Francis",
    "authors": [
      {
        "name": "F Crespi",
        "link": "https://scholar.google.com/citations?user=gKdC5-0AAAAJ&hl=en&oi=sra",
        "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=gKdC5-0AAAAJ&engine=google_scholar_author&hl=en",
        "author_id": "gKdC5-0AAAAJ"
      },
      {
        "name": "M Pianta",
        "link": "https://scholar.google.com/citations?user=knrxY9EAAAAJ&hl=en&oi=sra",
        "serpapi_scholar_link": "https://serpapi.com/search.json?author_id=knrxY9EAAAAJ&engine=google_scholar_author&hl=en",
        "author_id": "knrxY9EAAAAJ"
      }
    ]
  },
  "inline_links": {
    "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=6vfKIRtQWRQJ",
    "cited_by": {
      "total": 88,
      "link": "https://scholar.google.com/scholar?cites=1466291231147096042&as_sdt=2005&sciodt=0,5&hl=en",
      "cites_id": "1466291231147096042",
      "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=2005&cites=1466291231147096042&engine=google_scholar&hl=en"
    },
    "related_pages_link": "https://scholar.google.com/scholar?q=related:6vfKIRtQWRQJ:scholar.google.com/&scioq=Francesco+Crespi&hl=en&as_sdt=0,5",
    "versions": {
      "total": 7,
      "link": "https://scholar.google.com/scholar?cluster=1466291231147096042&hl=en&as_sdt=0,5",
      "cluster_id": "1466291231147096042",
      "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C5&cluster=1466291231147096042&engine=google_scholar&hl=en"
    }
  }
},
...
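For the citation counts the question is after, the "cited_by" total can be read straight from this structure. A minimal sketch, assuming the results sit under an "organic_results" key of the dictionary returned by get_dict():

# results is the dictionary returned by search.get_dict() above
for result in results.get("organic_results", []):
    cited_by = result.get("inline_links", {}).get("cited_by", {})
    print("Title:", result.get("title", "No title"),
          "; Citations:", cited_by.get("total", "No citation count"))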
Check out the documentation for more details.
Disclaimer: I work for SerpApi.