Scraping a large number of Google Scholar pages by URL
I am trying to use BeautifulSoup to get the full author list of every publication by a particular author on Google Scholar. Since the author's profile page only shows a truncated author list for each paper, I have to open each paper's link to get the full list. As a result, I run into a CAPTCHA every few attempts.
Is there a way to avoid the CAPTCHA (e.g. pausing for 3 seconds after every request)? Or a way to make the original Google Scholar profile page display the full author lists?
I ran into a similar problem recently. I made my collection process at least somewhat easier by implementing random and fairly long sleeps, like this:
import time
import numpy as np
time.sleep((30-5)*np.random.random()+5)  # random pause from 5 to 30 seconds
If you have enough time (say, you launch your parser overnight), you can make the pauses even longer (3+ times longer) to be fairly sure you won't get a CAPTCHA.
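Applied to the original task, the sleep goes between successive page fetches. A minimal sketch, assuming you have already collected the individual paper links from the profile page (the URL below is a placeholder):

import time

import numpy as np
import requests
from bs4 import BeautifulSoup

# placeholder list; fill it with the paper links scraped from the author's profile page
paper_links = [
    "https://scholar.google.com/citations?view_op=view_citation&citation_for_view=...",
]

for url in paper_links:
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, "html.parser")
    # ... extract the full author list from `soup` here ...
    time.sleep((30 - 5) * np.random.random() + 5)  # random 5-30 second pause between requests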
Additionally, you can randomly vary the user-agents in your requests to the site, which masks you even more.
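For instance, a minimal sketch with the requests library, picking a random user-agent per request (the user-agent strings below are only illustrative; in practice keep a larger, up-to-date list or use a library such as fake-useragent):

import random
import requests

# illustrative pool of user-agent strings; extend or refresh it as needed
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
]

headers = {"User-Agent": random.choice(USER_AGENTS)}  # new random UA for each request
response = requests.get("https://scholar.google.com/scholar?q=biology", headers=headers, timeout=30)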
The most effective approach is to use a CAPTCHA-solving service together with residential proxies.
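As a rough sketch of the proxy half, requests can route traffic through a proxy via its proxies argument; the endpoint and credentials below are placeholders for whatever your provider issues:

import requests

# placeholder endpoint/credentials from a hypothetical residential proxy provider
proxies = {
    "http": "http://username:password@proxy.example.com:8080",
    "https": "http://username:password@proxy.example.com:8080",
}

response = requests.get("https://scholar.google.com/scholar?q=biology", proxies=proxies, timeout=30)
print(response.status_code)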
If you don't want to figure out how to solve CAPTCHAs or which proxies to use, you can try the Google Scholar API from SerpApi, a paid API with a free plan that bypasses blocks on its backend.
Code and example in the online IDE that scrapes publications from all available pages and can save the results to CSV:
import pandas as pd
import os, json
from serpapi import GoogleScholarSearch
from urllib.parse import urlsplit, parse_qsl

def serpapi_scrape_all_publications(query: str):
    params = {
        "api_key": os.getenv("API_KEY"),  # your SerpApi API key
        "engine": "google_scholar",       # search engine
        "hl": "en",                       # language
        "q": query,                       # search query
        "num": "100"                      # articles per page
    }

    # data extraction happens on the SerpApi backend
    search = GoogleScholarSearch(params)

    publications = []

    publications_is_present = True
    while publications_is_present:
        results = search.get_dict()  # JSON -> Python dictionary

        for publication in results.get("organic_results", []):
            publications.append({
                "title": publication.get("title"),
                "link": publication.get("link"),
                "result_id": publication.get("result_id"),
                "snippet": publication.get("snippet"),
                "inline_links": publication.get("inline_links"),
                "publication_info": publication.get("publication_info")
            })

        # check for the next page and update the request parameters if present
        if "next" in results.get("serpapi_pagination", {}):
            # split the next-page URL into parts as a dict() and point "search" at the new page
            search.params_dict.update(dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query)))
        else:
            publications_is_present = False

    print(json.dumps(publications, indent=2, ensure_ascii=False))

    # save the extracted publications to CSV (uses the pandas import above)
    pd.DataFrame(data=publications).to_csv("google_scholar_publications.csv", encoding="utf-8", index=False)

serpapi_scrape_all_publications(query="biology")
Output:
[
{
"title": "Fungal decomposition of wood: its biology and ecology",
"link": null,
"result_id": "LiWKgtH72owJ",
"snippet": "",
"inline_links": {
"serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=LiWKgtH72owJ",
"cited_by": {
"total": 1446,
"link": "https://scholar.google.com/scholar?cites=10149701587489662254&as_sdt=400005&sciodt=0,14&hl=en&num=20",
"cites_id": "10149701587489662254",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=400005&cites=10149701587489662254&engine=google_scholar&hl=en&num=20"
},
"related_pages_link": "https://scholar.google.com/scholar?q=related:LiWKgtH72owJ:scholar.google.com/&scioq=biology&hl=en&num=20&as_sdt=0,14",
"versions": {
"total": 6,
"link": "https://scholar.google.com/scholar?cluster=10149701587489662254&hl=en&num=20&as_sdt=0,14",
"cluster_id": "10149701587489662254",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C14&cluster=10149701587489662254&engine=google_scholar&hl=en&num=20"
}
},
"publication_info": {
"summary": "ADM Rayner, L Boddy - 1988"
}
}, ... other results
]
Disclaimer: I work for SerpApi.