How to rename files with text from a certain HTML element when using urllib.request and BeautifulSoup to download files
I have an algorithm that uses urllib.request and BeautifulSoup (Python 3.6) to download PDF articles:
import requests as r
from bs4 import BeautifulSoup as soup
import os
import urllib.request

# make a list of all web pages' urls
webpages = []
for i in range(9):
    root_url = 'xxx.com/articles/page' + str(i)
    webpages.append(root_url)

# make a list of PDF links
pdf_links = []
for item in webpages:
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = r.get(item, headers=headers)
    page_soup = soup(data.text, 'html.parser')
    links = [a.attrs['href'] for a in page_soup.find_all('a', href=True)]
    for link in links:
        link_string = str(link)
        if link_string.endswith('pdf'):
            pdf_links.append(link_string)

# download the files
for pdf_link in pdf_links:
    save_to = os.path.basename(pdf_link.strip())
    urllib.request.urlretrieve(pdf_link.strip(), save_to)
I need to rename each downloaded PDF with its article title, which is stored in a specific div class:
<div class="article article title">
<h2>The Disaster of Deforestation</h2>
</div>
There is a larger div that holds both the article title and the corresponding PDF link:
<div article-id="1741" class="online article_row_view">
    <div class="article article title">
        <h2>The Disaster of Deforestation</h2>
    </div>
    <span class="file-pdf"> <a href="xsdf.pdf" title="BowenA.pdf">PDF</a></span>
</div>
I have no idea how to rename the files automatically, let alone with text from a specific HTML element. Any help would be greatly appreciated!
Here is a complete solution that walks through all the pages in the navigation and downloads all the PDFs for you:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urljoin

BASE_URL = 'https://cross-currents.berkeley.edu/archives'

def make_soup(url: str) -> BeautifulSoup:
    # fetch the page and parse it; a browser-like user-agent avoids simple bot blocking
    res = requests.get(url, headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'})
    res.raise_for_status()
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    return soup

def extract_articles(soup: BeautifulSoup) -> list:
    # each search result holds one article's metadata and its PDF link
    articles = []
    for result in soup.select('.node--view-mode-search-result'):
        author = result.select_one('.field--name-field-citation-authors').text.strip()
        date = result.select_one('.field--name-field-issue-date').text.strip()
        title = result.select_one('.field-name-node-title').text.strip()
        journal = result.find('em', recursive=False).text.strip()
        pdf_url = result.select_one('a[href*=".pdf"]')['href']
        articles.append({
            'author': author,
            'date': date,
            'title': title,
            'journal': journal,
            'pdf_url': pdf_url,
        })
    return articles

def make_safe_filename(text: str) -> str:
    """convert forbidden chars to underscores"""
    return ''.join(c if (c.isalnum() or c.isspace()) else '_' for c in text).strip('_ ')

def get_next_page_url(soup: BeautifulSoup) -> str:
    try:
        path = soup.select_one('.pager a[rel="next"]')['href']
        return urljoin(BASE_URL, path)
    except (TypeError, KeyError):
        return None

def download_file(url: str, filename: str) -> str:
    # stream the response to disk in chunks so large PDFs are not held in memory
    with requests.get(url, stream=True) as res, open(filename, 'wb') as f:
        res.raise_for_status()
        for chunk in res.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return filename

def scrape_archive():
    save_dir = Path(r'd:\downloads')
    save_dir.mkdir(exist_ok=True, parents=True)
    url = 'https://cross-currents.berkeley.edu/archives?author=&title=&type=onlinearticle&issue=All&region=All&page=0'
    while True:
        soup = make_soup(url)
        articles = extract_articles(soup)
        for a in articles:
            pdf_url = a['pdf_url']
            filename = make_safe_filename(a['title'])
            save_path = str(save_dir / (filename + '.pdf'))
            print('Downloading:', a['title'])
            download_file(pdf_url, save_path)
        print('Finished')
        # go to next page if exists
        next_url = get_next_page_url(soup)
        if not next_url:
            break
        url = next_url
        print('Moving to next page', url)

scrape_archive()
Here I only use the title to generate the PDF filename, but you can mix and combine journal, date, author, etc. to build more informative filenames.
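For example, one possible combination (a sketch reusing the fields that extract_articles already collects; the exact format is up to you) could be:

# sketch: build a richer filename from several extracted fields
filename = make_safe_filename(f"{a['author']} - {a['date']} - {a['title']}")
save_path = str(save_dir / (filename + '.pdf'))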
Also remember to change save_dir to your liking.
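Applied to the markup in your question, the same title-to-PDF pairing might look like this (a sketch that assumes the exact structure you posted and absolute PDF hrefs; adjust the selectors, or add urljoin, if your real pages differ):

def extract_articles_from_rows(page_soup):
    """Pair each article title with its PDF link, per the question's markup."""
    articles = []
    for row in page_soup.select('div.article_row_view'):
        title = row.select_one('div.article h2').text.strip()
        pdf_url = row.select_one('span.file-pdf a[href$=".pdf"]')['href']
        articles.append({'title': title, 'pdf_url': pdf_url})
    return articles

Each title can then go through make_safe_filename and be used as the download target, e.g. urllib.request.urlretrieve(pdf_url, make_safe_filename(title) + '.pdf').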