Beautiful Soup 从多个页面下载 PDF
Beautiful Soup to Download PDFs from Multiple Pages
Andrej 好心帮我写了这段代码,但现在我想知道如何导航到每个页面,并下载所有名称(text/title)中带有 "Public Comments" 的 PDF?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Archive index page listing the individual agenda/minutes detail pages.
url = "https://www.ci.atherton.ca.us/Archive.aspx?AMID=41"
# Substring that identifies a detail-page link among all anchors.
key = "Archive.aspx?ADID="

# timeout= keeps the script from hanging forever on a stalled connection;
# raise_for_status() fails loudly instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# href=True skips anchors with no href at all, so no .get() default is needed.
for link in soup.find_all("a", href=True):
    if key in link["href"]:
        # urljoin resolves relative AND absolute hrefs correctly, unlike
        # naive string concatenation (which assumes a specific slash layout).
        print(urljoin(url, link["href"]))
打印:
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3581
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3570
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3564
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3559
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3556
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3554
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3552
尝试:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Archive index page listing the individual agenda/minutes detail pages.
url = "https://www.ci.atherton.ca.us/Archive.aspx?AMID=41"
# Substring that identifies a detail-page link among all anchors.
key = "Archive.aspx?ADID="

# One Session reuses the TCP connection across the many requests below.
session = requests.Session()

# timeout= prevents indefinite hangs; raise_for_status() stops us from
# parsing (or saving) an HTML error page as if it were real content.
response = session.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Collect every archive detail-page URL from the index page.
all_links = []
for link in soup.find_all("a", href=True):
    if key in link["href"]:
        # urljoin handles relative and absolute hrefs alike.
        all_links.append(urljoin(url, link["href"]))

for page_link in all_links:
    print("Checking {}...".format(page_link))
    page = session.get(page_link, timeout=30)
    page.raise_for_status()
    page_soup = BeautifulSoup(page.content, "html.parser")

    # Match anchors whose visible text contains "public comment"
    # (case-insensitive), i.e. the PDFs the user asked for.
    for a in page_soup.find_all(
        lambda tag: tag.name == "a" and "public comment" in tag.text.lower()
    ):
        # urljoin works whether the href is site-relative or absolute,
        # instead of assuming it starts with exactly one "/".
        pdf_link = urljoin(page_link, a["href"])
        filename = a["href"].rstrip("/").split("/")[-1] + ".pdf"
        print("Downloading {} to {}".format(pdf_link, filename))
        pdf = session.get(pdf_link, timeout=60)
        pdf.raise_for_status()  # don't write an error page to disk as a .pdf
        with open(filename, "wb") as f_out:
            f_out.write(pdf.content)
打印:
...
Checking https://www.ci.atherton.ca.us/Archive.aspx?ADID=3514...
Checking https://www.ci.atherton.ca.us/Archive.aspx?ADID=3505...
Downloading https://www.ci.atherton.ca.us/DocumentCenter/View/8628/Public-Comments-1202021---ITEM-No-15 to Public-Comments-1202021---ITEM-No-15.pdf
Checking https://www.ci.atherton.ca.us/Archive.aspx?ADID=3498...
Checking https://www.ci.atherton.ca.us/Archive.aspx?ADID=3479...
Downloading https://www.ci.atherton.ca.us/DocumentCenter/View/8516/Wayne-Lee---Public-Comments_12162020 to Wayne-Lee---Public-Comments_12162020.pdf
Downloading https://www.ci.atherton.ca.us/DocumentCenter/View/8532/Discher-Stephanie_Public-Comments_12162020 to Discher-Stephanie_Public-Comments_12162020.pdf
...
并将 URL 中的 PDF 保存到文件中。
Andrej 好心帮我写了这段代码,但现在我想知道如何导航到每个页面,并下载所有名称(text/title)中带有 "Public Comments" 的 PDF?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Archive index page listing the individual agenda/minutes detail pages.
url = "https://www.ci.atherton.ca.us/Archive.aspx?AMID=41"
# Substring that identifies a detail-page link among all anchors.
key = "Archive.aspx?ADID="

# timeout= keeps the script from hanging forever on a stalled connection;
# raise_for_status() fails loudly instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# href=True skips anchors with no href at all, so no .get() default is needed.
for link in soup.find_all("a", href=True):
    if key in link["href"]:
        # urljoin resolves relative AND absolute hrefs correctly, unlike
        # naive string concatenation (which assumes a specific slash layout).
        print(urljoin(url, link["href"]))
打印:
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3581
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3570
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3564
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3559
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3556
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3554
https://www.ci.atherton.ca.us/Archive.aspx?ADID=3552
尝试:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Archive index page listing the individual agenda/minutes detail pages.
url = "https://www.ci.atherton.ca.us/Archive.aspx?AMID=41"
# Substring that identifies a detail-page link among all anchors.
key = "Archive.aspx?ADID="

# One Session reuses the TCP connection across the many requests below.
session = requests.Session()

# timeout= prevents indefinite hangs; raise_for_status() stops us from
# parsing (or saving) an HTML error page as if it were real content.
response = session.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Collect every archive detail-page URL from the index page.
all_links = []
for link in soup.find_all("a", href=True):
    if key in link["href"]:
        # urljoin handles relative and absolute hrefs alike.
        all_links.append(urljoin(url, link["href"]))

for page_link in all_links:
    print("Checking {}...".format(page_link))
    page = session.get(page_link, timeout=30)
    page.raise_for_status()
    page_soup = BeautifulSoup(page.content, "html.parser")

    # Match anchors whose visible text contains "public comment"
    # (case-insensitive), i.e. the PDFs the user asked for.
    for a in page_soup.find_all(
        lambda tag: tag.name == "a" and "public comment" in tag.text.lower()
    ):
        # urljoin works whether the href is site-relative or absolute,
        # instead of assuming it starts with exactly one "/".
        pdf_link = urljoin(page_link, a["href"])
        filename = a["href"].rstrip("/").split("/")[-1] + ".pdf"
        print("Downloading {} to {}".format(pdf_link, filename))
        pdf = session.get(pdf_link, timeout=60)
        pdf.raise_for_status()  # don't write an error page to disk as a .pdf
        with open(filename, "wb") as f_out:
            f_out.write(pdf.content)
打印:
...
Checking https://www.ci.atherton.ca.us/Archive.aspx?ADID=3514...
Checking https://www.ci.atherton.ca.us/Archive.aspx?ADID=3505...
Downloading https://www.ci.atherton.ca.us/DocumentCenter/View/8628/Public-Comments-1202021---ITEM-No-15 to Public-Comments-1202021---ITEM-No-15.pdf
Checking https://www.ci.atherton.ca.us/Archive.aspx?ADID=3498...
Checking https://www.ci.atherton.ca.us/Archive.aspx?ADID=3479...
Downloading https://www.ci.atherton.ca.us/DocumentCenter/View/8516/Wayne-Lee---Public-Comments_12162020 to Wayne-Lee---Public-Comments_12162020.pdf
Downloading https://www.ci.atherton.ca.us/DocumentCenter/View/8532/Discher-Stephanie_Public-Comments_12162020 to Discher-Stephanie_Public-Comments_12162020.pdf
...
并将 URL 中的 PDF 保存到文件中。