通过 python 中的 Beautifulsoup 抓取并下载修改后名称的 Pdf 文件
Scrape and Download Pdf files with modified names through Beautifulsoup in python
我想从 https://www.archives.gov/research/pentagon-papers
下载 PDF 文件
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://www.archives.gov/research/pentagon-papers"

# Create the destination folder if it does not exist yet.
folder_location = r'E:\webscraping'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Download every linked PDF, naming each file after the last URL segment
# (unique on this page).
for link in soup.select("a[href$='.pdf']"):
    pdf_url = urljoin(url, link['href'])
    target = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(target, 'wb') as out_file:
        out_file.write(requests.get(pdf_url).content)
不过,我希望文件的名称不像文件名,而是它们的描述。例如,我希望 table 中的第三个文件被命名为 [Part II] U.S. Involvement in the Franco-Viet Minh War, 1950-1954.pdf
而不是 Pentagon-Papers-Part-II.pdf
在 for
循环的 link
元素中,它存储为 contents
但我不知道如何提取它。
如您所愿,使用 <a>
标签中的文本作为名称怎么样?
方法如下:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "https://www.archives.gov/research/pentagon-papers"

# Create the destination folder if it does not already exist.
folder_location = r'E:\webscraping'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

soup = BeautifulSoup(requests.get(url).text, "html.parser")

# Download each linked PDF, naming it after the link's visible text
# (the human-readable description) instead of the URL's filename.
for link in soup.select("a[href$='.pdf']"):
    # Build a filesystem-friendly name from the anchor text: trim trailing
    # whitespace and drop characters that are awkward in filenames.
    filename = os.path.join(
        folder_location,
        (
            link.getText()
            .rstrip()
            .replace(" ", "_")
            .replace(",", "")
            .replace(".", "")
        ),
    )
    # BUG FIX: the original opened a literal "(unknown).pdf" for every link,
    # repeatedly overwriting a single file; write to the computed name instead.
    with open(f"{filename}.pdf", 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
这应该按照描述生成文件:
E:\webscraping/Index
E:\webscraping/[Part_I]_Vietnam_and_the_US_1940-1950
E:\webscraping/[Part_II]_US_Involvement_in_the_Franco-Viet_Minh_War_1950-1954
E:\webscraping/[Part_III]_The_Geneva_Accords
E:\webscraping/[Part_IV_A_1]_Evolution_of_the_War_NATO_and_SEATO:_A_Comparison
E:\webscraping/[Part_IV_A_2]_Evolution_of_the_War_Aid_for_France_in_Indochina_1950-54
E:\webscraping/[Part_IV_A_3]_Evolution_of_the_War_US_and_France's_Withdrawal_from_Vietnam_1954-56
E:\webscraping/[Part_IV_A_4]_Evolution_of_the_War_US_Training_of_Vietnamese_National_Army_1954-59
E:\webscraping/[Part_IV_A_5]_Evolution_of_the_War_Origins_of_the_Insurgency
E:\webscraping/[Part_IV_B_1]_Evolution_of_the_War_Counterinsurgency:_The_Kennedy_Commitments_and_Programs_1961
and more ...
我想从 https://www.archives.gov/research/pentagon-papers
下载 PDF 文件

import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://www.archives.gov/research/pentagon-papers"

# Make sure the output directory exists before writing into it.
folder_location = r'E:\webscraping'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")

# Save every PDF link on the page; the final path segment is unique here,
# so it serves as the local file name.
for anchor in soup.select("a[href$='.pdf']"):
    href = anchor['href']
    destination = os.path.join(folder_location, href.split('/')[-1])
    with open(destination, 'wb') as handle:
        handle.write(requests.get(urljoin(url, href)).content)
不过,我希望文件的名称不像文件名,而是它们的描述。例如,我希望 table 中的第三个文件被命名为 [Part II] U.S. Involvement in the Franco-Viet Minh War, 1950-1954.pdf
而不是 Pentagon-Papers-Part-II.pdf
在 for
循环的 link
元素中,它存储为 contents
但我不知道如何提取它。
如您所愿,使用 <a>
标签中的文本作为名称怎么样?
方法如下:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "https://www.archives.gov/research/pentagon-papers"

# Create the destination folder if it does not already exist.
folder_location = r'E:\webscraping'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

soup = BeautifulSoup(requests.get(url).text, "html.parser")

# Download each linked PDF, naming it after the link's visible text
# (the human-readable description) instead of the URL's filename.
for link in soup.select("a[href$='.pdf']"):
    # Build a filesystem-friendly name from the anchor text: trim trailing
    # whitespace and drop characters that are awkward in filenames.
    filename = os.path.join(
        folder_location,
        (
            link.getText()
            .rstrip()
            .replace(" ", "_")
            .replace(",", "")
            .replace(".", "")
        ),
    )
    # BUG FIX: the original opened a literal "(unknown).pdf" for every link,
    # repeatedly overwriting a single file; write to the computed name instead.
    with open(f"{filename}.pdf", 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
这应该按照描述生成文件:
E:\webscraping/Index
E:\webscraping/[Part_I]_Vietnam_and_the_US_1940-1950
E:\webscraping/[Part_II]_US_Involvement_in_the_Franco-Viet_Minh_War_1950-1954
E:\webscraping/[Part_III]_The_Geneva_Accords
E:\webscraping/[Part_IV_A_1]_Evolution_of_the_War_NATO_and_SEATO:_A_Comparison
E:\webscraping/[Part_IV_A_2]_Evolution_of_the_War_Aid_for_France_in_Indochina_1950-54
E:\webscraping/[Part_IV_A_3]_Evolution_of_the_War_US_and_France's_Withdrawal_from_Vietnam_1954-56
E:\webscraping/[Part_IV_A_4]_Evolution_of_the_War_US_Training_of_Vietnamese_National_Army_1954-59
E:\webscraping/[Part_IV_A_5]_Evolution_of_the_War_Origins_of_the_Insurgency
E:\webscraping/[Part_IV_B_1]_Evolution_of_the_War_Counterinsurgency:_The_Kennedy_Commitments_and_Programs_1961
and more ...