使用请求时损坏的 pdf (python)
Corrupted pdf when using requests (python)
我正在尝试从一个网站下载所有 pdf 文件,但创建的每个 pdf 都已损坏...
import requests
from bs4 import BeautifulSoup
url ="https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
i = 0
for link in links:
if('.pdf' in link.get('href', [])):
i += 1
print("Downloading file: ", i)
response = requests.get(link.get('href'))
pdf = open("pdf"+str(i)+".pdf", 'wb')
pdf.write(response.content)
pdf.close()
print("File ", i, " downloaded")
print("All PDF files downloaded")
将 headers
添加到您的请求中
import requests
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; PPC Mac OS X 10_8_7 rv:5.0; en-US) AppleWebKit/533.31.5 (KHTML, like Gecko) Version/4.0 Safari/533.31.5',
}
from bs4 import BeautifulSoup
url ="https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
i = 0
for link in links:
if('.pdf' in link.get('href', [])):
i += 1
print("Downloading file: ", i)
response = requests.get(link.get('href'), headers=headers)
pdf = open("pdf"+str(i)+".pdf", 'wb')
pdf.write(response.content)
pdf.close()
print("File ", i, " downloaded")
print("All PDF files downloaded")
我正在尝试从一个网站下载所有 pdf 文件,但创建的每个 pdf 都已损坏...
import requests
from bs4 import BeautifulSoup
url ="https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
i = 0
for link in links:
if('.pdf' in link.get('href', [])):
i += 1
print("Downloading file: ", i)
response = requests.get(link.get('href'))
pdf = open("pdf"+str(i)+".pdf", 'wb')
pdf.write(response.content)
pdf.close()
print("File ", i, " downloaded")
print("All PDF files downloaded")
将 headers
添加到您的请求中
import requests
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; PPC Mac OS X 10_8_7 rv:5.0; en-US) AppleWebKit/533.31.5 (KHTML, like Gecko) Version/4.0 Safari/533.31.5',
}
from bs4 import BeautifulSoup
url ="https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
i = 0
for link in links:
if('.pdf' in link.get('href', [])):
i += 1
print("Downloading file: ", i)
response = requests.get(link.get('href'), headers=headers)
pdf = open("pdf"+str(i)+".pdf", 'wb')
pdf.write(response.content)
pdf.close()
print("File ", i, " downloaded")
print("All PDF files downloaded")