BeautifulSoup cannot find every link on the page
Here is my code:
from bs4 import BeautifulSoup
import requests
from requests import get
import os

def file_download():
    domain = "ec.europa.eu"
    page = requests.get("https://ec.europa.eu/eurostat/web/main/data/database")
    html = page.text
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.find_all('a'):
        url = link.get('href')
        print(url)
        # Guard against anchors without an href (link.get returns None).
        if url and ".gz" in url:
            file_name = url.split("file=", 1)[1]
            if os.path.exists(file_name):
                print("File already exists.")
                continue
            with open(file_name, 'wb') as file:
                print('Downloading...')
                response = get(url)
                file.write(response.content)
    print('\nEvery file has been downloaded!')
With the code above, I can't seem to find all of the possible links on the page. In Chrome's inspector, Copy element gives me the snippet I mentioned above; that is what I want to find with BeautifulSoup, along with other links like it.
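One way to check whether those links exist in the server-rendered HTML at all is to search the raw response text. A minimal probe, reusing the page URL from the code above; if it prints False, the links are injected by JavaScript and BeautifulSoup will never see them in the fetched HTML:

import requests

# Fetch the raw HTML exactly as the server returns it (no JavaScript runs here).
html = requests.get("https://ec.europa.eu/eurostat/web/main/data/database").text

# The download links in question contain ".gz" (and "file=").
# If neither substring is present, the links are added client-side.
print(".gz" in html, "file=" in html)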
It is best to avoid accessing the files through the tree structure (that would require a lot of JSON interaction). A simpler approach is to use the listing of all their files:
from bs4 import BeautifulSoup
import requests
import os

session = requests.Session()
req_all = session.get("https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?dir=data&sort=1&sort=2&start=all")
soup = BeautifulSoup(req_all.content, "lxml")

table = soup.find('table', id='filetable')
for a in table.find_all('a', href=True):
    if a.text == "Download":
        href = a['href']
        if '.gz' in href:
            # The file name is the last URL-encoded path segment ("%2F" is "/").
            filename = href.rsplit('%2F', 1)[1]
            if not os.path.exists(filename):
                with open(filename, 'wb') as f_gz:
                    f_gz.write(requests.get(href).content)
                print(filename)
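Some of those .gz files can be large, so it may be worth streaming each download rather than buffering the whole response in memory. A sketch using requests' standard stream=True / iter_content API, reusing the session, href and filename names from the loop above:

# Streaming variant of the download step in the loop above.
with session.get(href, stream=True) as resp:
    resp.raise_for_status()
    with open(filename, 'wb') as f_gz:
        # Write the response in 8 KiB chunks instead of one .content blob.
        for chunk in resp.iter_content(chunk_size=8192):
            f_gz.write(chunk)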