Getting the URLs of all zip files from a webpage
I'm looking at a webpage that lists a bunch of zip files. Each zip file has a URL like https://www.ercot.com/misdownload/servlets/mirDownload?mimic_duns=000000000&doclookupId=814778337.
I only want to extract the URLs of the _csv.zip files and merge their contents into a single CSV file, discarding the _xml.zip URLs. The _xml.zip and _csv.zip files contain the same data, but I prefer to work with the _csv.zip ones.
I'm not sure how to approach this or where to start.
EDIT: If you get an "Access Denied" error, note that the webpage may only be accessible from US IP addresses.
Clicking one of the URLs downloads a zip file to the PC. I basically want to:
- Download the zip files to the PC
- Load the contents of the CSV file inside each zip into a pandas dataframe
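For the link-filtering step specifically, one way to keep only the _csv.zip URLs is to test each anchor's text. This is only a minimal sketch, assuming the visible link text on the report page contains the file name (ending in _csv.zip or _xml.zip):

import requests
from bs4 import BeautifulSoup

page = requests.get(
    "https://www.ercot.com/misapp/GetReports.do?reportTypeId=12331"
    "&reportTitle=DAM%20Settlement%20Point%20Prices&showHTMLView=&mimicKey"
)
soup = BeautifulSoup(page.text, "lxml")

# Keep a link only when its text mentions the csv archive;
# the _xml.zip links fail the same test and are dropped.
csv_urls = [
    f"https://www.ercot.com{a['href']}"
    for a in soup.find_all("a")
    if "_csv.zip" in a.get_text()
]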
All the zip files, plus a merged CSV file (21 MB), are available here, so there's no need to scrape. But if you'd like to do it yourself, here's my take on it:
import os
import os.path
from shutil import copyfileobj

import pandas as pd
import requests
from bs4 import BeautifulSoup

base_url = "https://www.ercot.com"
entry_url = f"{base_url}/misapp/GetReports.do?reportTypeId=12331&reportTitle=DAM%20Settlement%20Point%20Prices&showHTMLView=&mimicKey"
download_dir = "ercot"


def scrape_zips():
    with requests.Session() as connection:
        print("Finding all zip files...")
        # The csv and xml links alternate on the report page, so taking
        # every second anchor keeps only the csv archives.
        zip_urls = [
            f"{base_url}{source_url['href']}" for source_url in
            BeautifulSoup(
                connection.get(entry_url).text,
                "lxml"
            ).find_all("a")[::2]
        ]
        os.makedirs(download_dir, exist_ok=True)
        total_urls = len(zip_urls)
        for idx, url in enumerate(zip_urls, start=1):
            # The doclookupId query value doubles as a unique file name.
            file_name = url.split("=")[-1]
            zip_object = connection.get(url, stream=True)
            print(f"Fetching file {file_name} -> {idx} out of {total_urls}")
            with open(os.path.join(download_dir, f"{file_name}.zip"), "wb") as output:
                copyfileobj(zip_object.raw, output)
            zip_object.close()


def list_files(dir_name: str):
    # Yield only the files in the top level of dir_name (no subdirectories).
    yield from next(os.walk(dir_name), (None, None, []))[2]


def merge_zips_to_df():
    print("Merging csv files...")
    # pandas can read a zip archive holding a single csv file directly,
    # so there's no need to unpack the downloads first.
    df = pd.concat(
        pd.read_csv(os.path.join(download_dir, csv_file)) for csv_file
        in list_files(download_dir)
    )
    print(df.head(20))
    df.to_csv(os.path.join(download_dir, "merged_csv_files.csv"), index=False)


if __name__ == "__main__":
    scrape_zips()
    merge_zips_to_df()
This should give you the following output:
Finding all zip files...
Fetching file 816055622 -> 1 out of 31
Fetching file 815870449 -> 2 out of 31
Fetching file 815686938 -> 3 out of 31
Fetching file 815503551 -> 4 out of 31
Fetching file 815315296 -> 5 out of 31
Fetching file 815127892 -> 6 out of 31
Fetching file 814952388 -> 7 out of 31
Fetching file 814778337 -> 8 out of 31
Fetching file 814599101 -> 9 out of 31
Fetching file 814416972 -> 10 out of 31
Fetching file 814224618 -> 11 out of 31
Fetching file 814040277 -> 12 out of 31
Fetching file 813865857 -> 13 out of 31
Fetching file 813688802 -> 14 out of 31
Fetching file 813516414 -> 15 out of 31
Fetching file 813341752 -> 16 out of 31
Fetching file 813159478 -> 17 out of 31
Fetching file 812976112 -> 18 out of 31
Fetching file 812784659 -> 19 out of 31
Fetching file 812599985 -> 20 out of 31
Fetching file 812424952 -> 21 out of 31
Fetching file 812241625 -> 22 out of 31
Fetching file 812053445 -> 23 out of 31
Fetching file 811874015 -> 24 out of 31
Fetching file 811685701 -> 25 out of 31
Fetching file 811501577 -> 26 out of 31
Fetching file 811319918 -> 27 out of 31
Fetching file 811147926 -> 28 out of 31
Fetching file 810973966 -> 29 out of 31
Fetching file 810793357 -> 30 out of 31
Fetching file 810615891 -> 31 out of 31
Merging csv files...
DeliveryDate HourEnding SettlementPoint SettlementPointPrice DSTFlag
0 12/22/2021 01:00 AEEC 25.07 N
1 12/22/2021 01:00 AJAXWIND_RN 25.07 N
2 12/22/2021 01:00 ALGOD_ALL_RN 25.01 N
3 12/22/2021 01:00 ALVIN_RN 24.11 N
4 12/22/2021 01:00 AMADEUS_ALL 25.07 N
5 12/22/2021 01:00 AMISTAD_ALL 25.06 N
6 12/22/2021 01:00 AMOCOOIL_CC1 25.98 N
7 12/22/2021 01:00 AMOCOOIL_CC2 25.98 N
8 12/22/2021 01:00 AMOCO_PUN1 25.98 N
9 12/22/2021 01:00 AMOCO_PUN2 25.98 N
10 12/22/2021 01:00 AMO_AMOCO_1 25.98 N
11 12/22/2021 01:00 AMO_AMOCO_2 25.98 N
12 12/22/2021 01:00 AMO_AMOCO_5 25.98 N
13 12/22/2021 01:00 AMO_AMOCO_G1 25.98 N
14 12/22/2021 01:00 AMO_AMOCO_G2 25.98 N
15 12/22/2021 01:00 AMO_AMOCO_G3 25.98 N
16 12/22/2021 01:00 AMO_AMOCO_S1 25.98 N
17 12/22/2021 01:00 AMO_AMOCO_S2 25.98 N
18 12/22/2021 01:00 ANACACHO_ANA 25.05 N
19 12/22/2021 01:00 ANCHOR_ALL 25.08 N
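Side note: if you'd rather not keep the intermediate zip files on disk, recent pandas versions can also read a single-file zip archive straight from an in-memory buffer. A minimal sketch of that variant, reusing a zip_urls list like the one built in scrape_zips() above (the helper name is just for illustration):

import io

import pandas as pd
import requests

def frames_from_urls(zip_urls):
    # Download each archive into memory; compression="zip" lets pandas
    # unpack it, provided the archive holds exactly one csv file.
    with requests.Session() as connection:
        for url in zip_urls:
            payload = connection.get(url).content
            yield pd.read_csv(io.BytesIO(payload), compression="zip")

# Usage: df = pd.concat(frames_from_urls(zip_urls), ignore_index=True)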