Scrapy: go to the next page and download all files
I'm new to Scrapy and Python. I can scrape the details from the start URL, and now I want to follow each link and download all of the files ending in .htm and .txt.
My code:
import scrapy

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_page)

    def parse_page(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield {"links": targetLink}
I need to follow each link and download every file ending in .htm and .txt. The following code doesn't work:
if link.endswith('.htm'):
    link = urlparse.urljoin(base_url, link)
    req = Request(link, callback=self.save_pdf)
    yield req

def save_pdf(self, response):
    path = response.url.split('/')[-1]
    with open(path, 'wb') as f:
        f.write(response.body)
Can anyone help me solve this? Thanks in advance.
Try the following approach to download the files to your desktop, or to any location you specify in the script:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        # Follow each company link in the results table
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

    def parse_links(self, response):
        # Follow each "Documents" button to the filing detail page
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        # Keep only the .htm and .txt documents listed on the filing page
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url = baseLink, callback = self.download_files)

    def download_files(self, response):
        # Save the response body to disk, creating the target folder if needed
        path = response.url.split('/')[-1]
        dirf = r"C:\Users\WCS\Desktop\Storage"
        if not os.path.exists(dirf):
            os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
To be clearer: you need to set dirf = r"C:\Users\WCS\Desktop\Storage" explicitly, where C:\Users\WCS\Desktop (or any other path) is the location you want. The script will create the Storage folder automatically and save the files there.
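If you prefer not to change the working directory with os.chdir, a minimal variant of the save step (assuming the same dirf path; save_response is a hypothetical helper, not part of the original answer) can build the full file path instead:

import os

def save_response(response, dirf=r"C:\Users\WCS\Desktop\Storage"):
    # Same logic as download_files above, but without os.chdir
    os.makedirs(dirf, exist_ok=True)           # create the folder if it is missing
    filename = response.url.split('/')[-1]     # use the last URL segment as the file name
    with open(os.path.join(dirf, filename), 'wb') as f:
        f.write(response.body)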
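As a usage note, the spider is normally started with the scrapy crawl command inside a project; if you keep it in a single file, one possible way to run it from plain Python (a sketch, not part of the original answer) is Scrapy's CrawlerProcess:

from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    process = CrawlerProcess()   # uses default settings
    process.crawl(legco)         # schedule the spider class defined above
    process.start()              # block until the crawl finishes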