How to use Scrapy to parse PDFs without specific .pdf links?
I am trying to download PDFs, but on https://ratsinformation.stadt-koeln.de/si0057.asp?__ksinr=23723 I see no .pdf links that Scrapy could grab. The documents are served through URLs that contain no .pdf at all, for example https://ratsinformation.stadt-koeln.de/getfile.asp?id=850608&type=do.
Can Scrapy also handle such getfile.asp links and detect the file behind them?
Here is how I collect all the PDF links on a specific page:
import scrapy
from scrapy.pipelines.files import FilesPipeline


class PdfPipeline(FilesPipeline):
    # to save with the name of the pdf from the website instead of hash
    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        return file_name


class StadtKoelnAmtsblattSpider(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'
    start_urls = ['https://ratsinformation.stadt-koeln.de/si0057.asp?__ksinr=23723']
    custom_settings = {
        "ITEM_PIPELINES": {
            PdfPipeline: 100
        },
        "FILES_STORE": "downloaded_files"
    }

    def parse(self, response):
        links = response.xpath("//a[@class='btn btn-blue']/@href").getall()
        links = [response.urljoin(link) for link in links]  # to make them absolute urls
        yield {
            "file_urls": links
        }
Every time I try to download the files I get this error:
OSError: [Errno 22] Invalid argument: 'downloaded_files\getfile.asp?id=821665&type=do'
The error comes from PdfPipeline: these URLs do not end in a real file name, so request.url.split('/')[-1] returns the getfile.asp segment together with its query string, and Windows does not accept a string like that as a file name. The fix is to build the file name in the parse method and let the pipeline use it, as in the full spider further below.
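To make the failure concrete, here is a small standalone sketch of what the original file_path computed for one of these download URLs (the id is taken from the error message above):

url = "https://ratsinformation.stadt-koeln.de/getfile.asp?id=821665&type=do"

# The original pipeline took everything after the last '/' as the file name.
file_name = url.split('/')[-1]
print(file_name)  # getfile.asp?id=821665&type=do -> '?' is not a legal character in a Windows file name, hence the OSError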
import scrapy
from scrapy.pipelines.files import FilesPipeline


class PdfPipeline(FilesPipeline):
    # to save with the name of the pdf from the website instead of hash
    def file_path(self, request, response=None, info=None, *, item=None):
        return item["filename"]


class StadtKoelnAmtsblattSpider(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'
    start_urls = ['https://ratsinformation.stadt-koeln.de/si0057.asp?__ksinr=23723']
    custom_settings = {
        "ITEM_PIPELINES": {
            PdfPipeline: 100
        },
        "FILES_STORE": "downloaded_files",
        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36"
    }

    def parse(self, response):
        for i, item in enumerate(response.xpath("//a[contains(@title, 'Dokument Download')]")):
            title = item.xpath("./text()").get()
            urls = item.xpath("./@href").getall()
            if title:
                yield {
                    "filename": title + str(i) + ".pdf",  # to take care of duplicated file names
                    "file_urls": [response.urljoin(url) for url in urls]
                }
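If you want to try this without creating a full Scrapy project, a minimal sketch (assuming Scrapy is installed and the spider class above is defined in, or importable from, the same script) is to run it through CrawlerProcess:

from scrapy.crawler import CrawlerProcess

# custom_settings (pipeline, FILES_STORE, USER_AGENT) are picked up from the spider class
process = CrawlerProcess()
process.crawl(StadtKoelnAmtsblattSpider)
process.start()  # blocks until the crawl is finished

The downloaded PDFs then end up directly under downloaded_files/, named according to the "filename" field built in parse().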