无法在蜘蛛中正确使用 csv 管道和图像管道
Can't use csv pipelines and images pipelines within a spider correctly
我想把前两个字段写入 csv 文件，同时用后两个字段把图像下载到一个文件夹中。为此我创建了两个自定义管道。
这是蜘蛛:
import scrapy
class PagalWorldSpider(scrapy.Spider):
    """Crawl a PagalWorld song listing and yield one item per song page.

    Every item carries four keys: ``Title``, ``categories``, ``image_urls``
    and ``image_name``. The last two are presumably consumed by the project's
    ``PagalWorldImagePipeline`` (not visible here) to download and rename the
    cover image.
    """

    name = 'pagalworld'
    start_urls = ['https://www.pagalworld.pw/indian-pop-mp3-songs-2021/files.html']
    custom_settings = {
        'ITEM_PIPELINES': {
            'my_project.pipelines.PagalWorldImagePipeline': 1,
            'my_project.pipelines.CSVExportPipeline': 300
        },
        'IMAGES_STORE': r"C:\Users\WCS\Desktop\Images",
    }

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        """Follow every link in the listing to its song detail page."""
        for href in response.css(".files-list .listbox a[href]::attr(href)").getall():
            inner_page_link = response.urljoin(href)
            yield scrapy.Request(inner_page_link, callback=self.parse_download_links)

    def parse_download_links(self, response):
        """Extract title, categories and cover-image data from a song page.

        Yields a single dict item. Pages without an audio source are skipped
        with a warning, because ``image_name`` is derived from the audio URL.
        """
        title = response.css("h1.title::text").get()
        categories = ', '.join(response.css("ul.breadcrumb > li > a::text").getall())

        file_link = response.css(".file-details audio > source::attr(src)").get()
        if file_link is None:
            # .get() returns None when nothing matches; calling .split() on it
            # used to abort the callback with AttributeError.
            self.logger.warning("No audio source found on %s; skipping", response.url)
            return

        image_src = response.css(".alb-img-det > img[data-src]::attr('data-src')").get()
        # urljoin() with a missing value returns the page's own URL, which
        # would then be queued as an "image"; emit no image URL instead.
        image_urls = [response.urljoin(image_src)] if image_src else []

        # The image name is the last "-"-separated chunk of the audio URL,
        # with spaces replaced by underscores and ".mp3" removed.
        image_name = file_link.split("-")[-1].strip().replace(" ", "_").replace(".mp3", "")
        yield {"Title": title, "categories": categories, "image_urls": image_urls, "image_name": image_name}
如果我按原样执行脚本，csv 文件中会包含全部四个字段，也就是我在 `parse_download_links` 方法中 yield 的那些字段。该脚本也能正确地下载并重命名图像。
我希望写入 csv 文件的只有前两个字段 `Title` 和 `categories`，而不包括 `image_urls` 和 `image_name`。不过，`image_urls` 和 `image_name` 这两个字段仍然需要用于下载和重命名图像。
如何正确使用这两个管道?
您不必为此创建 CSV 管道。请参阅 Scrapy 文档中的 Feed exports 部分：使用 `FEEDS` 设置导出 csv，并用 `FEED_EXPORT_FIELDS` 限定要导出的字段。
import scrapy
class PagalWorldSpider(scrapy.Spider):
    """Crawl a PagalWorld song listing and yield one item per song page.

    CSV output is produced by the ``FEEDS`` setting rather than a custom
    pipeline, and ``FEED_EXPORT_FIELDS`` limits the exported columns to
    ``Title`` and ``categories``. The ``image_urls`` and ``image_name`` keys
    stay on the item, presumably for the project's ``PagalWorldImagePipeline``
    (not visible here).
    """

    name = 'pagalworld'
    start_urls = ['https://www.pagalworld.pw/indian-pop-mp3-songs-2021/files.html']
    custom_settings = {
        'ITEM_PIPELINES': {
            'my_project.pipelines.PagalWorldImagePipeline': 1,
            # The custom CSV pipeline is intentionally disabled; the feed
            # export configured below replaces it.
            # 'my_project.pipelines.CSVExportPipeline': 300
        },
        'IMAGES_STORE': r'C:\Users\WCS\Desktop\Images',
        'FEEDS': {
            r'file:///C:\Users\WCS\Desktop\output.csv': {'format': 'csv', 'overwrite': True}
        },
        # Only these item keys are written to the feed.
        'FEED_EXPORT_FIELDS': ['Title', 'categories']
    }

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        """Follow every link in the listing to its song detail page."""
        for href in response.css(".files-list .listbox a[href]::attr(href)").getall():
            inner_page_link = response.urljoin(href)
            yield scrapy.Request(inner_page_link, callback=self.parse_download_links)

    def parse_download_links(self, response):
        """Extract title, categories and cover-image data from a song page.

        Yields a single dict item. Pages without an audio source are skipped
        with a warning, because ``image_name`` is derived from the audio URL.
        """
        title = response.css("h1.title::text").get()
        categories = ', '.join(response.css("ul.breadcrumb > li > a::text").getall())

        file_link = response.css(".file-details audio > source::attr(src)").get()
        if file_link is None:
            # .get() returns None when nothing matches; calling .split() on it
            # used to abort the callback with AttributeError.
            self.logger.warning("No audio source found on %s; skipping", response.url)
            return

        image_src = response.css(".alb-img-det > img[data-src]::attr('data-src')").get()
        # urljoin() with a missing value returns the page's own URL, which
        # would then be queued as an "image"; emit no image URL instead.
        image_urls = [response.urljoin(image_src)] if image_src else []

        # The image name is the last "-"-separated chunk of the audio URL,
        # with spaces replaced by underscores and ".mp3" removed.
        image_name = file_link.split("-")[-1].strip().replace(" ", "_").replace(".mp3", "")
        yield {"Title": title, "categories": categories, "image_urls": image_urls, "image_name": image_name}
输出:
Heartfail - Mika Singh mp3 song Download PagalWorld.com,"Home, MUSIC, INDIPOP, Indian Pop Mp3 Songs 2021"
Fakir - Hansraj Raghuwanshi mp3 song Download PagalWorld.com,"Home, MUSIC, INDIPOP, Indian Pop Mp3 Songs 2021"
Humsafar - Suyyash Rai mp3 song Download PagalWorld.com,"Home, MUSIC, INDIPOP, Indian Pop Mp3 Songs 2021"
...
...
...
编辑:
main.py:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
if __name__ == "__main__":
    # Start from the project settings and override only the user agent.
    project_settings = get_project_settings()
    project_settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

    # The spider is referenced by its ``name`` attribute, not by class.
    spider_name = 'pagalworld'

    crawler_process = CrawlerProcess(project_settings)
    crawler_process.crawl(spider_name)
    crawler_process.start()
我想把前两个字段写入 csv 文件，同时用后两个字段把图像下载到一个文件夹中。为此我创建了两个自定义管道。
这是蜘蛛:
import scrapy
class PagalWorldSpider(scrapy.Spider):
    """Crawl a PagalWorld song listing and yield one item per song page.

    Every item carries four keys: ``Title``, ``categories``, ``image_urls``
    and ``image_name``. The last two are presumably consumed by the project's
    ``PagalWorldImagePipeline`` (not visible here) to download and rename the
    cover image.
    """

    name = 'pagalworld'
    start_urls = ['https://www.pagalworld.pw/indian-pop-mp3-songs-2021/files.html']
    custom_settings = {
        'ITEM_PIPELINES': {
            'my_project.pipelines.PagalWorldImagePipeline': 1,
            'my_project.pipelines.CSVExportPipeline': 300
        },
        'IMAGES_STORE': r"C:\Users\WCS\Desktop\Images",
    }

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        """Follow every link in the listing to its song detail page."""
        for href in response.css(".files-list .listbox a[href]::attr(href)").getall():
            inner_page_link = response.urljoin(href)
            yield scrapy.Request(inner_page_link, callback=self.parse_download_links)

    def parse_download_links(self, response):
        """Extract title, categories and cover-image data from a song page.

        Yields a single dict item. Pages without an audio source are skipped
        with a warning, because ``image_name`` is derived from the audio URL.
        """
        title = response.css("h1.title::text").get()
        categories = ', '.join(response.css("ul.breadcrumb > li > a::text").getall())

        file_link = response.css(".file-details audio > source::attr(src)").get()
        if file_link is None:
            # .get() returns None when nothing matches; calling .split() on it
            # used to abort the callback with AttributeError.
            self.logger.warning("No audio source found on %s; skipping", response.url)
            return

        image_src = response.css(".alb-img-det > img[data-src]::attr('data-src')").get()
        # urljoin() with a missing value returns the page's own URL, which
        # would then be queued as an "image"; emit no image URL instead.
        image_urls = [response.urljoin(image_src)] if image_src else []

        # The image name is the last "-"-separated chunk of the audio URL,
        # with spaces replaced by underscores and ".mp3" removed.
        image_name = file_link.split("-")[-1].strip().replace(" ", "_").replace(".mp3", "")
        yield {"Title": title, "categories": categories, "image_urls": image_urls, "image_name": image_name}
如果我按原样执行脚本，csv 文件中会包含全部四个字段，也就是我在 `parse_download_links` 方法中 yield 的那些字段。该脚本也能正确地下载并重命名图像。
我希望写入 csv 文件的只有前两个字段 `Title` 和 `categories`，而不包括 `image_urls` 和 `image_name`。不过，`image_urls` 和 `image_name` 这两个字段仍然需要用于下载和重命名图像。
如何正确使用这两个管道?
您不必为此创建 CSV 管道。请参阅 Scrapy 文档中的 Feed exports 部分：使用 `FEEDS` 设置导出 csv，并用 `FEED_EXPORT_FIELDS` 限定要导出的字段。
import scrapy
class PagalWorldSpider(scrapy.Spider):
    """Crawl a PagalWorld song listing and yield one item per song page.

    CSV output is produced by the ``FEEDS`` setting rather than a custom
    pipeline, and ``FEED_EXPORT_FIELDS`` limits the exported columns to
    ``Title`` and ``categories``. The ``image_urls`` and ``image_name`` keys
    stay on the item, presumably for the project's ``PagalWorldImagePipeline``
    (not visible here).
    """

    name = 'pagalworld'
    start_urls = ['https://www.pagalworld.pw/indian-pop-mp3-songs-2021/files.html']
    custom_settings = {
        'ITEM_PIPELINES': {
            'my_project.pipelines.PagalWorldImagePipeline': 1,
            # The custom CSV pipeline is intentionally disabled; the feed
            # export configured below replaces it.
            # 'my_project.pipelines.CSVExportPipeline': 300
        },
        'IMAGES_STORE': r'C:\Users\WCS\Desktop\Images',
        'FEEDS': {
            r'file:///C:\Users\WCS\Desktop\output.csv': {'format': 'csv', 'overwrite': True}
        },
        # Only these item keys are written to the feed.
        'FEED_EXPORT_FIELDS': ['Title', 'categories']
    }

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        """Follow every link in the listing to its song detail page."""
        for href in response.css(".files-list .listbox a[href]::attr(href)").getall():
            inner_page_link = response.urljoin(href)
            yield scrapy.Request(inner_page_link, callback=self.parse_download_links)

    def parse_download_links(self, response):
        """Extract title, categories and cover-image data from a song page.

        Yields a single dict item. Pages without an audio source are skipped
        with a warning, because ``image_name`` is derived from the audio URL.
        """
        title = response.css("h1.title::text").get()
        categories = ', '.join(response.css("ul.breadcrumb > li > a::text").getall())

        file_link = response.css(".file-details audio > source::attr(src)").get()
        if file_link is None:
            # .get() returns None when nothing matches; calling .split() on it
            # used to abort the callback with AttributeError.
            self.logger.warning("No audio source found on %s; skipping", response.url)
            return

        image_src = response.css(".alb-img-det > img[data-src]::attr('data-src')").get()
        # urljoin() with a missing value returns the page's own URL, which
        # would then be queued as an "image"; emit no image URL instead.
        image_urls = [response.urljoin(image_src)] if image_src else []

        # The image name is the last "-"-separated chunk of the audio URL,
        # with spaces replaced by underscores and ".mp3" removed.
        image_name = file_link.split("-")[-1].strip().replace(" ", "_").replace(".mp3", "")
        yield {"Title": title, "categories": categories, "image_urls": image_urls, "image_name": image_name}
输出:
Heartfail - Mika Singh mp3 song Download PagalWorld.com,"Home, MUSIC, INDIPOP, Indian Pop Mp3 Songs 2021"
Fakir - Hansraj Raghuwanshi mp3 song Download PagalWorld.com,"Home, MUSIC, INDIPOP, Indian Pop Mp3 Songs 2021"
Humsafar - Suyyash Rai mp3 song Download PagalWorld.com,"Home, MUSIC, INDIPOP, Indian Pop Mp3 Songs 2021"
...
...
...
编辑:
main.py:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
if __name__ == "__main__":
    # Start from the project settings and override only the user agent.
    project_settings = get_project_settings()
    project_settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

    # The spider is referenced by its ``name`` attribute, not by class.
    spider_name = 'pagalworld'

    crawler_process = CrawlerProcess(project_settings)
    crawler_process.crawl(spider_name)
    crawler_process.start()