I'm getting an empty exported file with Scrapy
I'm trying to write a parser to scrape this site, but something is going wrong. Can you help me figure out the problem? I link the spider to items.py:
import scrapy
from dyplom.items import DyplomtwoItem


class Dyplom(scrapy.Spider):
    name = "dyplom"
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']
    for i in range(2, 6):
        start_urls.append("https://www.edimdoma.ru/retsepty?_=1529256600422"
                          "&direction=&field=&page=" + str(i) +
                          "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
                          "_ids=&with_ingredient=&without_ingredient=")

    def parse(self, response):
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # add the scheme, eg http://
            url = "https://www.edimdoma.ru" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = DyplomtwoItem()
        item['id'] = response.xpath("//div[contains(@class, 'button button_print')]"
                                    "//a[contains(@class, 'drop-down_item')]/@href").extract()[0]
        item['title'] = response.xpath("//h1[contains(@class, 'recipe-header_name')]"
                                       "/descendant::text()").extract()
        item['image'] = response.xpath("//div[contains(@class, 'content-media')]/img//@src").extract()
        item['recipe'] = response.xpath("//div[contains(@class, 'content-box_content')]/div[contains"
                                        "(@class, 'plain-text recipe_step_text')]/descendant::text()").extract()
        yield item
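(For context: items.py itself is not shown in the question. Judging from the import and the four fields assigned above, DyplomtwoItem presumably declares them roughly like this; this is only a guess at the missing file.)

import scrapy

class DyplomtwoItem(scrapy.Item):
    # Hypothetical reconstruction -- the real items.py was not posted
    id = scrapy.Field()
    title = scrapy.Field()
    image = scrapy.Field()
    recipe = scrapy.Field()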
I included the item class from your project inside the scraper so I could pick apart what you were doing; it is essentially the same as your items.py.
It turns out there were a few problems with your selectors, and you weren't selecting all of the text. You need getall() on the recipe field instead of extract().
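A quick self-contained sketch of the behaviour the fix below relies on, using a plain scrapy Selector on a made-up HTML fragment (not the real page markup): .get() returns only the first matching text node, while .getall() returns every match as a list, which is what multi-step recipe text needs.

from scrapy.selector import Selector

html = ("<div class='plain-text recipe_step_text'>Step one</div>"
        "<div class='plain-text recipe_step_text'>Step two</div>")
sel = Selector(text=html)
print(sel.xpath("//div/text()").get())     # 'Step one' -- first match only
print(sel.xpath("//div/text()").getall())  # ['Step one', 'Step two'] -- every match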
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from itemloaders import ItemLoader
# from dyplom.items import DyplomtwoItem


class DyplomItem(scrapy.Item):
    id = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    image = Field(output_processor=TakeFirst())
    recipe = Field()


class Dyplom(scrapy.Spider):
    name = "dyplom"
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']
    for i in range(2, 6):
        start_urls.append("https://www.edimdoma.ru/retsepty?_=1529256600422"
                          "&direction=&field=&page=" + str(i) +
                          "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
                          "_ids=&with_ingredient=&without_ingredient=")

    def parse(self, response):
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # add the scheme, eg http://
            url = "https://www.edimdoma.ru" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        loaders = ItemLoader(DyplomItem())
        loaders.add_value('id', response.xpath("((//div[contains(@class, 'button button_print')])[1]//a)[1]/@href").get())
        loaders.add_value('title', response.xpath("//div[@class='content-box']//h1//text()").get())
        loaders.add_value('image', response.xpath("(//div[contains(@class, 'content-media')]//img/@src)[1]").get())
        for text_stuff in response.xpath("//div[contains(@class, 'plain-text recipe_step_text')]/descendant::text()").getall():
            loaders.add_value('recipe', text_stuff)
        yield loaders.load_item()
Output:
{'id': '/retsepty/146847-skrembl-s-bekonom/print?wi=true',
'image': 'https://e3.edimdoma.ru/data/recipes/0014/6847/146847-ed4_wide.jpg?1631992625',
'recipe': ['Бекон нарежьте кубиком. Можно взять и сырокопченый, и '
'варенокопченый, и свежий бекон.',
'В сковороде растопите сливочное масло.'],
'title': 'Скрэмбл с беконом'}
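As for the empty export itself: once items are actually yielded, running scrapy crawl dyplom -o recipes.json should write them out. A minimal sketch of declaring the feed on the spider instead (this assumes Scrapy >= 2.1, where the FEEDS setting is available; the file name is just an example):

import scrapy

class Dyplom(scrapy.Spider):
    name = "dyplom"
    # Writes every yielded item to recipes.json, roughly equivalent to
    # passing -o recipes.json on the command line (assumes Scrapy >= 2.1)
    custom_settings = {
        "FEEDS": {
            "recipes.json": {"format": "json", "encoding": "utf8"},
        },
    }
    # start_urls, parse() and parse_dir_contents() as in the answer above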