I'm getting an empty exported file with Scrapy
I'm trying to write a parser to scrape this site, but something is going wrong. Can you help me figure out the problem? I link the spider to items.py:
import scrapy
from dyplom.items import DyplomtwoItem


class Dyplom(scrapy.Spider):
    name = "dyplom"
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']
    for i in range(2, 6):
        start_urls.append("https://www.edimdoma.ru/retsepty?_=1529256600422"
                          "&direction=&field=&page=" + str(i) +
                          "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
                          "_ids=&with_ingredient=&without_ingredient=")

    def parse(self, response):
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # add the scheme, eg http://
            url = "https://www.edimdoma.ru" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = DyplomtwoItem()
        item['id'] = response.xpath("//div[contains(@class, 'button button_print')]"
                                    "//a[contains(@class, 'drop-down_item')]/@href").extract()[0]
        item['title'] = response.xpath("//h1[contains(@class, 'recipe-header_name')]"
                                       "/descendant::text()").extract()
        item['image'] = response.xpath("//div[contains(@class, 'content-media')]/img//@src").extract()
        item['recipe'] = response.xpath("//div[contains(@class, 'content-box_content')]/div[contains"
                                        "(@class, 'plain-text recipe_step_text')]/descendant::text()").extract()
        yield item
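(For context: items.py itself is not shown in the question. Judging from the import and the four fields assigned above, DyplomtwoItem presumably declares them roughly like this; this is only a guess at the missing file.)

import scrapy

class DyplomtwoItem(scrapy.Item):
    # Hypothetical reconstruction -- the real items.py was not posted
    id = scrapy.Field()
    title = scrapy.Field()
    image = scrapy.Field()
    recipe = scrapy.Field()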
I included the item class from your project inside the scraper so I could pick apart what you were doing; it is essentially the same as your items.py.
It turns out there were a few problems with your selectors, and you weren't selecting all of the text. You need getall() on the recipe field instead of extract().
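A quick self-contained sketch of the behaviour the fix below relies on, using a plain scrapy Selector on a made-up HTML fragment (not the real page markup): .get() returns only the first matching text node, while .getall() returns every match as a list, which is what multi-step recipe text needs.

from scrapy.selector import Selector

html = ("<div class='plain-text recipe_step_text'>Step one</div>"
        "<div class='plain-text recipe_step_text'>Step two</div>")
sel = Selector(text=html)
print(sel.xpath("//div/text()").get())     # 'Step one' -- first match only
print(sel.xpath("//div/text()").getall())  # ['Step one', 'Step two'] -- every match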
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from itemloaders import ItemLoader
# from dyplom.items import DyplomtwoItem


class DyplomItem(scrapy.Item):
    id = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    image = Field(output_processor=TakeFirst())
    recipe = Field()


class Dyplom(scrapy.Spider):
    name = "dyplom"
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']
    for i in range(2, 6):
        start_urls.append("https://www.edimdoma.ru/retsepty?_=1529256600422"
                          "&direction=&field=&page=" + str(i) +
                          "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
                          "_ids=&with_ingredient=&without_ingredient=")

    def parse(self, response):
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # add the scheme, eg http://
            url = "https://www.edimdoma.ru" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        loaders = ItemLoader(DyplomItem())
        loaders.add_value('id', response.xpath("((//div[contains(@class, 'button button_print')])[1]//a)[1]/@href").get())
        loaders.add_value('title', response.xpath("//div[@class='content-box']//h1//text()").get())
        loaders.add_value('image', response.xpath("(//div[contains(@class, 'content-media')]//img/@src)[1]").get())
        for text_stuff in response.xpath("//div[contains(@class, 'plain-text recipe_step_text')]/descendant::text()").getall():
            loaders.add_value('recipe', text_stuff)
        yield loaders.load_item()
Output:
{'id': '/retsepty/146847-skrembl-s-bekonom/print?wi=true',
'image': 'https://e3.edimdoma.ru/data/recipes/0014/6847/146847-ed4_wide.jpg?1631992625',
'recipe': ['Бекон нарежьте кубиком. Можно взять и сырокопченый, и '
'варенокопченый, и свежий бекон.',
'В сковороде растопите сливочное масло.'],
'title': 'Скрэмбл с беконом'}
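As for the empty export itself: once items are actually yielded, running scrapy crawl dyplom -o recipes.json should write them out. A minimal sketch of declaring the feed on the spider instead (this assumes Scrapy >= 2.1, where the FEEDS setting is available; the file name is just an example):

import scrapy

class Dyplom(scrapy.Spider):
    name = "dyplom"
    # Writes every yielded item to recipes.json, roughly equivalent to
    # passing -o recipes.json on the command line (assumes Scrapy >= 2.1)
    custom_settings = {
        "FEEDS": {
            "recipes.json": {"format": "json", "encoding": "utf8"},
        },
    }
    # start_urls, parse() and parse_dir_contents() as in the answer above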