Getting the text corresponding to each tag
I am trying to scrape some data from the left-side column of a web page. The aim is to use scrapy_playwright to click all of the show more buttons and grab the header of each list that a show more button belongs to. However, when I run my scraper it repeats the same header, Make, for every list. I need the unique header for each group of lists.
Here is my scraper:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine
class ConfusedItem(scrapy.Item):
clicks = Field(output_processor = TakeFirst())
category = Field(output_processor = TakeFirst())
class ConfusedSpider(scrapy.Spider):
name = 'confused'
allowed_domains = ['x']
start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
custom_settings = {
'User_Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'DOWNLOAD_DELAY':0.5
}
def start_requests(self):
for url in self.start_urls:
for i in range(0, 11):
yield scrapy.Request(
url = url,
callback = self.parse,
meta= dict(
playwright = True,
playwright_include_page = True,
playwright_page_coroutines = [
PageCoroutine("click", selector=f"(//div[@class='toggle-bottom-filter'])[{i}]"),
PageCoroutine("wait_for_timeout", 5000),
]
),
)
def parse(self, response):
container = response.xpath("(//div[@id]//ul[@class='list-filter disp-bloc list-model1'])//li")
test= response.xpath("(//div[@class='elem-filter id_marque clearfix'])")
for items in container:
for values in test:
loader = ItemLoader(ConfusedItem(), selector = items)
loader.add_xpath('clicks', './/@onclick')
loader.add_value('category', values.xpath("(//h2[@class=' select-load select-off'])//text()").getall())
yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'json_data.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(ConfusedSpider)
process.start()
Output:
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
Expected output:
{'category': 'SELLER TYPE',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
There are two problems with your code. First, your xpath selectors are incorrect. Second, you are not actually using scrapy-playwright, so the clicks never happen. Looping over a changing item index is also wrong: once you click a show more element it is removed from the DOM, so the next element moves up to the first index. In addition, to enable scrapy-playwright you need at least the following extra settings:
'DOWNLOAD_HANDLERS': {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
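For reference, the scrapy-playwright README registers the download handler for both URL schemes. The https entry is the one that matters for this start URL, but the fuller variant looks like this:
'DOWNLOAD_HANDLERS': {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",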
I have corrected both problems in the code below. You will still want to add some error handling, and to find a better way of determining how many clicks the code should perform (see the sketch at the end of this answer).
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine
class ConfusedItem(scrapy.Item):
clicks = Field(output_processor=TakeFirst())
category = Field(output_processor=TakeFirst())
class ConfusedSpider(scrapy.Spider):
name = 'confused'
allowed_domains = ['x']
start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
custom_settings = {
'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',  # settings keys are case-sensitive: USER_AGENT, not User_Agent
'DOWNLOAD_DELAY': 0.5
}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url=url,
callback=self.parse,
meta=dict(
playwright=True,
playwright_page_coroutines=[
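# Each click removes the clicked "show more" toggle from the DOM,
# so clicking the same selector repeatedly walks through all of them.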
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
]
),
)
def parse(self, response):
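# Each child div of #face_links is one filter group: an h2 header
# followed by a ul of clickable options.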
for category in response.xpath("//div[@id='face_links']/div"):
name = category.xpath("./h2/text()").get()
for item in category.xpath("./ul/li"):
loader = ItemLoader(ConfusedItem(), selector=item)
loader.add_xpath('clicks', './@onclick')
loader.add_value("category", name)
yield loader.load_item()
process = CrawlerProcess(
settings={
'FEED_URI': 'json_data.jl',
'FEED_FORMAT': 'jsonlines',
'DOWNLOAD_HANDLERS': {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 60000,
"PLAYWRIGHT_BROWSER_TYPE": "webkit"
}
)
process.crawl(ConfusedSpider)
process.start()
With these changes the sample output shows the correct category for each group of lists, matching the expected output above.
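Finally, on the question of how many clicks to perform: one option (a sketch only, not tested against this site) is to request the page with playwright_include_page=True and drive the clicking from an async callback, looping until no show more toggles remain. Note that newer scrapy-playwright releases rename PageCoroutine to PageMethod; the 500 ms wait below is an illustrative guess.
async def parse(self, response):
    # requires meta={"playwright": True, "playwright_include_page": True}
    page = response.meta["playwright_page"]
    try:
        toggles = page.locator("//div[@class='toggle-bottom-filter']")
        # each click removes a toggle from the DOM, so loop until none are left
        while await toggles.count() > 0:
            await toggles.first.click()
            await page.wait_for_timeout(500)  # give the list time to expand
        html = await page.content()
    finally:
        await page.close()
    # parse the fully expanded HTML with the same xpaths as above
    for category in scrapy.Selector(text=html).xpath("//div[@id='face_links']/div"):
        ...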