Getting the text corresponding to each tag

I am trying to scrape some data from the left-side column of a web page. The aim is to use scrapy_playwright to click every "show more" button and grab the title of each element belonging to each "show more" list. However, when I run my scraper it repeats the same header, "Make", for every list. I need the unique header for each group of lists.

Here is my scraper:

import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine

class ConfusedItem(scrapy.Item):
    clicks = Field(output_processor = TakeFirst())
    category = Field(output_processor = TakeFirst())

class ConfusedSpider(scrapy.Spider):
    name = 'confused'
    allowed_domains = ['x']
    start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']

    custom_settings = {
        'User_Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY':0.5
    }

    def start_requests(self):
        for url in self.start_urls:
            for i in range(0, 11):
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    meta=dict(
                        playwright=True,
                        playwright_include_page=True,
                        playwright_page_coroutines=[
                            PageCoroutine("click", selector=f"(//div[@class='toggle-bottom-filter'])[{i}]"),
                            PageCoroutine("wait_for_timeout", 5000),
                        ],
                    ),
                )

    def parse(self, response):
        container = response.xpath("(//div[@id]//ul[@class='list-filter disp-bloc list-model1'])//li")
        test= response.xpath("(//div[@class='elem-filter id_marque clearfix'])")
        for items in container:
            for values in test:
                loader = ItemLoader(ConfusedItem(), selector = items)
                loader.add_xpath('clicks', './/@onclick')
                loader.add_value('category', values.xpath("(//h2[@class=' select-load select-off'])//text()").getall())
                yield loader.load_item()

        
process = CrawlerProcess(
    settings = {
        'FEED_URI':'json_data.jl',
        'FEED_FORMAT':'jsonlines'
    }
)
process.crawl(ConfusedSpider)
process.start()

Output:

{'category': 'Make',
 'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
 'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
 'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}

Expected output:

{'category': 'SELLER TYPE',
 'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
 'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
 'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}

There are two problems with your code. First, your XPath selectors are incorrect. Second, you are not actually running scrapy-playwright, so the clicks never happen. Looping while incrementing the element index is also wrong: once you click an element it is removed from the DOM, so the next element is then at the first index (the standalone sketch after the settings below illustrates this drift). Additionally, to enable scrapy-playwright you need at least the following extra settings:

'DOWNLOAD_HANDLERS': {
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
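
To see why indexed selectors like `(//div[...])[i]` drift, here is a standalone Playwright sketch, separate from the fix itself. It assumes, as described above, that a clicked "show more" toggle is removed from the DOM; it counts the toggles, clicks the first match, and counts again:

import asyncio
from playwright.async_api import async_playwright

SELECTOR = "//div[@class='toggle-bottom-filter']"
URL = "https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0"

async def main():
    async with async_playwright() as pw:
        browser = await pw.webkit.launch()
        page = await browser.new_page()
        await page.goto(URL)
        before = len(await page.query_selector_all(SELECTOR))
        await page.click(SELECTOR)  # page.click acts on the FIRST match
        await page.wait_for_timeout(1000)
        after = len(await page.query_selector_all(SELECTOR))
        # If the toggle disappears on click, after == before - 1, so an
        # indexed selector like "(...)[2]" now points at the old "(...)[3]".
        print(before, after)
        await browser.close()

asyncio.run(main())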

I have corrected both issues in the code below. You will still need to add some error handling, and find a better way to determine how many clicks your code should perform (see the sketch at the end for one possible approach).

import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine


class ConfusedItem(scrapy.Item):
    clicks = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())


class ConfusedSpider(scrapy.Spider):
    name = 'confused'
    allowed_domains = ['x']
    start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']

    custom_settings = {
        # Scrapy setting names are case-sensitive: 'User_Agent' would be ignored
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta=dict(
                    playwright=True,
                    playwright_page_coroutines=[
                        # Each click removes the clicked "show more" toggle from
                        # the DOM, so every click targets the first remaining match.
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']")
                        for _ in range(9)
                    ]
                ),
            )

    def parse(self, response):
        # Each child div of #face_links is one filter group: its <h2> holds the
        # group header and its <ul>/<li> items hold the clickable values.
        for category in response.xpath("//div[@id='face_links']/div"):
            name = category.xpath("./h2/text()").get()
            for item in category.xpath("./ul/li"):
                loader = ItemLoader(ConfusedItem(), selector=item)
                loader.add_xpath('clicks', './@onclick')
                # Attach the group header so every item carries its own category
                loader.add_value("category", name)
                yield loader.load_item()



process = CrawlerProcess(
    settings={
        'FEED_URI': 'json_data.jl',
        'FEED_FORMAT': 'jsonlines',
        'DOWNLOAD_HANDLERS': {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 60000,
        "PLAYWRIGHT_BROWSER_TYPE": "webkit"
    }
)
process.crawl(ConfusedSpider)
process.start()


Sample output is shown below.
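
On the note above about determining the click count: rather than hard-coding nine clicks, one option is to request the page with playwright_include_page=True and keep clicking until no "show more" toggle remains. A minimal sketch of that idea (the callback name expand_and_parse is mine, the callback is async, and it again assumes the toggles disappear as they are clicked):

import scrapy
from scrapy import Selector

class ExpandAllSpider(scrapy.Spider):
    name = 'expand_all'
    start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.expand_and_parse,
                meta=dict(
                    playwright=True,
                    playwright_include_page=True,  # hand the live page to the callback
                ),
            )

    async def expand_and_parse(self, response):
        page = response.meta["playwright_page"]
        # Click the first remaining toggle until none are left, instead of
        # guessing the number of clicks up front.
        while True:
            toggles = await page.query_selector_all("//div[@class='toggle-bottom-filter']")
            if not toggles:
                break
            await toggles[0].click()
            await page.wait_for_timeout(1000)
        html = await page.content()
        await page.close()
        selector = Selector(text=html)
        # ... extract clicks/category from `selector` as in parse() above ...

The error handling mentioned earlier still applies here, e.g. wrapping the click loop in try/finally so the page is always closed.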