Python Playwright's async does not process all of the scraped pages
Scraping and parsing JavaScript pages with Playwright.
There are about 100 URLs, but the process ends before all of them have been handled.
What could be the cause of this?
The code works as it currently is.
Is the for statement in the wrong place?
I would appreciate it if you could tell me whether I am using async incorrectly.
I changed it to the current code.
It is run in Scrapy with the following command:

scrapy runspider kuti_info.py
import scrapy
import requests
from bs4 import BeautifulSoup
from time import sleep
from scrapy.selector import Selector
from playwright.sync_api import sync_playwright
import asyncio


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['xxxxxxx.jp']
    start_urls = ['https://xxxxxxx.jp/']

    def parse(self, response):
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        urls = response.xpath('//div[@class="salonName"]')
        for url in urls:
            yield response.follow(url=url.xpath('.//h3/a/@href').get(), callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    async def parse_shop(self, response):
        try:
            r = requests.get(response.url)
            soup = BeautifulSoup(r.text, 'html.parser')
            repo = soup.find('div', {'class': 'abbr uTxt'})
        except:
            pass

        urls = response.xpath('//div[@class="viewMore"]/a/@href').get()
        for url in [urls]:
            newurls = response.urljoin(url)  # href="/therapistlist.php?id=!!!!"
            yield response.follow(url=newurls, callback=self.parse_therapist)
            # yield SeleniumRequest(url=str(newurls), screenshot=True, callback=self.parse_therapist, wait_time=2)

        try:
            yield {
                'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
                'shop_url': response.xpath('//dd/a/@href').get(),
                'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
                'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
                'report': repo.text
            }
        except:
            pass

    async def parse_therapist(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            idurls = selector.xpath('//li[@therapist_id]/a/@href').get()
            # browser.close()
            yield response.follow(url=idurls, callback=self.parse_thera_page)

    async def parse_thera_page(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            print(response.url)
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            print(selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))

            # try:
            #     r = requests.get(response.url)
            #     soup = BeautifulSoup(r.text, 'html.parser')
            #     repo = soup.find('div', {'class': 'txt'})
            # except:
            #     pass

            yield {
                'therapist_name': selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
                # 'report': repo.text
            }
I see .get() in some places, and it takes only the first item from a list - i.e. it gets the first therapist out of a list of ~250 therapists. Maybe this is why you get fewer results.
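A minimal, self-contained sketch of the difference, using Scrapy's own Selector:

from scrapy.selector import Selector

html = '<ul><li><a href="/t/1">A</a></li><li><a href="/t/2">B</a></li></ul>'
sel = Selector(text=html)

print(sel.xpath('//a/@href').get())     # '/t/1'  - only the first match
print(sel.xpath('//a/@href').getall())  # ['/t/1', '/t/2'] - every match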
I found that therapistlist.php?id=... uses JavaScript to read all the data as JSON from therapistlist.php?id=...&more (with &more at the end) and then renders the page. This way I read the therapist list as JSON data without Playwright, so I get the results much faster: about 800 therapists in about 1 minute.
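You can confirm this endpoint behaviour outside of Scrapy, e.g. with requests. A quick sketch - the id value here is a hypothetical placeholder, use one taken from a real /therapistlist.php link on the site:

import requests

# hypothetical id - substitute a value scraped from a real
# //div[@class="viewMore"]/a/@href link
url = 'https://men-esthe.jp/therapistlist.php?id=123&more'

data = requests.get(url).json()  # the '&more' variant returns JSON
print(type(data), len(data))     # expect a list of therapist records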
If you write the data as CSV, you may run into another problem. In a CSV all items have to have the same columns - if Scrapy sees {'therapist_name': ...} with a column therapist_name that the shop data did not have, then it skips it, and you can end up with a file that contains only the shops, without the therapists. I added the field therapist_name to the shop data, and now the CSV saves the therapists as well. (An alternative using FEED_EXPORT_FIELDS is sketched after the spider code below.)
import scrapy
from time import sleep
from scrapy.selector import Selector


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['men-esthe.jp']
    start_urls = ['https://men-esthe.jp/']

    def parse(self, response):
        print('[parse] url:', response.url)

        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        print('[parse] len(urls):', len(urls), type(urls))

        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        print('[parse_area] url:', response.url)

        urls = response.xpath('//div[@class="salonName"]')
        print('[parse_area] len(urls):', len(urls), type(urls))

        for url in urls:
            url = url.xpath('.//h3/a/@href').get()
            yield response.follow(url, callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    def parse_shop(self, response):
        print('[parse_shop] url:', response.url)

        urls = response.xpath('//div[@class="viewMore"]/a/@href')
        print('[parse_shop] len(urls):', len(urls), type(urls))

        for url in urls.getall():
            print('[parse_shop] url:', url)
            # '&more' makes the server return the full therapist list as JSON
            yield response.follow(url=url + '&more', callback=self.parse_therapist)

        yield {
            'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
            'shop_url': response.xpath('//dd/a/@href').get(),
            'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
            'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
            # join all text nodes (the equivalent of BeautifulSoup's .text)
            'report': ''.join(response.css('div.abbr.uTxt ::text').getall()),
            'therapist_name': "",
        }

    def parse_therapist(self, response):
        print('[parse_therapist] url:', response.url)

        data = response.json()
        for item in data:
            url = '/therapist.php?id=' + item['id']
            yield response.follow(url=url, callback=self.parse_thera_page)

    def parse_thera_page(self, response):
        print('[parse_thera_page] url:', response.url)
        print('now:', response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))

        yield {
            'shop_name': '',
            'shop_url': '',
            'area': '',
            'report-therapi-name': '',
            'report': '',
            'therapist_name': response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
        }
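As an alternative to padding every item with empty fields, Scrapy's standard FEED_EXPORT_FIELDS setting lets you declare the full column list once, so the CSV exporter keeps every key no matter which item it sees first. A sketch, with the field names taken from the items yielded above:

class KutiSpider(scrapy.Spider):
    name = 'kuti'

    # tell the CSV exporter up front which columns to write,
    # so items with different key sets are not dropped
    custom_settings = {
        'FEED_EXPORT_FIELDS': [
            'shop_name', 'shop_url', 'area',
            'report-therapi-name', 'report', 'therapist_name',
        ],
    }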