How to scrape only clickable links in a loop with Scrapy and Selenium

I am trying to scrape some information about tennis matches from a JavaScript site using Scrapy and Selenium. The start URLs are for pages that contain all of the matches on a given date. The first task on each page is to make all the matches visible from behind some horizontal tabs - I have that covered. The second task is to scrape the match pages that sit behind links which are not present on the start-URL page - a specific element has to be clicked to reach each one.

I can find all of these elements without any problem, and I have written a loop that uses Selenium to click each one and yield a Request after every iteration. The problem is that every time I click through to a match the page changes, my lovely list of elements becomes detached from the DOM, and I get a StaleElementReferenceException. I understand why that happens, but I am struggling to come up with a solution.

Here is my code so far:

import datetime as dt
from dateutil.rrule import DAILY, rrule
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.http.response import Response
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

MATCHES_XPATH = "//span[@class='link sc-10gv6xe-4 eEAcym pointer']"
ELEMENT_TEST = "//span[@class='link sc-15d69aw-2 hhbGos']"


class ScraperS24(Spider):

    name = "scores24_scraper"

    custom_settings = {
        "USER_AGENT": "*",
        "LOG_LEVEL": "WARNING",
        "DOWNLOADER_MIDDLEWARES": {
            'scraper.polgara.middlewares.SeleniumMiddleware': 543,
        },
    }
    handle_httpstatus_list = [301]  # let the spider receive 301 responses

    def __init__(self):
        dates = list(rrule(DAILY, dtstart=dt.datetime(2015, 1, 6), until=dt.datetime(2015, 1, 21)))
        self.start_urls = [f"https://scores24.live/en/tennis/{d.strftime('%Y-%m-%d')}" for d in dates]
        super().__init__()

    def parse(self, response: Response):
        print(f"Parsing date - {response.url}")
        driver = response.request.meta["driver"]
        tabs = driver.find_elements_by_xpath("//button[@class='hjlkds-7 khtDIT']")
        for t in tabs: driver.execute_script("arguments[0].click();", t)
        matches = driver.find_elements_by_xpath(MATCHES_XPATH)
        wait = WebDriverWait(driver, 20)
        for m in matches:
            driver.execute_script("arguments[0].click();", m)
            try:
                wait.until(EC.presence_of_element_located((By.XPATH, ELEMENT_TEST)))
            except TimeoutException:
                driver.back()
            else:
                url = str(driver.current_url)
                driver.back()
                yield Request(url, callback=self._parse_match)

    def _parse_match(self, response):
        print(f"Parsing match - {response.url}")


process = CrawlerProcess()
process.crawl(ScraperS24)
process.start()

And the Selenium middleware:

import logging

from scrapy import Request, Spider, signals
from scrapy.http import HtmlResponse
from selenium import webdriver

logger = logging.getLogger(__name__)
# "cn" (which provides GECKODRIVER_PATH below) is a project-local constants module.


class SeleniumMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request: Request, spider: Spider):
        logger.debug(f"Selenium processing request - {request.url}")
        self.driver.get(request.url)
        request.meta.update({'driver': self.driver})
        return HtmlResponse(
            request.url,
            body=self.driver.page_source,
            encoding='utf-8',
            request=request,
        )

    def spider_opened(self, spider):
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(
            options=options,
            executable_path=cn.GECKODRIVER_PATH,
        )

    def spider_closed(self, spider):
        self.driver.quit()

I tried adapting the loop using the approach from this answer:

for idx, _ in enumerate(matches):
    matches = driver.find_elements_by_xpath(MATCHES_XPATH)
    driver.execute_script("arguments[0].click();", matches[idx])
    try:
        wait.until(EC.presence_of_element_located((By.XPATH, ELEMENT_TEST)))
    except TimeoutException:
        driver.back()
    else:
        url = str(driver.current_url)
        driver.back()
        yield Request(url, callback=self._parse_match)

However, because Scrapy crawls in parallel, the driver is quickly used to load a new start-URL page before the loop has finished, so the index into the matches list gets thrown off.

Any ideas on where to go from here? Is there a way to use the adapted loop and force Scrapy to finish scraping all of the match elements on a start-URL page before it moves on to the next URL? There are answers describing how to do this, but they rely on storing a list of URLs for each page, which I don't have...
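
One untested sketch of how such a list could be built on the fly (it reuses the constants and imports from the spider above, and assumes the driver stays on the listing page for the duration of the loop) would be to collect the match URLs first and only yield the Requests once the clicking is finished, so the driver is no longer needed by the time Scrapy starts downloading them:

def parse(self, response: Response):
    print(f"Parsing date - {response.url}")
    driver = response.request.meta["driver"]
    tabs = driver.find_elements_by_xpath("//button[@class='hjlkds-7 khtDIT']")
    for t in tabs:
        driver.execute_script("arguments[0].click();", t)
    wait = WebDriverWait(driver, 20)
    match_urls = []
    # Iterate by index and re-locate the elements after every navigation,
    # so a stale reference is never reused.
    for idx in range(len(driver.find_elements_by_xpath(MATCHES_XPATH))):
        matches = driver.find_elements_by_xpath(MATCHES_XPATH)
        driver.execute_script("arguments[0].click();", matches[idx])
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, ELEMENT_TEST)))
        except TimeoutException:
            pass
        else:
            match_urls.append(str(driver.current_url))
        driver.back()
    # No yields happen until the clicking loop is done, so the driver is not
    # needed any more once Scrapy starts downloading these requests.
    for url in match_urls:
        yield Request(url, callback=self._parse_match)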

The page you are trying to scrape does not require Selenium, because the data is already contained in the page's HTML.

Most of the information about each match can be found in the match JSON object, so depending on what you are after you may not need to crawl the individual match pages at all.

See the code below, which shows how to parse the match data directly from the HTML.

import scrapy
import datetime as dt
from dateutil.rrule import DAILY, rrule
import json
from scrapy.crawler import CrawlerProcess

class ScraperS24(scrapy.Spider):

    name = "scores24_scraper"

    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36",
    }

    def __init__(self):
        dates = list(rrule(DAILY, dtstart=dt.datetime(2015, 1, 6), until=dt.datetime(2015, 1, 6)))
        self.start_urls = [f"https://scores24.live/en/tennis/{d.strftime('%Y-%m-%d')}" for d in dates]
        super().__init__()

    def parse(self, response):
        # obtain the json response object
        data = response.xpath("//script[contains(text(),'window.__APP__STATE__=JSON.parse')]/text()").re_first(r"window\.__APP__STATE__=JSON\.parse\((.*)\);")
        data_json = json.loads(data)       # first decode: the regex captures a JSON string literal
        data_json = json.loads(data_json)  # second decode: parse the JSON document inside that string
        tennis_data = data_json["matchFeed"]["_feed"]["tennis"] # this is a list

        for tournament in tennis_data:
            for match in tournament["matches"]:
                match_url = f"m-{match['slug']}"
                yield match # outputs the summary data on each match. Comment this line if following links to each match
                # to follow the links to each match uncomment the line below
                #yield response.follow(match_url, callback=self.parse_match) 

    def parse_match(self, response):
        self.logger.info("Parsing match on url: " + response.url)
        # parse the match details here


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(ScraperS24)
    process.start()

If you run the crawler, you will get results for each match similar to the image below.
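
If you want to persist the yielded items rather than just see them in the crawl log, one option is to pass feed-export settings to CrawlerProcess. This is a minimal sketch, assuming Scrapy 2.1+ (where the FEEDS setting is available); the output filename is arbitrary:

if __name__ == '__main__':
    process = CrawlerProcess(settings={
        # write every yielded match dict to a JSON file
        "FEEDS": {"matches.json": {"format": "json"}},
    })
    process.crawl(ScraperS24)
    process.start()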