如何使用 selenium 浏览器生成的 html 内容将动态网站内容发送到 scrapy?

How can I send Dynamic website content to scrapy with the html content generated by selenium browser?

我正在从事某些与股票相关的项目,在过去的 5 年里,我的任务是每天抓取所有数据。即从 2016 年至今。我特别想到使用 selenium,因为我可以使用爬虫和机器人来根据日期抓取数据。所以我使用了 selenium 的按钮点击,现在我希望 selenium 浏览器显示的相同数据由 scrapy 提供。 这是我现在正在研究的 website。 我在 scrapy spider 里面写了下面的代码。

class FloorSheetSpider(scrapy.Spider):
    """Spider that drives a Selenium Firefox browser through the merolagani
    floorsheet pages, one date at a time, and tries to hand the rendered
    HTML over to scrapy for parsing."""
    name = "nepse"

    def start_requests(self):
        driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
        try:
            # Dates to scrape (dd/mm/yyyy). Extend this list up to the
            # current date, e.g. '01/10/2022'.
            floorsheet_dates = ['01/03/2016', '01/04/2016']

            for date in floorsheet_dates:
                driver.get(
                    "https://merolagani.com/Floorsheet.aspx")

                driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                    ).send_keys(date)
                driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
                total_length = driver.find_element(By.XPATH,
                                                   "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
                # Pager label ends in "... N]"; last token minus ']' is the
                # total page count.
                z = int((total_length.split()[-1]).replace(']', ''))
                # Walk every page, 1..z (range(z, z + 1) visited only the
                # last page).
                for data in range(1, z + 1):
                    driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                    self.url = driver.page_source
                    # NOTE(review): page_source is an HTML document, not a
                    # URL — scrapy's Request expects a URL here, so this
                    # raises; the rendered HTML must instead be wrapped in
                    # an HtmlResponse or served via a downloader middleware.
                    yield Request(url=self.url, callback=self.parse)
        finally:
            # Release the browser even if scraping raises mid-loop.
            driver.quit()

    def parse(self, response, **kwargs):
        """Print the second text cell of every table row in the response."""
        for value in response.xpath('//tbody/tr'):
            print(value.css('td::text').extract()[1])
            print("ok"*200)

更新:回答后出错

2022-01-14 14:11:36 [twisted] CRITICAL: 
Traceback (most recent call last):
  File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/twisted/internet/defer.py", line 1661, in _inlineCallbacks
    result = current_context.run(gen.send, result)
  File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/scrapy/crawler.py", line 88, in crawl
    start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable

我想将当前网页的 html 内容发送给 scrapy 爬虫,但过去 2 天我一直收到上述异常错误。任何帮助或建议将不胜感激。

两种解决方案差别不大。解决方案 #2 更适合您的问题,但请选择您喜欢的任何内容。

解决方案 1 - 使用来自驱动程序的 html 主体创建响应并立即抓取它(您也可以将其作为参数传递给函数):

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from scrapy.http import HtmlResponse


class FloorSheetSpider(scrapy.Spider):
    """Spider that renders the merolagani floorsheet pages with Selenium
    and scrapes each page in place by wrapping the rendered HTML in an
    HtmlResponse — no network requests are ever scheduled through scrapy."""
    name = "nepse"

    def start_requests(self):
        # driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
        driver = webdriver.Chrome()
        try:
            floorsheet_dates = ['01/03/2016', '01/04/2016']  # , up to till date '01/10/2022']

            for date in floorsheet_dates:
                driver.get(
                    "https://merolagani.com/Floorsheet.aspx")

                driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                    ).send_keys(date)
                driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
                total_length = driver.find_element(By.XPATH,
                                                   "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
                # Pager label ends in "... N]"; last token minus ']' is the
                # total page count.
                z = int((total_length.split()[-1]).replace(']', ''))
                for data in range(1, z + 1):
                    driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                    self.body = driver.page_source

                    # Scrape the rendered page immediately instead of
                    # yielding a Request.
                    response = HtmlResponse(url=driver.current_url, body=self.body, encoding='utf-8')
                    for value in response.xpath('//tbody/tr'):
                        print(value.css('td::text').extract()[1])
                        print("ok"*200)
        finally:
            # Always close the browser, even if scraping raises.
            driver.quit()

        # Return an empty requests list so crawl() gets an iterable
        # (returning None raised: TypeError: 'NoneType' object is not iterable).
        return []

解决方案 2 - 使用超级简单的下载器中间件:

(parse 方法可能会有延迟,请耐心等待)。

import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By


class SeleniumMiddleware(object):
    """Downloader middleware that short-circuits the download: every
    request is answered with the HTML currently rendered in the spider's
    Selenium driver rather than fetched over the network."""

    def process_request(self, request, spider):
        # Returning an HtmlResponse from process_request stops scrapy
        # from downloading the URL itself.
        return HtmlResponse(
            url=spider.driver.current_url,
            body=spider.driver.page_source,
            encoding='utf-8',
            request=request,
        )


class FloorSheetSpider(scrapy.Spider):
    """Spider that drives a shared class-level Selenium browser through the
    floorsheet pages and yields real Requests; the SeleniumMiddleware
    registered below answers each Request from the driver's current page
    source instead of downloading anything."""
    name = "nepse"

    # Register the middleware above for this spider only; the dotted path
    # must match where SeleniumMiddleware actually lives in your project.
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'tempbuffer.spiders.yetanotherspider.SeleniumMiddleware': 543,
            # 'projects_name.path.to.your.pipeline': 543
        }
    }
    # Class attribute so the middleware can reach it as spider.driver.
    # Created at class-definition (import) time.
    driver = webdriver.Chrome()

    def start_requests(self):

        # driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())


        floorsheet_dates = ['01/03/2016','01/04/2016']#, up to till date '01/10/2022']

        for date in floorsheet_dates:
            self.driver.get(
                "https://merolagani.com/Floorsheet.aspx")

            # Type the date into the filter box and trigger the search.
            self.driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                ).send_keys(date)
            self.driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
            total_length = self.driver.find_element(By.XPATH,
                                               "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
            # Pager label ends in "... N]"; last token minus ']' gives the
            # total page count.
            z = int((total_length.split()[-1]).replace(']', ''))
            for data in range(1, z + 1):
                self.driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                self.body = self.driver.page_source
                self.url = self.driver.current_url

                # dont_filter=True: consecutive pages share the same URL,
                # so scrapy's duplicate filter would otherwise drop them.
                # NOTE(review): scrapy schedules requests asynchronously, so
                # the middleware may read driver state after this loop has
                # advanced to a later page — confirm ordering in practice.
                yield Request(url=self.url, callback=self.parse, dont_filter=True)

    def parse(self, response, **kwargs):
        """Print the second text cell of every table row in the response."""
        print('test ok')
        for value in response.xpath('//tbody/tr'):
            print(value.css('td::text').extract()[1])
            print("ok"*200)

请注意,我使用了 chrome,所以请像您的原始代码一样将其改回 firefox。