使用 Scrapy 通过 Ajax GET 请求从无限滚动页面中抓取数据

Scraping data from infinite scrolling page using scrapy via ajax get request

我是网络抓取的新手,我想从网站 Events in Oslo(10times.com)上抓取所有活动的信息。

我编写了一个示例代码来抓取数据,如下所示:

'''A Python script to scrape data from 10times.com'''
import scrapy
import requests

class EventFinder(scrapy.Spider):
    '''Spider that collects event details from the 10times.com Oslo listing.

    ``parse`` handles the listing page named in ``start_urls`` and follows
    every event link found on it; ``parse_links`` handles each event page
    and yields the scraped fields as a dict. Only the events present in the
    initially served listing HTML are visited.
    '''

    name = 'EventSpider'  # name of the spider
    start_urls = ['https://10times.com/oslo-no?datefrom=2020-08-01&dateto=2021-07-31']

    custom_settings = {
        # All the scraped data will be stored in tmp/event_details.csv.
        # NOTE(review): newer Scrapy releases replace FEED_URI with FEEDS --
        # confirm against the Scrapy version in use.
        'FEED_URI': 'tmp/event_details.csv',
    }

    def parse(self, response):
        '''Follow every event link found on the listing page.

        Yields one request per matched ``href``; each response is handled
        by ``parse_links``.
        '''
        event_links = response.css(".mb-0 .text-decoration-none::attr(href)").getall()

        for href in event_links:
            yield response.follow(href, callback=self.parse_links)

    def parse_links(self, response):
        '''Scrape one event page and yield its details as a dict.'''
        field_names = (
            'Event Name',
            'Date',
            'Timings',
            'Location',
            'Event Type',
            'Event Tags',
        )
        # One list of matched text nodes per field, in field_names order.
        columns = (
            response.css("h1::text").extract(),
            response.css(".mb-0 span::text").extract(),
            response.css("#hvrout1 td:nth-child(1)::text").extract(),
            response.css("#map_dirr span , #map_dirr h3").css("::text").extract(),
            response.css("#hvrout2::text").extract(),
            response.css("#hvrout2 a::text").extract(),
        )

        # zip() stops at the shortest list, so a page on which any selector
        # matches nothing produces no item at all.
        for row in zip(*columns):
            yield dict(zip(field_names, row))

我编写的代码能够抓取第一页上列出的所有活动的数据,但是当向下滚动页面时,页面会通过 Ajax GET 请求动态加载更多数据,而我的代码无法抓取这些数据。我看过一些视频并阅读了一些文章,但仍然无法弄清楚如何抓取滚动时动态生成的数据。如有任何帮助,我将不胜感激。

'''A Python script to scrape data from 10times.com'''
import scrapy
import requests


class EventFinder(scrapy.Spider):
    '''Spider that walks the paginated Ajax listing of Oslo events.

    The listing is requested through the Ajax endpoint in ``url`` with a
    page number appended. ``parse`` follows every event link on a listing
    page and then requests the next page; ``parse_links`` scrapes each
    event page and yields the fields as a dict.
    '''

    name = 'EventSpider'  # name of the spider
    # Ajax endpoint prefix; the page number is appended to it.
    url = 'https://10times.com/ajax?for=scroll&path=/oslo-no&datefrom=2020-08-01&dateto=2021-07-31&ajax=1&page='
    page = 1  # number of the first listing page
    start_urls = [url + str(page)]

    custom_settings = {
        # All the scraped data will be stored in tmp/event_details.csv.
        # NOTE(review): newer Scrapy releases replace FEED_URI with FEEDS --
        # confirm against the Scrapy version in use.
        'FEED_URI': 'tmp/event_details.csv',
    }

    def parse(self, response):
        '''Follow the event links on a listing page, then request the next one.

        The page number travels with each request in ``meta['page']``; the
        start request carries none, so ``self.page`` is used for it.
        Pagination stops at the first page that contains no event links.
        '''
        event_links = response.css(".mb-0 .text-decoration-none::attr(href)").getall()
        if not event_links:
            # An empty page is treated as the end of the listing.
            return

        for href in event_links:
            yield response.follow(href, callback=self.parse_links)

        # Advance from the page just parsed rather than from the class-level
        # constant, so every response leads to a new page number.
        next_page = response.meta.get('page', self.page) + 1
        yield scrapy.Request(
            self.url + str(next_page),
            callback=self.parse,
            meta={'page': next_page},
        )

    def parse_links(self, response):
        '''Scrape one event page and yield its details as a dict.'''
        field_names = (
            'Event Name',
            'Date',
            'Timings',
            'Location',
            'Event Type',
            'Event Tags',
        )
        # One list of matched text nodes per field, in field_names order.
        columns = (
            response.css("h1::text").extract(),
            response.css(".mb-0 span::text").extract(),
            response.css("#hvrout1 td:nth-child(1)::text").extract(),
            response.css("#map_dirr span , #map_dirr h3").css("::text").extract(),
            response.css("#hvrout2::text").extract(),
            response.css("#hvrout2 a::text").extract(),
        )

        # zip() stops at the shortest list, so a page on which any selector
        # matches nothing produces no item at all.
        for row in zip(*columns):
            yield dict(zip(field_names, row))

输出:

{'Event Name': 'Nasjonale Konferanse Om Hjerneslag', 'Date': '18 - 19 Feb 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Oslo Kongressenter Folkets Hus AS', 'Event Type': ' Trade Show', 'Event Tags': 'Medical & Pharma'}
{'Event Name': 'Education Fair in Oslo', 'Date': '17 - 18 Feb 2021', 'Timings': '  10:00 AM - 07:00 PM                         (General)\n                         ', 'Location': '\n         Oslo Spektrum', 'Event Type': ' Trade Show', 'Event Tags': 'Education & Training'}
{'Event Name': 'EAAE Deans Summit', 'Date': '22 - 23 Apr 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Education & Training'}
{'Event Name': 'NAFEMS Physics Based Digital Twins', 'Date': '23 - 24 Mar 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Industrial Engineering'}
{'Event Name': 'Oslo Life Science Conference', 'Date': '15 - 18 Feb 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         University of Oslo', 'Event Type': ' Conference', 'Event Tags': 'Science & Research'}
{'Event Name': 'European Academy of Paediatric Dentistry Interim seminar', 'Date': '23 - 24 Apr 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Oslo Kongressenter Folkets Hus AS', 'Event Type': ' Conference', 'Event Tags': 'Medical & Pharma'}
{'Event Name': 'GLOBVAC Conference', 'Date': '20 - 21 Apr 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Clarion Hotel The Hub', 'Event Type': ' Conference', 'Event Tags': 'Wellness, Health & Fitness'}
{'Event Name': 'European Conference on Community Psychology', 'Date': '03 - 04 Jun 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Red Cross Conference Center', 'Event Type': ' Conference', 'Event Tags': 'Wellness, Health & Fitness'}
{'Event Name': 'Baltic Nordic Acoustics Meeting', 'Date': '03 - 05 May 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Clarion Hotel Oslo', 'Event Type': ' Conference', 'Event Tags': 'IT & Technology'}
{'Event Name': 'The European Port House Conference', 'Date': '27 - 28 May 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Business Services'}
{'Event Name': 'Petroleum Systems Conference', 'Date': '02 - 03 Feb 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Human Factors AS', 'Event Type': ' Conference', 'Event Tags': 'Power & Energy'}
{'Event Name': 'Oslo Yoga Festival', 'Date': '29 - 31 Jan 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Sagene samfunnshus', 'Event Type': ' Trade Show', 'Event Tags': 'Wellness, Health & Fitness'}
{'Event Name': 'NUGA Conference', 'Date': '28 - 30 Jan 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Clarion Hotel The Hub', 'Event Type': ' Conference', 'Event Tags': 'Medical & Pharma'}
{'Event Name': 'Anti-Corruption Nordics', 'Date': '26 - 28 Jan 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Education & Training'}
{'Event Name': 'Software', 'Date': '10 - 11 Feb 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'IT & Technology'}
{'Event Name': 'International Joint Conference on Metallurgical and Materials Engineering', 'Date': '18 - 20 Jun 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Industrial Engineering'}
{'Event Name': 'International Conference on Frontiers of Chemical Materials and Process', 'Date': '18 - 20 Jun 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Science & Research'}
{'Event Name': 'International Conference on Material Engineering and Advanced Manufacturing Technology', 'Date': '18 - 20 Jun 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Industrial Engineering'}
{'Event Name': '600Minutes Executive IT', 'Date': ' 02 Dec 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Science & Research'}
{'Event Name': "IDC's Multicloud Conference", 'Date': ' 18 Nov 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Gamle Logen - Selskapslokaler Oslo', 'Event Type': ' Conference', 'Event Tags': 'IT & Technology'}
{'Event Name': 'European Intelligence and Security Informatics Conference', 'Date': '10 - 11 Jun 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         University of Oslo', 'Event Type': ' Conference', 'Event Tags': 'Security & Defense'}
{'Event Name': 'Digitalization of Automation Systems', 'Date': '25 - 26 Nov 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Meet Ullevaal', 'Event Type': ' Conference', 'Event Tags': 'Industrial Engineering'}
{'Event Name': 'Annual Privacy Forum', 'Date': '17 - 18 Jun 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'IT & Technology'}
{'Event Name': 'International Association of Lighting Designers Enlighten Europe', 'Date': '18 - 20 Nov 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Radisson Blu Scandinavia Hotel, Oslo', 'Event Type': ' Conference', 'Event Tags': 'Building & Construction'}
{'Event Name': 'Tedx Oslo', 'Date': ' 12 Nov 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Business Services'}
{'Event Name': 'Oslo World Music Festival', 'Date': '27 Oct - 01 Nov 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Oslo Visitor Centre', 'Event Type': ' Trade Show', 'Event Tags': 'Entertainment & Media'}
{'Event Name': 'Nordic Educational Meeting', 'Date': '10 - 11 Nov 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Radisson Blu Scandinavia Hotel, Oslo', 'Event Type': ' Conference', 'Event Tags': 'Education & Training'}
{'Event Name': 'Nordic Place Branding Conference', 'Date': ' 26 Oct 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Rådhuset', 'Event Type': ' Conference', 'Event Tags': 'Banking & Finance'}
{'Event Name': 'Specsavers Clinical Conference', 'Date': ' 13 Oct 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Radisson Blu Scandinavia Hotel, Oslo', 'Event Type': ' Conference', 'Event Tags': 'Medical & Pharma'}
{'Event Name': 'IDC Future of Work conference', 'Date': ' 28 Oct 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Gamle Logen - Selskapslokaler Oslo', 'Event Type': ' Conference', 'Event Tags': 'Business Services'}
{'Event Name': 'CMO Executive Forum NO', 'Date': ' 27 Oct 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Ekebergrestauranten', 'Event Type': ' Conference', 'Event Tags': 'Business Services'}
{'Event Name': "EARMA's Annual Conference", 'Date': '29 Sep - 01 Oct 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Oslo Kongressenter Folkets Hus AS', 'Event Type': ' Conference', 'Event Tags': 'Banking & Finance'}
{'Event Name': 'EOCCS Learning Community Symposium', 'Date': '24 - 25 Sep 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'BI Norwegian Business School', 'Event Type': ' Conference', 'Event Tags': 'Education & Training'}
{'Event Name': 'CHFR Symposium', 'Date': '23 - 25 Sep 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Scandic Holmenkollen Park', 'Event Type': ' Conference', 'Event Tags': 'Medical & Pharma'}
{'Event Name': 'Nordic and Baltic Stata Conference', 'Date': ' 24 Sep 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Cancer Registry Norway', 'Event Type': ' Conference', 'Event Tags': 'IT & Technology'}
{'Event Name': '600Minutes CFO', 'Date': ' 13 Oct 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Clarion Hotel Oslo', 'Event Type': ' Conference', 'Event Tags': 'Business Services'}
{'Event Name': 'Healthy Buildings Europe', 'Date': '21 - 23 Jun 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Inland Norway University of Applied Sciences', 'Event Type': ' Conference', 'Event Tags': 'Building & Construction'}
{'Event Name': 'Access MBA Tour Oslo', 'Date': ' 24 Sep 2020', 'Timings': '  04:30 PM - 09:30 PM', 'Location': 'Venue to be announced', 'Event Type': ' Conference', 'Event Tags': 'Business Services'}
{'Event Name': 'Oslo Urban Arena', 'Date': '10 - 11 Sep 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Samfunnssalen Event & Konferanse', 'Event Type': ' Conference', 'Event Tags': 'IT & Technology'}
{'Event Name': 'World Congress on Cancer', 'Date': '14 - 16 Sep 2020', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         Soria Moria hotell og konferansesenter', 'Event Type': ' Conference', 'Event Tags': 'Medical & Pharma'}
{'Event Name': 'International Conference on Defects in Semiconductors', 'Date': '26 - 30 Jul 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         University of Oslo', 'Event Type': ' Conference', 'Event Tags': 'Electric & Electronics'}
{'Event Name': 'International Conference on Ict Systems Security and Privacy Protection', 'Date': '22 - 24 Jun 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': '\n         University of Oslo', 'Event Type': ' Conference', 'Event Tags': 'IT & Technology'}
{'Event Name': 'International Conference on Intelligent Information Systems', 'Date': '17 - 18 Jul 2021', 'Timings': ' 09:00 AM-06:00 PM (expected)', 'Location': 'Scandic KNA Hotel', 'Event Type': ' Conference', 'Event Tags': 'Business Services'}