Defining Rules in Scrapy while accessing through flask

I am running a Scrapy spider using Flask and Crochet. Here I am using a Rule with a LinkExtractor to define the crawling rules. In the rule, I am setting allow_domains, which is passed in from the Flask application.

spider.py

from urllib.parse import urlparse

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class myCrawler(CrawlSpider):
    name = 'symphony'
    base_url=''
    start_urls = []
    allowed_domains = ''

    def __init__(self, category='', **kwargs):
        super().__init__(**kwargs)
        self.base_url = category
        self.allowed_domains = ['.'.join(urlparse(self.base_url).netloc.split('.')[-2:])]
        self.start_urls.append(self.base_url)
        print(f"Base url is {self.base_url} and allowed domain is {self.allowed_domains}")      

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'sitescrapper.sitescrapper.middlewares.RotateUserAgentMiddleware': 400, },
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,
        # Duplicates pipeline
        'ITEM_PIPELINES': {'sitescrapper.sitescrapper.pipelines.DuplicatesPipeline': 300},

        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}}
    }

    rules = (
        Rule(
            # 'self' does not exist at class level, so the next line raises a NameError
            LinkExtractor(allow_domains='.'.join(urlparse(self.base_url).netloc.split('.')[-2:])),
            process_links=process_links,
            callback='parse_item',
            follow=True
        ),
    )

Here I have used LinkExtractor(allow_domains='.'.join(urlparse(self.base_url).netloc.split('.')[-2:])). But self is not defined at class level, so this raises an error. How can I assign the value of the expression '.'.join(urlparse(self.base_url).netloc.split('.')[-2:]) (which is the same value as self.allowed_domains) to allow_domains? Or is there a better way to do this?
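For reference, a quick standalone illustration of what that expression computes; the URL here is just an example:

from urllib.parse import urlparse

url = "https://blog.example.com/some/page"
netloc = urlparse(url).netloc               # 'blog.example.com'
domain = '.'.join(netloc.split('.')[-2:])   # 'example.com' -- the last two labels of the host
print(domain)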

The problem here is that the CrawlSpider constructor (__init__) also processes the rules argument, so if we need to assign the rules ourselves, we have to do it before calling the default constructor.

from urllib.parse import urlparse

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class myCrawler(CrawlSpider):
    name = 'symphony'
    rotate_user_agent = True
    base_url=''
    start_urls = []
    allowed_domains = ''

    def __init__(self, category='', **kwargs):
        self.base_url = category
        self.allowed_domains = ['.'.join(urlparse(self.base_url).netloc.split('.')[-2:])]
        self.start_urls.append(self.base_url)
        print(f"Base url is {self.base_url} and allowed domain is {self.allowed_domains}")  

        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                process_links=process_links,
                callback='parse_item',
                follow=True
            ),
        )   
        super().__init__(**kwargs)

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'sitescrapper.sitescrapper.middlewares.RotateUserAgentMiddleware': 400, },
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,

        # Duplicates pipeline
        'ITEM_PIPELINES': {'sitescrapper.sitescrapper.pipelines.DuplicatesPipeline': 300},

        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}}
    }
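
For completeness, here is a minimal sketch of how such a spider might be launched from Flask via crochet, matching the setup described in the question. The route, query parameter, and timeout are assumptions for illustration, not part of the original code:

from crochet import setup, wait_for
from flask import Flask, request
from scrapy.crawler import CrawlerRunner

# from spiders import myCrawler  # import path depends on your project layout (assumed)

setup()  # hook crochet into the Twisted reactor before Scrapy uses it

app = Flask(__name__)
runner = CrawlerRunner()

@wait_for(timeout=600.0)  # block the Flask request until the crawl finishes
def run_spider(url):
    # 'category' is the kwarg that myCrawler.__init__ expects
    return runner.crawl(myCrawler, category=url)

@app.route('/crawl')  # hypothetical endpoint
def crawl():
    url = request.args.get('url', '')
    run_spider(url)
    return 'Crawl finished'

Because the rules are now assigned in __init__ before super().__init__(**kwargs) runs, CrawlSpider._compile_rules() sees the per-instance rules built from the URL that Flask passed in.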