Defining Rules in Scrapy while accessing through flask
I am running a Scrapy spider through Flask with crochet. I am using a Rule with a LinkExtractor to define the crawling rules, and in that rule I want to set allow_domains from a value passed in by the Flask application.
spider.py
from urllib.parse import urlparse

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class myCrawler(CrawlSpider):
    name = 'symphony'
    base_url = ''
    start_urls = []
    allowed_domains = ''

    def __init__(self, category='', **kwargs):
        super().__init__(**kwargs)
        self.base_url = category
        self.allowed_domains = ['.'.join(urlparse(self.base_url).netloc.split('.')[-2:])]
        self.start_urls.append(self.base_url)
        print(f"Base url is {self.base_url} and allowed domain is {self.allowed_domains}")

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'sitescrapper.sitescrapper.middlewares.RotateUserAgentMiddleware': 400},
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,
        # Duplicates pipeline
        'ITEM_PIPELINES': {'sitescrapper.sitescrapper.pipelines.DuplicatesPipeline': 300},
        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}},
    }

    rules = (
        Rule(
            # NameError: 'self' does not exist at class-definition time
            LinkExtractor(allow_domains='.'.join(urlparse(self.base_url).netloc.split('.')[-2:])),
            process_links=process_links,  # defined elsewhere in the project
            callback='parse_item',
            follow=True
        ),
    )
Here I have given LinkExtractor(allow_domains='.'.join(urlparse(self.base_url).netloc.split('.')[-2:])) (for example, for https://www.example.com/page this expression evaluates to 'example.com'). But self is not defined at class level, so this raises an error. How can I assign the value of that expression, which is the same as self.allowed_domains, to allow_domains? Or is there a better way to do this?
The problem here is that the CrawlSpider constructor (__init__) also processes the rules attribute, so if we need to assign the rules dynamically, we must do so before calling the default constructor.
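For reference, this is roughly what happens inside Scrapy. The sketch below is a simplified paraphrase of CrawlSpider, not the exact source, and the internals vary a little between Scrapy versions: __init__ snapshots self.rules via _compile_rules(), so any rules assigned after super().__init__() returns are never seen.

import copy
from scrapy.spiders import Spider

class CrawlSpider(Spider):  # simplified sketch, not the real source
    rules = ()

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self._compile_rules()  # self.rules is read here, once

    def _compile_rules(self):
        # copy each Rule and bind its string callback to a method of this spider
        self._rules = []
        for rule in self.rules:
            self._rules.append(copy.copy(rule))
            self._rules[-1]._compile(self)

With that in mind, the fix is to build self.rules before invoking the parent constructor: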
from urllib.parse import urlparse

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class myCrawler(CrawlSpider):
    name = 'symphony'
    rotate_user_agent = True
    base_url = ''
    start_urls = []
    allowed_domains = ''

    def __init__(self, category='', **kwargs):
        self.base_url = category
        self.allowed_domains = ['.'.join(urlparse(self.base_url).netloc.split('.')[-2:])]
        self.start_urls.append(self.base_url)
        print(f"Base url is {self.base_url} and allowed domain is {self.allowed_domains}")
        # Assign the rules *before* calling the parent constructor,
        # because CrawlSpider.__init__ compiles self.rules.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                process_links=process_links,  # defined elsewhere in the project
                callback='parse_item',
                follow=True
            ),
        )
        super().__init__(**kwargs)

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'sitescrapper.sitescrapper.middlewares.RotateUserAgentMiddleware': 400},
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,
        # Duplicates pipeline
        'ITEM_PIPELINES': {'sitescrapper.sitescrapper.pipelines.DuplicatesPipeline': 300},
        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}},
    }
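For completeness, here is a minimal sketch of the Flask side that hands the URL to the spider as category via crochet. The endpoint name, query parameter, timeout, and import path are assumptions for illustration, not from the original project:

from flask import Flask, request
from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner

from sitescrapper.sitescrapper.spiders.spider import myCrawler  # import path is a guess

setup()  # let Scrapy's Twisted reactor run alongside Flask
app = Flask(__name__)
runner = CrawlerRunner()

@wait_for(timeout=600.0)  # block the HTTP request until the crawl finishes (or times out)
def run_spider(url):
    # 'category' is the keyword argument that myCrawler.__init__ expects
    return runner.crawl(myCrawler, category=url)

@app.route('/crawl')  # hypothetical endpoint, e.g. GET /crawl?url=https://www.example.com
def crawl():
    run_spider(request.args.get('url', ''))
    return 'crawl finished'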