scrapy - CrawlSpider 没有执行我的回调
scrapy - CrawlSpider is not executing my callback
我创建了一个蜘蛛来收集 scratch.mit.edu 上的用户名。
它成功导航到了个人资料页面,但没有运行回调函数。我认为这可能与我编写 allow 属性的方式有关。
我的代码:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class ForumnSpider(CrawlSpider):
    """Crawl scratch.mit.edu and collect usernames from profile pages."""

    name = 'forumn'
    allowed_domains = ['scratch.mit.edu']
    start_urls = [
        'https://scratch.mit.edu/users/accountcraft123/'
    ]

    # CrawlSpider tries rules IN ORDER and applies only the FIRST rule
    # whose extractor matches a link.  The original code put a catch-all
    # Rule(LinkExtractor()) first, so every /users/ link was consumed by
    # that rule (which has no callback) and parse_item never ran.
    # The specific profile rule must come first.
    rules = (
        Rule(
            LinkExtractor(allow=(r'/users/',)),  # raw string inside a real tuple
            callback='parse_item',
            follow=True,  # keep following links found on profile pages too
        ),
        # Generic fallback: follow everything else without a callback.
        Rule(LinkExtractor()),
    )

    def parse_item(self, response):
        """Log a profile page and yield the username found on it.

        The original version extracted the username but discarded it;
        yielding a dict hands it to Scrapy's item pipeline/feed export.
        """
        self.logger.info('This is a profile page. %s', response.url)
        username = response.xpath(
            '//div[@class="header-text"]/h2/text()').get()
        yield {'username': username}
@iL0g1c,到目前为止下面的代码可以正常工作:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class ForumnSpider(CrawlSpider):
    """Crawl scratch.mit.edu and collect usernames from profile pages."""

    name = 'forumn'
    allowed_domains = ['scratch.mit.edu']
    start_urls = ['https://scratch.mit.edu/users/accountcraft123/']

    # NOTE: never use 'parse' as a CrawlSpider callback.  CrawlSpider
    # implements parse() itself to drive the rule machinery; overriding
    # it (as the original did) silently breaks link following.  Use a
    # differently named callback such as parse_item.
    rules = (
        Rule(LinkExtractor(allow=(r'/users/.*',)), follow=True,
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Log a profile page and yield the username scraped from it."""
        self.logger.info('This is a profile page. %s', response.url)
        # Yield the value instead of discarding it so it reaches the
        # item pipeline / feed exports.
        yield {
            'username': response.xpath(
                '//div[@class="header-text"]/h2/text()').get(),
        }
我创建了一个蜘蛛来收集 scratch.mit.edu 上的用户名。 它成功导航到了个人资料页面,但没有运行回调函数。我认为这可能与我编写 allow 属性的方式有关。
我的代码:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class ForumnSpider(CrawlSpider):
    """Spider that gathers usernames from scratch.mit.edu profiles."""

    name = 'forumn'
    allowed_domains = ['scratch.mit.edu']
    start_urls = [
        'https://scratch.mit.edu/users/accountcraft123/'
    ]

    # Only the first rule whose LinkExtractor matches a given link is
    # used -- rules are not cumulative.  With the bare Rule(LinkExtractor())
    # listed first (as in the original), profile URLs matched it and the
    # callback-carrying rule below was never reached.  Order the rules
    # from most specific to most general.
    rules = (
        Rule(
            LinkExtractor(allow=(r'/users/',)),  # tuple, not a bare string
            callback='parse_item',
            follow=True,  # continue crawling from profile pages
        ),
        Rule(LinkExtractor()),  # follow all remaining links, no callback
    )

    def parse_item(self, response):
        """Yield the username shown on a profile page.

        The original extracted the text but never returned it; yielding
        makes the result visible to pipelines and feed exports.
        """
        self.logger.info('This is a profile page. %s', response.url)
        name_text = response.xpath(
            '//div[@class="header-text"]/h2/text()').get()
        yield {'username': name_text}
@iL0g1c,到目前为止下面的代码可以正常工作:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class ForumnSpider(CrawlSpider):
    """Spider that gathers usernames from scratch.mit.edu profiles."""

    name = 'forumn'
    allowed_domains = ['scratch.mit.edu']
    start_urls = ['https://scratch.mit.edu/users/accountcraft123/']

    # Do NOT name the callback 'parse': CrawlSpider relies on its own
    # parse() method internally, and shadowing it (as the original code
    # did) disables the rule-based link following.
    rules = (
        Rule(LinkExtractor(allow=(r'/users/.*',)), follow=True,
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Log the profile URL and yield the scraped username."""
        self.logger.info('This is a profile page. %s', response.url)
        # Yield rather than discard so the value flows into Scrapy's
        # item pipeline / feed exports.
        yield {
            'username': response.xpath(
                '//div[@class="header-text"]/h2/text()').get(),
        }