Scrapy reverses order of arguments in url Python

I'm running a crawler that starts from http://www.johnlscott.com/agent-search.aspx and crawls through to the office rosters.

The office roster addresses look like this: http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=8627 - but Scrapy crawls http://www.johnlscott.com/agent-search.aspx?OfficeID=8627&p=agentResults.asp, which is a dead page. The two query parts after .aspx get swapped.

I even loaded every address explicitly as start_urls by hand, and it still happens.

I'm using the latest Scrapy with python-2.7 on Windows 8.1.
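
What appears to be happening is URL canonicalization: the link extractors in the Scrapy versions of that era canonicalize extracted links by default, and canonicalization sorts query parameters alphabetically, which matches the swap described above. A quick check with w3lib (the URL library Scrapy uses under the hood) reproduces the effect; this is only an illustration, not part of the spider:

# Illustration only: w3lib's canonicalize_url sorts query parameters
# alphabetically, which is why OfficeID ends up in front of p.
from w3lib.url import canonicalize_url

url = "http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=8627"
print(canonicalize_url(url))
# http://www.johnlscott.com/agent-search.aspx?OfficeID=8627&p=agentResults.asp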

Code sample:

import csv

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector

# JLSItem is the item class defined in the project's items.py
# (standard Scrapy project layout assumed)
from ..items import JLSItem


class JLSSpider(CrawlSpider):

    name = 'JLS'
    allowed_domains = ['johnlscott.com']
    # start_urls = ['http://www.johnlscott.com/agent-search.aspx']

    rules = (
        Rule(callback="parse_start_url", follow=True),)

    def start_requests(self):
        with open('hrefnums.csv', 'rbU') as ifile:
            read = csv.reader(ifile)
            for row in read:
                for col in row:
                    # I have a csv of the office IDs: (Just letting it crawl through them creates the same issue)
                    yield self.make_requests_from_url("http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=%s" % col)


    def parse_start_url(self, response):
        items = []
        sel = Selector(response)
        sections = sel.xpath("//tr/td/table[@id='tbAgents']/tr")
        for section in sections:
            item = JLSItem()
            item['name'] = section.xpath("td[2]/text()")[0].extract().replace(u'\xa0', ' ').strip()         
            items.append(item)
        return(items)

Crawling like this works without any problem, though:

from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request


class JLSSpider(CrawlSpider):
    name = 'JLS'
    allowed_domains = ['johnlscott.com']

    def start_requests(self):
        yield Request("http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=8627", callback=self.parse_item)

    def parse_item(self, response):
        print response.body
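
Along the same lines, a workaround (just a sketch, reusing the hrefnums.csv file and the roster URL pattern from the question) is to skip the crawl rules entirely and yield a plain Request per office ID, since requests built this way keep the query order intact, as the snippet above shows:

import csv

from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request


class JLSSpider(CrawlSpider):
    name = 'JLS'
    allowed_domains = ['johnlscott.com']

    def start_requests(self):
        # Build each roster URL directly from the office IDs in the CSV,
        # bypassing the rules/link extractor that reorder the parameters.
        with open('hrefnums.csv', 'rU') as ifile:
            for row in csv.reader(ifile):
                for col in row:
                    yield Request(
                        "http://www.johnlscott.com/agent-search.aspx"
                        "?p=agentResults.asp&OfficeID=%s" % col,
                        callback=self.parse_item)

    def parse_item(self, response):
        print response.body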

You can prevent the URL parts from being swapped by passing the canonicalize=False option to the link extractor in your code:

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class JLSSpider(CrawlSpider):

    name = 'JLS'
    allowed_domains = ['johnlscott.com']
    start_urls = ['http://www.johnlscott.com/agent-search.aspx']

    rules = (
        # http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=7859
        Rule(
            LinkExtractor(
                allow=('p=agentResults.asp&OfficeID=',),
                canonicalize=False
            ),
            callback='parse_roster',
            follow=True),
    )

    def parse_roster(self, response):
        pass
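
For what it's worth, the scrapy.contrib paths above match the older releases from the question's environment; on newer Scrapy versions the same rule should look roughly like this (and recent releases no longer canonicalize extracted links by default), assuming the current import locations:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class JLSSpider(CrawlSpider):
    name = 'JLS'
    allowed_domains = ['johnlscott.com']
    start_urls = ['http://www.johnlscott.com/agent-search.aspx']

    rules = (
        Rule(
            # canonicalize=False keeps extracted links exactly as found
            LinkExtractor(allow=('p=agentResults.asp&OfficeID=',),
                          canonicalize=False),
            callback='parse_roster',
            follow=True),
    )

    def parse_roster(self, response):
        pass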