Scrapy reverses order of arguments in url (Python)
I'm running a spider that crawls from http://www.johnlscott.com/agent-search.aspx into the office rosters.
The office roster addresses look like this: http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=8627 - but Scrapy crawls http://www.johnlscott.com/agent-search.aspx?OfficeID=8627&p=agentResults.asp, which is a dead page. The two parts after .aspx get swapped.
It still happens even when I manually load each address explicitly as start_urls.
I'm using the latest Scrapy with python-2.7 on Windows 8.1.
Code sample:
import csv

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector

# JLSItem is defined in the project's items module


class JLSSpider(CrawlSpider):
    name = 'JLS'
    allowed_domains = ['johnlscott.com']
    # start_urls = ['http://www.johnlscott.com/agent-search.aspx']

    rules = (
        Rule(callback="parse_start_url", follow=True),
    )

    def start_requests(self):
        # I have a csv of the office IDs (just letting it crawl through them creates the same issue)
        with open('hrefnums.csv', 'rbU') as ifile:
            read = csv.reader(ifile)
            for row in read:
                for col in row:
                    yield self.make_requests_from_url("http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=%s" % col)

    def parse_start_url(self, response):
        items = []
        sel = Selector(response)
        sections = sel.xpath("//tr/td/table[@id='tbAgents']/tr")
        for section in sections:
            item = JLSItem()
            item['name'] = section.xpath("td[2]/text()")[0].extract().replace(u'\xa0', ' ').strip()
            items.append(item)
        return items
Crawling like this works fine, though:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request


class JLSSpider(CrawlSpider):
    name = 'JLS'
    allowed_domains = ['johnlscott.com']

    def start_requests(self):
        yield Request("http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=8627", callback=self.parse_item)

    def parse_item(self, response):
        print response.body
You can pass the option canonicalize=False to the LinkExtractor to keep Scrapy from swapping the url parts:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class JLSSpider(CrawlSpider):
    name = 'JLS'
    allowed_domains = ['johnlscott.com']
    start_urls = ['http://www.johnlscott.com/agent-search.aspx']

    rules = (
        # http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=7859
        Rule(
            LinkExtractor(
                allow=('p=agentResults.asp&OfficeID=',),
                canonicalize=False
            ),
            callback='parse_roster',
            follow=True
        ),
    )

    def parse_roster(self, response):
        pass
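For context, the swap most likely comes from the link extraction step: by default the LinkExtractor canonicalizes every extracted url using w3lib's canonicalize_url, and canonicalization sorts the query arguments alphabetically, which puts OfficeID in front of p. A quick sketch to see the effect (w3lib ships as a Scrapy dependency):

from w3lib.url import canonicalize_url

url = "http://www.johnlscott.com/agent-search.aspx?p=agentResults.asp&OfficeID=8627"
# canonicalize_url sorts the query string, reproducing the dead url Scrapy requested
print canonicalize_url(url)
# http://www.johnlscott.com/agent-search.aspx?OfficeID=8627&p=agentResults.asp

Since this .asp handler apparently depends on the original parameter order, keeping canonicalize=False in the Rule is the way to go.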