Scrapy 无法抓取下一页
Scrapy not able to scrape for the next page
我想抓取后续页面(下一页及之后)的信息,但是我的代码只能抓取到第一页的信息。
我的代码如下:
# -*- coding: utf-8 -*-
import scrapy
from ..items import PropertyItem
# NOTE(review): indentation was lost in this copy of the snippet, so the
# nesting of the statements below cannot be determined from the text alone.
# Spider subclass that collects property listing fields from a search page.
class Starprop(scrapy.Spider):
name = 'starprop'
# allowed_domains lists starproperty.com while start_urls points at
# starproperty.my. Requests yielded from callbacks to a domain outside
# allowed_domains are dropped by Scrapy's offsite filtering.
allowed_domains = ['starproperty.com']
start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']
# Callback for each downloaded page: fills item fields and yields them, then
# looks for a link to another page.
def parse(self, response):
# A single item object is created here, before the loop over the listings.
item = PropertyItem ()
property_list = response.css('.mb-4 div')
for property in property_list:
# Each field is extracted as a list of the text nodes matched by the CSS
# selector within the current listing node.
property_name = property.css ('.property__name::text').extract()
property_price = property.css('.property__price::text').extract()
property_location = property.css ('.property__location::text').extract()
property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
property_phone = property.css ('.property__agentcontacts a span::text').extract()
item['property_name']= property_name
item['property_price']= property_price
item['property_location'] = property_location
item['property_agent'] = property_agent
item['property_phone'] = property_phone
yield item
# Reads the href of the link inside the 10th .page-item child; presumably
# that element is the "next page" link -- confirm against the page markup.
next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
if next_page is not None:
# Schedule the linked page to be handled by this same callback.
yield response.follow(next_page, callback = self.parse)
可能是因为缩进?
尝试更改:
# NOTE(review): indentation was lost in this copy. Presumably this "before"
# version had the pagination lines nested at the same level as `yield item`,
# i.e. inside the for-loop -- confirm against the original post.
yield item
next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback = self.parse)
至
# NOTE(review): indentation was lost in this copy. Presumably this "after"
# version keeps `yield item` inside the for-loop and dedents the pagination
# lines to the method level so they run once per page -- confirm against the
# original post.
yield item
next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback = self.parse)
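问题完全出在您的 allowed_domains 上:start_urls 使用的域名是 starproperty.my,而 allowed_domains 中写的是 starproperty.com,因此下一页的请求会被当作站外请求过滤掉。
把它改成正确的域名即可(但您还需要修复缩进)。另外,我确定您是想在循环内部定义 item: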
class Starprop(scrapy.Spider):
    """Spider that scrapes property listings from a starproperty.my search.

    Crawling starts at the first search-results page and continues for as
    long as a pagination link is found on the current page.
    """

    name = 'starprop'
    # Must match the domain used in start_urls; requests yielded from
    # callbacks to any other domain are dropped by Scrapy's offsite filtering.
    allowed_domains = ['starproperty.my']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        """Yield one item per listing node, then a request for the next page.

        Args:
            response: The downloaded search-results page.

        Yields:
            A ``PropertyItem`` for every node matched by ``.mb-4 div``; each
            field holds the list of text nodes matched by its selector.
            Afterwards, a follow-up request handled by this same callback
            when a pagination href is present.
        """
        for listing in response.css('.mb-4 div'):
            # A fresh item per listing so yielded items never share state.
            item = PropertyItem()
            item['property_name'] = listing.css('.property__name::text').getall()
            item['property_price'] = listing.css('.property__price::text').getall()
            item['property_location'] = listing.css('.property__location::text').getall()
            # NOTE(review): the class is repeated in this selector; kept
            # verbatim -- confirm against the page markup.
            item['property_agent'] = listing.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').getall()
            item['property_phone'] = listing.css('.property__agentcontacts a span::text').getall()
            yield item

        # Pagination is resolved once per page, after every listing has been
        # emitted. Presumably the 10th .page-item holds the "next" link --
        # confirm against the page markup.
        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
我想抓取后续页面(下一页及之后)的信息,但是我的代码只能抓取到第一页的信息。
我的代码如下:
# -*- coding: utf-8 -*-
import scrapy
from ..items import PropertyItem
# NOTE(review): indentation was lost in this copy of the snippet, so the
# nesting of the statements below cannot be determined from the text alone.
# Spider subclass that collects property listing fields from a search page.
class Starprop(scrapy.Spider):
name = 'starprop'
# allowed_domains lists starproperty.com while start_urls points at
# starproperty.my. Requests yielded from callbacks to a domain outside
# allowed_domains are dropped by Scrapy's offsite filtering.
allowed_domains = ['starproperty.com']
start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']
# Callback for each downloaded page: fills item fields and yields them, then
# looks for a link to another page.
def parse(self, response):
# A single item object is created here, before the loop over the listings.
item = PropertyItem ()
property_list = response.css('.mb-4 div')
for property in property_list:
# Each field is extracted as a list of the text nodes matched by the CSS
# selector within the current listing node.
property_name = property.css ('.property__name::text').extract()
property_price = property.css('.property__price::text').extract()
property_location = property.css ('.property__location::text').extract()
property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
property_phone = property.css ('.property__agentcontacts a span::text').extract()
item['property_name']= property_name
item['property_price']= property_price
item['property_location'] = property_location
item['property_agent'] = property_agent
item['property_phone'] = property_phone
yield item
# Reads the href of the link inside the 10th .page-item child; presumably
# that element is the "next page" link -- confirm against the page markup.
next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
if next_page is not None:
# Schedule the linked page to be handled by this same callback.
yield response.follow(next_page, callback = self.parse)
可能是因为缩进? 尝试更改:
# NOTE(review): indentation was lost in this copy. Presumably this "before"
# version had the pagination lines nested at the same level as `yield item`,
# i.e. inside the for-loop -- confirm against the original post.
yield item
next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback = self.parse)
至
# NOTE(review): indentation was lost in this copy. Presumably this "after"
# version keeps `yield item` inside the for-loop and dedents the pagination
# lines to the method level so they run once per page -- confirm against the
# original post.
yield item
next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback = self.parse)
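问题完全出在您的 allowed_domains 上:start_urls 使用的域名是 starproperty.my,而 allowed_domains 中写的是 starproperty.com,因此下一页的请求会被当作站外请求过滤掉。
把它改成正确的域名即可(但您还需要修复缩进)。另外,我确定您是想在循环内部定义 item: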
class Starprop(scrapy.Spider):
    """Spider that scrapes property listings from a starproperty.my search.

    Crawling starts at the first search-results page and continues for as
    long as a pagination link is found on the current page.
    """

    name = 'starprop'
    # Must match the domain used in start_urls; requests yielded from
    # callbacks to any other domain are dropped by Scrapy's offsite filtering.
    allowed_domains = ['starproperty.my']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        """Yield one item per listing node, then a request for the next page.

        Args:
            response: The downloaded search-results page.

        Yields:
            A ``PropertyItem`` for every node matched by ``.mb-4 div``; each
            field holds the list of text nodes matched by its selector.
            Afterwards, a follow-up request handled by this same callback
            when a pagination href is present.
        """
        for listing in response.css('.mb-4 div'):
            # A fresh item per listing so yielded items never share state.
            item = PropertyItem()
            item['property_name'] = listing.css('.property__name::text').getall()
            item['property_price'] = listing.css('.property__price::text').getall()
            item['property_location'] = listing.css('.property__location::text').getall()
            # NOTE(review): the class is repeated in this selector; kept
            # verbatim -- confirm against the page markup.
            item['property_agent'] = listing.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').getall()
            item['property_phone'] = listing.css('.property__agentcontacts a span::text').getall()
            yield item

        # Pagination is resolved once per page, after every listing has been
        # emitted. Presumably the 10th .page-item holds the "next" link --
        # confirm against the page markup.
        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)