How to scrape using Scrapy with infinite scroll and without next-page information
I need to scrape URLs with Scrapy, but I can't scroll down the site to load all of its elements.
I tried looking for next-page information, but couldn't find any.
My spider code is:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from appinformatica.items import appinformaticaItem
import w3lib.html

class appinformaticaSpider(CrawlSpider):
    name = 'appinformatica'
    item_count = 0
    start_urls = ['https://www.appinformatica.com/telefonos/moviles/']

    rules = {
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@class="info-ficha"]/div[1]/a')),
             callback='parse_item', follow=False)
    }

    def parse_item(self, response):
        item = appinformaticaItem()
        self.item_count += 1
        item['Modelo'] = w3lib.html.remove_tags(response.xpath("//h1").get(default=''))
        item['Position'] = self.item_count
        item['Precio'] = w3lib.html.remove_tags(response.xpath('//*[@id="ficha-producto"]/div[2]/div[1]/div/div[1]').get(default=''))
        item['PrecioTienda'] = w3lib.html.remove_tags(response.xpath('//*[@id="ficha-producto"]/div[2]/div[1]/div/div[2]').get(default=''))
        item['Stock'] = w3lib.html.remove_tags(response.xpath('//*[@id="ficha-producto"]/div[2]/div[3]/p[3]').get(default=''))
        item['Submodelo'] = w3lib.html.remove_tags(response.xpath('//*[@id="ficha-producto"]/div[2]/div[3]/p[2]/strong[2]').get(default=''))
        item['Url'] = w3lib.html.remove_tags(response.url)
        yield item
Can anyone help me?
Change allow to allow=(r'/moviles/.*\.html'), set follow=True, and add your allowed_domains. Try this:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# from appinformatica.items import appinformaticaItem
import w3lib.html

class appinformaticaSpider(CrawlSpider):
    name = 'appinformatica'
    allowed_domains = ["appinformatica.com"]
    item_count = 0
    start_urls = ['https://www.appinformatica.com/telefonos/moviles/']

    rules = (
        # Follow every link whose URL matches the product pattern. With
        # follow=True the spider keeps extracting matching links from each
        # page it visits, so paginated listing pages are discovered without
        # needing an explicit "next page" link.
        Rule(LinkExtractor(allow=(r'/moviles/.*\.html',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # A plain dict works here, so the project's Item class is optional.
        item = {}
        self.item_count += 1
        item['Modelo'] = w3lib.html.remove_tags(response.xpath("//h1").get(default=''))
        item['Position'] = self.item_count
        item['Precio'] = w3lib.html.remove_tags(response.xpath('//*[@id="ficha-producto"]/div[2]/div[1]/div/div[1]').get(default=''))
        item['PrecioTienda'] = w3lib.html.remove_tags(response.xpath('//*[@id="ficha-producto"]/div[2]/div[1]/div/div[2]').get(default=''))
        item['Stock'] = w3lib.html.remove_tags(response.xpath('//*[@id="ficha-producto"]/div[2]/div[3]/p[3]').get(default=''))
        item['Submodelo'] = w3lib.html.remove_tags(response.xpath('//*[@id="ficha-producto"]/div[2]/div[3]/p[2]/strong[2]').get(default=''))
        item['Url'] = w3lib.html.remove_tags(response.url)
        yield item
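
If you want to try the spider without wiring it into a full Scrapy project, here is a minimal sketch that runs it from a plain script using Scrapy's CrawlerProcess. The FEEDS setting (available in Scrapy 2.1+) and the items.json output path are my choices for illustration, not part of the original answer:

from scrapy.crawler import CrawlerProcess

# Run the spider in-process and export the scraped dicts to a JSON file.
process = CrawlerProcess(settings={
    "FEEDS": {"items.json": {"format": "json"}},
})
process.crawl(appinformaticaSpider)
process.start()  # blocks until the crawl finishes

Since parse_item yields plain dicts, the commented-out appinformaticaItem import is not needed for this to run.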