Scrapy: scraping URLs in sequence and repeated output
Currently this spider works and returns results, but I have a couple of problems. The first is the order in which pages are scraped: I want it to start at page 1 and run through the range I set, but right now it seems to proceed randomly and repeats pages. The second is the output, which is all repeated, has empty values, or is out of order. I don't know whether the problem is in the rules or in the spider itself.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class QuotesSpider(CrawlSpider):
    name = "catspider"

    start_urls = []
    for i in range(1, 10):
        if i % 2 == 1:
            start_urls.append('https://www.worldcat.org/title/rose-in-bloom/oclc/' + str(i) + '&referer=brief_results')

    rules = (
        Rule(LinkExtractor(allow='title')),
        Rule(LinkExtractor(allow='oclc'), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {
            'title': response.css('h1.title::text').get(),
            'author': response.css('td[id="bib-author-cell"] a::text').getall(),
            'publisher': response.css('td[id="bib-publisher-cell"]::text').get(),
            'format': response.css('span[id="editionFormatType"] span::text').get(),
            'isbn': response.css('tr[id="details-standardno"] td::text').get(),
            'oclc': response.css('tr[id="details-oclcno"] td::text').get(),
        }
Extra info: from someone with more experience with scrapy, which is better and why, XPath or CSS selectors?
Thanks for any information.
You can paginate directly in start_urls using a for loop over range(); this kind of pagination is roughly twice as fast as the alternatives. And restricting the rules with XPath is one of the best approaches when each item contains a link.
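Regarding the crawl order itself: Scrapy is asynchronous, so with the default concurrency the start_urls are not guaranteed to come back first-to-last. A minimal sketch of one way to force sequential fetching (this spider and its parse stub are illustrative, not part of your project; it trades speed for ordering):

import scrapy

class OrderedCatSpider(scrapy.Spider):
    name = "orderedcatspider"

    # One request in flight at a time, so responses arrive in the
    # order the requests were scheduled.
    custom_settings = {'CONCURRENT_REQUESTS': 1}

    def start_requests(self):
        urls = ['https://www.worldcat.org/search?q=oclc&fq=&dblist=638&start=' + str(i) + '1&qt=page_number_link' for i in range(1, 11)]
        for idx, url in enumerate(urls):
            # Higher priority is dequeued first, so earlier pages stay
            # at the front of the scheduler queue.
            yield scrapy.Request(url, priority=len(urls) - idx, callback=self.parse)

    def parse(self, response):
        self.logger.info('fetched %s', response.url)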
Extra info: from someone with more experience with scrapy, which is better and why, XPath or CSS selectors?

As per your "Extra info" comment: both XPath and CSS element locators work well, but XPath is a little richer, because XPath makes it easy to move both up and down the HTML tree, and you can also mix the two, applying XPath and CSS together on the same selector.
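For example, the two locator types chain freely on the same selector; a small illustration using the fields from your spider (only the mixing style is new here, the selectors come from your code):

# css() narrows the selection to the author cell, then xpath() walks
# the subtree from there; both return a SelectorList, so they chain.
authors = response.css('td#bib-author-cell').xpath('.//a/text()').getall()
# The reverse direction works too: locate by XPath, extract via CSS.
title = response.xpath('//h1[@class="title"]').css('::text').get()

Here is a working example: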
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess

class QuotesSpider(CrawlSpider):
    name = "catspider"

    # Paginate directly in start_urls: start=11, 21, ..., 101
    start_urls = ['https://www.worldcat.org/search?q=oclc&fq=&dblist=638&start=' + str(i) + '1&qt=page_number_link' for i in range(1, 11)]

    # Follow only the result links found under elements with class="name"
    rules = (Rule(LinkExtractor(restrict_xpaths='//*[@class="name"]/a'), callback='parse_item', follow=True),)

    def parse_item(self, response):
        yield {
            'title': response.css('h1.title::text').get(),
            'author': response.css('td[id="bib-author-cell"] a::text').getall(),
            'publisher': response.css('td[id="bib-publisher-cell"]::text').get(),
            'format': response.css('span[id="editionFormatType"] span::text').get(),
            'isbn': response.css('tr[id="details-standardno"] td::text').get(),
            'oclc': response.css('tr[id="details-oclcno"] td::text').get(),
        }

process = CrawlerProcess()
process.crawl(QuotesSpider)
process.start()
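As for the repeated rows and empty values in the output: Scrapy's dupefilter drops duplicate requests by default, but it never filters duplicate items, and .get() simply returns None when a selector matches nothing. A minimal item-pipeline sketch that drops such rows (the pipeline name and the choice of oclc as the dedup key are assumptions, not part of your project):

from scrapy.exceptions import DropItem

class DedupOclcPipeline:
    # Hypothetical pipeline: keeps only the first item per oclc number.
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        oclc = item.get('oclc')
        if not oclc:
            # Selectors matched nothing on this page.
            raise DropItem('missing oclc')
        if oclc in self.seen:
            raise DropItem('duplicate oclc: %s' % oclc)
        self.seen.add(oclc)
        return item

Enable it via ITEM_PIPELINES = {'myproject.pipelines.DedupOclcPipeline': 300} in settings.py, adjusting the module path to your project.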