改进请求结构以提高速度

Improving structure of requests to boost speed

我写了一个脚本,先从网页中抓取一些列表元素,然后进入每个列表附带的链接,从详情页获取更多信息。但抓取速度相对较慢,大约每分钟 300 个条目。我猜测问题出在抓取器的结构上——它如何收集请求、跟进 URL 并抓取信息。会是这个原因吗?如何提高速度?

import scrapy
from scrapy.item import Field
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.crawler import CrawlerProcess
from price_parser import Price

def get_price(price_raw):
    """Parse a raw price string and return its numeric amount as a float.

    Returns None when no amount can be extracted (price_parser behavior).
    """
    return Price.fromstring(price_raw).amount_float

def get_currency(price_raw):
    """Parse a raw price string and return its currency symbol/code.

    Returns None when no currency can be detected (price_parser behavior).
    """
    parsed = Price.fromstring(price_raw)
    return parsed.currency


class VinylItem(scrapy.Item):
    """One Discogs marketplace listing plus its release detail fields."""

    # Fields scraped from a listing-page table row.
    title = Field(output_processor=TakeFirst())
    label = Field()
    media_condition = Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    sleeve_condition = Field(output_processor=TakeFirst())
    location = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    price = Field(
        input_processor=MapCompose(get_price),
        output_processor=TakeFirst(),
    )
    currency = Field(
        input_processor=MapCompose(get_currency),
        output_processor=TakeFirst(),
    )
    rated = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    have_vinyl = Field(output_processor=TakeFirst())
    want_vinyl = Field(output_processor=TakeFirst())

    # Fields scraped from the followed detail page.
    format = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    released = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    genre = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    style = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )



class VinylSpider(scrapy.Spider):
    """Crawl Discogs vinyl listings and follow each item's detail page.

    parse() extracts per-row fields from the listing table and schedules a
    request to each item's detail page; parse_vinyls() adds the detail
    fields to the same ItemLoader and yields the finished item.
    """

    name = 'vinyl'
    #allowed_domains = ['x']
    start_urls = ['https://www.discogs.com/sell/list?format=Vinyl']

    # NOTE: the original start_requests() only iterated start_urls with
    # callback=self.parse, which is exactly the scrapy.Spider default, so
    # the override has been removed.

    def parse(self, response):
        """Parse one listing page: load row fields, follow item links, paginate."""
        rows = response.xpath("//table[@class='table_block mpitems push_down table_responsive']//tbody//tr")
        for row in rows:
            loader = ItemLoader(VinylItem(), selector=row)
            loader.add_xpath('title', "(.//strong//a)[position() mod 2=1]//text()")
            loader.add_xpath('label', './/p[@class="hide_mobile label_and_cat"]//a//text()')
            loader.add_xpath('media_condition', '(.//p[@class="item_condition"]//span)[position() mod 3=0]//text()')
            loader.add_xpath('sleeve_condition', './/p[@class="item_condition"]//span[@class="item_sleeve_condition"]//text()')
            loader.add_xpath('location', '(.//td[@class="seller_info"]//li)[position() mod 3=0]//text()')
            # BUG FIX: the original XPaths here were absolute
            # ('(//tbody//tr//td//span[@class="price"])[...]'), so every row
            # re-scanned the ENTIRE document (quadratic work per page) and
            # TakeFirst picked the first price on the page for every item.
            # Scope the query to the current row instead.
            loader.add_xpath('price', './/td//span[@class="price"]//text()')
            loader.add_xpath('currency', './/td//span[@class="price"]//text()')
            loader.add_xpath('rated', './/td//div[@class="community_rating"]//text()')
            loader.add_xpath('have_vinyl', '(.//td//div[@class="community_result"]//span[@class="community_label"])[contains(text(),"have")]//text()')
            loader.add_xpath('want_vinyl', '(.//td//div[@class="community_result"]//span[@class="community_label"])[contains(text(),"want")]//text()')

            link = row.xpath('.//td[@class="item_description"]//strong//@href').get()
            # Guard: .get() returns None when the row has no link, and
            # response.follow raises on a None url.
            if link:
                # response.follow resolves relative urls itself, so the
                # extra response.urljoin() call was redundant.
                yield response.follow(
                    link,
                    callback=self.parse_vinyls,
                    cb_kwargs={'loader': loader},
                )

        next_page = response.xpath('(//ul[@class="pagination_page_links"]//a)[last()]//@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_vinyls(self, response, loader):
        """Add detail-page fields to the row's loader and emit the item.

        Args:
            response: the detail-page response.
            loader: the ItemLoader carried over from parse() via cb_kwargs.
        """
        loader.add_value('format', response.xpath("(.//div[@id='page_content']//div[5])[1]//text()").get())
        loader.add_value('released', response.xpath("(.//div[@id='page_content']//div[9])[1]//text()").get())
        loader.add_value('genre', response.xpath("(.//div[@id='page_content']//div[11])[1]//text()").get())
        loader.add_value('style', response.xpath("(.//div[@id='page_content']//div[13])[1]//text()").get())
        yield loader.load_item()


# Run the spider as a standalone script, writing items as JSON Lines.
# Guarded so that importing this module (e.g. by scrapy itself or tests)
# does not immediately start a crawl.
if __name__ == '__main__':
    process = CrawlerProcess(
        settings={
            # NOTE(review): FEED_URI/FEED_FORMAT are deprecated since
            # Scrapy 2.1 in favour of the FEEDS dict; kept for
            # compatibility with older Scrapy versions.
            'FEED_URI': 'vinyl.jl',
            'FEED_FORMAT': 'jsonlines',
        }
    )
    process.crawl(VinylSpider)
    process.start()

从您提供的代码片段来看,您的抓取程序结构已经相当高效:它一次 yield 许多请求,由 Scrapy 负责并发处理。

您可以调整几个设置来提高抓取速度。但是,请注意,抓取的第一条规则是你不应该损害你抓取的网站。请参阅下面您可以调整的设置示例。

  1. 增加 CONCURRENT_REQUESTS 的值(Scrapy 中默认为 16)。
  2. 增加 CONCURRENT_REQUESTS_PER_DOMAIN 的值(Scrapy 中默认为 8)。
  3. 增加 REACTOR_THREADPOOL_MAXSIZE(Twisted IO 线程池的最大大小),以便 DNS 解析更快。
  4. 降低日志级别:LOG_LEVEL = 'INFO'。
  5. 如果不需要 cookies,请禁用:COOKIES_ENABLED = False。
  6. 减少下载超时:DOWNLOAD_TIMEOUT = 15。
  7. 如果您的网速很快,并且确定目标网站响应足够快,可以降低 DOWNLOAD_DELAY 的值(不建议这样做)。

请参阅 Scrapy 官方文档(settings 章节),阅读有关这些设置的更多信息。

如果上述设置没有解决您的问题,那么您可能需要研究一下分布式抓取