Improving structure of requests to boost speed
I've created a script that scrapes some elements from a webpage and then follows the link attached to each listing. It then grabs more information from that page, but it scrapes relatively slowly. I'm getting ~300 items/minute, and I'm guessing the cause is the structure of my scraper and how it collects requests, follows URLs, and scrapes the information. Could that be the case, and how can I improve the speed?
import scrapy
from scrapy.item import Field
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.crawler import CrawlerProcess
from price_parser import Price


def get_price(price_raw):
    price_object = Price.fromstring(price_raw)
    return price_object.amount_float


def get_currency(price_raw):
    price_object = Price.fromstring(price_raw)
    return price_object.currency


class VinylItem(scrapy.Item):
    title = Field(output_processor=TakeFirst())
    label = Field()
    media_condition = Field(input_processor=MapCompose(str.strip),
                            output_processor=TakeFirst())
    sleeve_condition = Field(output_processor=TakeFirst())
    location = Field(input_processor=MapCompose(str.strip),
                     output_processor=Join())
    price = Field(input_processor=MapCompose(get_price),
                  output_processor=TakeFirst())
    currency = Field(input_processor=MapCompose(get_currency),
                     output_processor=TakeFirst())
    rated = Field(input_processor=MapCompose(str.strip),
                  output_processor=Join())
    have_vinyl = Field(output_processor=TakeFirst())
    want_vinyl = Field(output_processor=TakeFirst())
    format = Field(input_processor=MapCompose(str.strip),
                   output_processor=Join())
    released = Field(input_processor=MapCompose(str.strip),
                     output_processor=Join())
    genre = Field(input_processor=MapCompose(str.strip),
                  output_processor=Join())
    style = Field(input_processor=MapCompose(str.strip),
                  output_processor=Join())


class VinylSpider(scrapy.Spider):
    name = 'vinyl'
    #allowed_domains = ['x']
    start_urls = ['https://www.discogs.com/sell/list?format=Vinyl']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # One table row per marketplace listing
        content = response.xpath("//table[@class='table_block mpitems push_down table_responsive']//tbody//tr")
        for items in content:
            loader = ItemLoader(VinylItem(), selector=items)
            loader.add_xpath('title', "(.//strong//a)[position() mod 2=1]//text()")
            loader.add_xpath('label', './/p[@class="hide_mobile label_and_cat"]//a//text()')
            loader.add_xpath("media_condition", '(.//p[@class="item_condition"]//span)[position() mod 3=0]//text()')
            loader.add_xpath("sleeve_condition", './/p[@class="item_condition"]//span[@class="item_sleeve_condition"]//text()')
            loader.add_xpath("location", '(.//td[@class="seller_info"]//li)[position() mod 3=0]//text()')
            loader.add_xpath('price', '(//tbody//tr//td//span[@class="price"])[position() mod 2=0]//text()')
            loader.add_xpath('currency', '(//tbody//tr//td//span[@class="price"])[position() mod 2=0]//text()')
            loader.add_xpath('rated', './/td//div[@class="community_rating"]//text()')
            loader.add_xpath('have_vinyl', '(.//td//div[@class="community_result"]//span[@class="community_label"])[contains(text(),"have")]//text()')
            loader.add_xpath('want_vinyl', '(.//td//div[@class="community_result"]//span[@class="community_label"])[contains(text(),"want")]//text()')
            # Follow the listing's detail page, carrying the partially filled loader along
            links = items.xpath('.//td[@class="item_description"]//strong//@href').get()
            yield response.follow(
                response.urljoin(links),
                callback=self.parse_vinyls,
                cb_kwargs={'loader': loader}
            )
        next_page = response.xpath('(//ul[@class="pagination_page_links"]//a)[last()]//@href').get()
        if next_page:
            yield response.follow(response.urljoin(next_page), callback=self.parse)

    def parse_vinyls(self, response, loader):
        #loader = ItemLoader(VinylItem(), selector = response)
        loader.add_value('format', response.xpath("(.//div[@id='page_content']//div[5])[1]//text()").get())
        loader.add_value('released', response.xpath("(.//div[@id='page_content']//div[9])[1]//text()").get())
        loader.add_value('genre', response.xpath("(.//div[@id='page_content']//div[11])[1]//text()").get())
        loader.add_value('style', response.xpath("(.//div[@id='page_content']//div[13])[1]//text()").get())
        yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'vinyl.jl',
        'FEED_FORMAT': 'jsonlines',
    }
)
process.crawl(VinylSpider)
process.start()
Judging from the code snippet you've shared, your scraper is set up quite efficiently: it yields many requests at once and lets scrapy handle the concurrency.

There are a few settings you can tweak to increase your scraping speed. Note, however, that the first rule of scraping is that you should not harm the website you are scraping. See the settings you can tweak below; a combined sketch follows the list.
- Increase the value of CONCURRENT_REQUESTS (the scrapy default is 16)
- Increase the value of CONCURRENT_REQUESTS_PER_DOMAIN (the scrapy default is 8)
- Increase REACTOR_THREADPOOL_MAXSIZE, the maximum size of the Twisted IO thread pool, so that DNS resolution is faster
- Lower the log level: LOG_LEVEL = 'INFO'
- Disable cookies if you don't need them: COOKIES_ENABLED = False
- Reduce the download timeout: DOWNLOAD_TIMEOUT = 15
- Lower the value of DOWNLOAD_DELAY if your internet connection is fast and you are sure the site you are targeting is fast enough. This is not recommended.
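For illustration, here is how those settings could be combined into the CrawlerProcess at the bottom of your script. This is a minimal sketch; the specific numbers are assumed starting points to tune, not recommendations for the site you are targeting:

process = CrawlerProcess(
    settings={
        'FEED_URI': 'vinyl.jl',
        'FEED_FORMAT': 'jsonlines',
        'CONCURRENT_REQUESTS': 32,             # raised from the default of 16
        'CONCURRENT_REQUESTS_PER_DOMAIN': 16,  # raised from the default of 8
        'REACTOR_THREADPOOL_MAXSIZE': 20,      # bigger Twisted IO thread pool -> faster DNS resolution
        'LOG_LEVEL': 'INFO',                   # less logging work than the default DEBUG
        'COOKIES_ENABLED': False,              # skip cookie handling if you don't need sessions
        'DOWNLOAD_TIMEOUT': 15,                # give up on unresponsive pages sooner
    }
)

Raise the concurrency values gradually and watch for 429 responses or bans; how far you can push them depends entirely on the target site.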
You can read more about these settings in the docs.

If the settings above do not solve your problem, then you may want to look into distributed scraping, sketched below.
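One common route there is the scrapy-redis package, which lets several workers share a single request queue and dupefilter through Redis. Below is a minimal sketch, assuming scrapy-redis is installed and a Redis server is reachable at the URL shown; the spider name and redis_key are hypothetical choices for this example:

# Sketch of a distributed setup with scrapy-redis
# (assumes `pip install scrapy-redis` and a running Redis server).
from scrapy_redis.spiders import RedisSpider

class DistributedVinylSpider(RedisSpider):
    name = 'vinyl_distributed'       # hypothetical name for this sketch
    # Workers block on this shared Redis list instead of using start_urls
    redis_key = 'vinyl:start_urls'

    custom_settings = {
        'SCHEDULER': 'scrapy_redis.scheduler.Scheduler',              # shared scheduling queue in Redis
        'DUPEFILTER_CLASS': 'scrapy_redis.dupefilter.RFPDupeFilter',  # shared seen-URL set
        'REDIS_URL': 'redis://localhost:6379',                        # assumed local Redis instance
    }

    def parse(self, response):
        ...  # same parsing logic as in VinylSpider above

Every machine runs the same spider, and you seed the shared queue once, e.g. with redis-cli lpush vinyl:start_urls 'https://www.discogs.com/sell/list?format=Vinyl'.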