我的 Scrapy 脚本非常慢,在 3 分钟内提取了 100 个项目
My Scrapy script is very slow — it takes 3 minutes to extract just 100 items
我正在学习 scrapy,因为我了解到它是异步工作的,因此比 Selenium 更快。但实际上抓取 100 个项目却需要 3 分钟。我不知道为什么。我需要帮助。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from batt_data.items import BattDataItem
import urllib.parse
from selenium import webdriver
class BatterySpider(CrawlSpider):
    """Follow the pagination links of a made-in-china.com battery search
    and yield one record per product found on each results page.

    Yields:
        dict: keys 'description', 'price', 'chemistry', 'applications',
        'discharge_rate', 'shape' — one dict per product listing.
    """

    name = 'battery'
    # allowed_domains = ['web']
    start_urls = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']
    base_url = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']

    # Follow every "next page" link and parse each results page with parse_item.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class, "nextpage")]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the six product fields from one search-results page.

        NOTE(review): zipping six independently extracted lists silently
        drops or misaligns records whenever any product is missing one of
        the fields (zip stops at the shortest list and shifts everything
        after a gap). Iterating per-product container nodes with selectors
        relative to each node would be robust — confirm the container
        selector against the live page before changing this.
        """
        price = response.css('.price::text').extract()
        description = response.xpath('//img[@class="J-firstLazyload"]/@alt').extract()
        chemistry = response.xpath('//li[@class="J-faketitle ellipsis"][1]/span/text()').extract()
        applications = response.xpath('//li[@class="J-faketitle ellipsis"][2]/span/text()').extract()
        discharge_rate = response.xpath('//li[@class="J-faketitle ellipsis"][4]/span/text()').extract()
        shape = response.xpath('//li[@class="J-faketitle ellipsis"][5]/span/text()').extract()

        # Pair the parallel field lists positionally; one dict per product.
        for desc, prc, chem, apps, rate, shp in zip(
                description, price, chemistry, applications, discharge_rate, shape):
            yield {
                'description': desc,
                'price': prc,
                'chemistry': chem,
                'applications': apps,
                'discharge_rate': rate,
                'shape': shp,
            }
实际上我发送了太多请求。我通过遍历一个包含我需要的所有项目的容器来处理它。更新后的蜘蛛在不到 1 分钟的时间内完成了工作
我正在学习 scrapy,因为我了解到它是异步工作的,因此比 Selenium 更快。但实际上抓取 100 个项目却需要 3 分钟。我不知道为什么。我需要帮助。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from batt_data.items import BattDataItem
import urllib.parse
from selenium import webdriver
class BatterySpider(CrawlSpider):
    """Crawl made-in-china.com 24V-battery search results page by page,
    yielding one dict of product fields per listing on every page."""

    name = 'battery'
    # allowed_domains = ['web']
    start_urls = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']
    base_url = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']

    # Keep following "next page" links, parsing each page as we go.
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths='//*[contains(@class, "nextpage")]'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a six-field dict for each product on a results page."""
        # Field name -> list of values extracted for that field, in page order.
        columns = {
            'description': response.xpath('//img[@class="J-firstLazyload"]/@alt').extract(),
            'price': response.css('.price::text').extract(),
            'chemistry': response.xpath('//li[@class="J-faketitle ellipsis"][1]/span/text()').extract(),
            'applications': response.xpath('//li[@class="J-faketitle ellipsis"][2]/span/text()').extract(),
            'discharge_rate': response.xpath('//li[@class="J-faketitle ellipsis"][4]/span/text()').extract(),
            'shape': response.xpath('//li[@class="J-faketitle ellipsis"][5]/span/text()').extract(),
        }
        # Walk the parallel lists in lockstep (stops at the shortest one).
        for row in zip(*columns.values()):
            yield dict(zip(columns.keys(), row))
实际上我发送了太多请求。我通过遍历一个包含我需要的所有项目的容器来处理它。更新后的蜘蛛在不到 1 分钟的时间内完成了工作