Selenium Click() 不适用于 scrapy spider
Selenium Click() not working with scrapy spider
我正在尝试使用 scrapy 蜘蛛从列表页面抓取指向产品页面的链接。该页面显示了前 10 台机器,并且有一个 'show all machines' 的按钮可以调用一些 javascript。 javascript 相当复杂(即我不能只查看函数并查看按钮指向的 url)。我正在尝试使用 selenium webdriver 来模拟按钮上的点击,但由于某种原因它无法正常工作。当我抓取产品链接时,我只得到前 10 个,而不是完整列表。
谁能告诉我为什么它不起作用?
我要抓取的页面是 http://www.ncservice.com/en/second-hand-milling-machines
蜘蛛是
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request, FormRequest
from scrapy import log
from scrapy.exceptions import DropItem
from scrapy import signals
from mtispider.items import MachineItem
import urlparse
import time
import MySQLdb
import unicodedata
import re
from mtispider import tools
from selenium import webdriver
class MachineSpider(CrawlSpider):
    """Spider for ncservice.com second-hand milling machine listings.

    NOTE(review): this is the broken version from the question; the inline
    comments below explain why the Selenium click has no visible effect.
    """
    name = 'nc-spider'
    allowed_domains = ['ncservice.com']

    def start_requests(self):
        # Keep whatever CrawlSpider would generate, then append the listing page.
        requests = list(super(MachineSpider, self).start_requests())
        requests.append(Request('http://www.ncservice.com/en/second-hand-milling-machines', callback=self.parsencmilllist))
        return requests

    def parsencmilllist(self,response):
        # BUG: the selector is built from the original Scrapy response body,
        # so nothing Selenium does afterwards can ever be seen through `hxs`.
        hxs=HtmlXPathSelector(response)
        driver= webdriver.Firefox()
        driver.get(response.url)
        try:
            # BUG: this is C#/Java WebDriver syntax, not Python. The Python
            # binding is driver.find_element_by_id("...").click(), and `By`
            # is never imported — so this line always raises, and the bare
            # except below silently swallows the error.
            driver.FindElement(By.Id("mas-resultados-fresadoras")).Click()
        except:
            log.msg("Couldnt get all the machines", level=log.INFO)
        # Extracts links from the PRE-click HTML — hence only the first
        # 10 machines ever appear.
        ncmachs = hxs.select('//div[@id="resultados"]//a/@href').extract()
        for ncmach in ncmachs:
            yield Request(ncmach,
                meta = {'type':'Milling'},
                callback=self.parsencmachine)
        driver.quit()

    def parsencmachine(self,response):
        #scrape the machine
        # NOTE(review): `item` is never defined in this snippet — this
        # method raises NameError as written.
        return item
谢谢!
主要问题是:您需要用 webdriver 的 page_source(即 JavaScript 执行后的页面源码)来初始化 Selector,而不是用传递到回调中的 response:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy import Selector
from selenium import webdriver
class MachineSpider(CrawlSpider):
    """Spider that expands the full machine list with Selenium.

    The 'show all machines' button runs JavaScript, so the product links
    must be extracted from the browser's post-click DOM, not from the
    HTML Scrapy originally downloaded.
    """
    name = 'nc-spider'
    allowed_domains = ['ncservice.com']

    def start_requests(self):
        # Single entry point: the milling-machine listing page.
        yield Request('http://www.ncservice.com/en/second-hand-milling-machines',
                      callback=self.parsencmilllist)

    def parsencmilllist(self, response):
        """Click 'show all machines' in a real browser, then yield one
        Request per product link found in the expanded list."""
        driver = webdriver.Firefox()
        try:
            driver.get(response.url)
            driver.find_element_by_id("mas-resultados-fresadoras").click()
            # Snapshot the DOM *after* the JavaScript ran.
            page_source = driver.page_source
        finally:
            # Always release the browser, even if get()/click() raises;
            # otherwise a Firefox process is leaked on every failure.
            driver.quit()
        sel = Selector(text=page_source)
        links = sel.xpath('//div[@id="resultados"]//a/@href').extract()
        for link in links:
            yield Request(link,
                          meta={'type': 'Milling'},
                          callback=self.parsencmachine)

    def parsencmachine(self, response):
        # Placeholder — the answer only demonstrates the listing fix.
        print(response.url)
我正在尝试使用 scrapy 蜘蛛从列表页面抓取指向产品页面的链接。该页面显示了前 10 台机器,并且有一个 'show all machines' 的按钮可以调用一些 javascript。 javascript 相当复杂(即我不能只查看函数并查看按钮指向的 url)。我正在尝试使用 selenium webdriver 来模拟按钮上的点击,但由于某种原因它无法正常工作。当我抓取产品链接时,我只得到前 10 个,而不是完整列表。
谁能告诉我为什么它不起作用?
我要抓取的页面是 http://www.ncservice.com/en/second-hand-milling-machines
蜘蛛是
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request, FormRequest
from scrapy import log
from scrapy.exceptions import DropItem
from scrapy import signals
from mtispider.items import MachineItem
import urlparse
import time
import MySQLdb
import unicodedata
import re
from mtispider import tools
from selenium import webdriver
class MachineSpider(CrawlSpider):
    """Spider for ncservice.com second-hand milling machine listings.

    NOTE(review): this is the broken version from the question; the inline
    comments below explain why the Selenium click has no visible effect.
    """
    name = 'nc-spider'
    allowed_domains = ['ncservice.com']

    def start_requests(self):
        # Keep whatever CrawlSpider would generate, then append the listing page.
        requests = list(super(MachineSpider, self).start_requests())
        requests.append(Request('http://www.ncservice.com/en/second-hand-milling-machines', callback=self.parsencmilllist))
        return requests

    def parsencmilllist(self,response):
        # BUG: the selector is built from the original Scrapy response body,
        # so nothing Selenium does afterwards can ever be seen through `hxs`.
        hxs=HtmlXPathSelector(response)
        driver= webdriver.Firefox()
        driver.get(response.url)
        try:
            # BUG: this is C#/Java WebDriver syntax, not Python. The Python
            # binding is driver.find_element_by_id("...").click(), and `By`
            # is never imported — so this line always raises, and the bare
            # except below silently swallows the error.
            driver.FindElement(By.Id("mas-resultados-fresadoras")).Click()
        except:
            log.msg("Couldnt get all the machines", level=log.INFO)
        # Extracts links from the PRE-click HTML — hence only the first
        # 10 machines ever appear.
        ncmachs = hxs.select('//div[@id="resultados"]//a/@href').extract()
        for ncmach in ncmachs:
            yield Request(ncmach,
                meta = {'type':'Milling'},
                callback=self.parsencmachine)
        driver.quit()

    def parsencmachine(self,response):
        #scrape the machine
        # NOTE(review): `item` is never defined in this snippet — this
        # method raises NameError as written.
        return item
谢谢!
主要问题是:您需要用 webdriver 的 page_source(即 JavaScript 执行后的页面源码)来初始化 Selector,而不是用传递到回调中的 response:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy import Selector
from selenium import webdriver
class MachineSpider(CrawlSpider):
    """Spider that expands the full machine list with Selenium.

    The 'show all machines' button runs JavaScript, so the product links
    must be extracted from the browser's post-click DOM, not from the
    HTML Scrapy originally downloaded.
    """
    name = 'nc-spider'
    allowed_domains = ['ncservice.com']

    def start_requests(self):
        # Single entry point: the milling-machine listing page.
        yield Request('http://www.ncservice.com/en/second-hand-milling-machines',
                      callback=self.parsencmilllist)

    def parsencmilllist(self, response):
        """Click 'show all machines' in a real browser, then yield one
        Request per product link found in the expanded list."""
        driver = webdriver.Firefox()
        try:
            driver.get(response.url)
            driver.find_element_by_id("mas-resultados-fresadoras").click()
            # Snapshot the DOM *after* the JavaScript ran.
            page_source = driver.page_source
        finally:
            # Always release the browser, even if get()/click() raises;
            # otherwise a Firefox process is leaked on every failure.
            driver.quit()
        sel = Selector(text=page_source)
        links = sel.xpath('//div[@id="resultados"]//a/@href').extract()
        for link in links:
            yield Request(link,
                          meta={'type': 'Milling'},
                          callback=self.parsencmachine)

    def parsencmachine(self, response):
        # Placeholder — the answer only demonstrates the listing fix.
        print(response.url)