Scrapy and Selenium StaleElementReferenceException
There are several clickable elements on the page, and I am trying to scrape some of the pages behind them, but I get this error and the spider closes after the first click:
StaleElementReferenceException: Message: Element not found in the cache - perhaps the page has changed since it was looked up
For now I am just trying to get the pages to open so I can capture the new URL. Here is my code:
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.xlib.pydispatch import dispatcher
from MySpider.items import MyItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time

class MySpider(Spider):
    name = "myspider"
    allowed_domains = ["http://example.com"]
    base_url = 'http://example.com'
    start_urls = ["http://example.com/Page.aspx",]

    def __init__(self):
        self.driver = webdriver.Firefox()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        self.driver.get(response.url)
        item = MyItem()
        links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
        for button in links:
            button.click()
            time.sleep(5)
            source = self.driver.page_source
            sel = Selector(text=source)  # create a Selector object
            item['url'] = self.driver.current_url
            print '\n\nURL\n', item['url'], '\n'
            yield item
This happens because the link elements belong to the first page. Once you open a new page, those element references go stale.
You can try these two solutions:
1. Store the URLs of the link elements first, then open each link with driver.get(url).
def parse(self, response):
    self.driver.get(response.url)
    links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
    # collect the hrefs up front, before any navigation happens
    link_urls = [link.get_attribute("href") for link in links]
    for link_url in link_urls:
        self.driver.get(link_url)
        time.sleep(5)
        source = self.driver.page_source
        sel = Selector(text=source)  # create a Selector object
        item = MyItem()  # fresh item per page, so earlier yields are not overwritten
        item['url'] = self.driver.current_url
        print '\n\nURL\n', item['url'], '\n'
        yield item
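Collecting the hrefs first works because a plain URL string, unlike a WebElement reference, has no tie to the live DOM, so it cannot go stale when you navigate away from the page.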
2. After clicking a link and reading its URL, call driver.back() to return to the first page, then re-find the link elements.
def parse(self, response):
    self.driver.get(response.url)
    links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
    for i in range(len(links)):
        links[i].click()
        time.sleep(5)
        source = self.driver.page_source
        sel = Selector(text=source)  # create a Selector object
        item = MyItem()  # fresh item per page
        item['url'] = self.driver.current_url
        print '\n\nURL\n', item['url'], '\n'
        yield item
        self.driver.back()
        # re-find the links: the old references went stale when the page changed
        links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
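In both versions, the fixed time.sleep(5) is brittle: it wastes time on fast pages and can still be too short on slow ones. Here is a minimal sketch of an explicit wait using the WebDriverWait and By imports the spider already has; the id "results" is an assumed placeholder for whatever element only exists on the loaded target page.

from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the target page to render, instead of
# sleeping a fixed 5 seconds; substitute a locator that actually
# appears on your result page for the placeholder id "results"
WebDriverWait(self.driver, 10).until(
    EC.presence_of_element_located((By.ID, "results"))
)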