python selenium 在获取 url 进行抓取时出现异常
python selenium exception while fetching a url for scraping
代码在 try 块之前运行良好,但在访问 url 时抛出异常:它只成功获取了第一个链接,然后就崩溃了。
请提供一些见解。我已经把所有链接获取到一个列表中,但是在 for 循环中依次访问这些 url 时程序崩溃了。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
# Script from the question: locate the supplier links on the listing page,
# then visit each link's href in turn.
baseUrl = 'http://www.thomassci.com/browse-by/suppliers'
driver = webdriver.Chrome()
driver.get(baseUrl)
# These WebElements are located on the listing page loaded just above.
mylinks= driver.find_elements_by_xpath("//*[starts-with(@id, 'ctl00') and contains(@id, 'ManufacturerName')]")
for links in mylinks:
    # NOTE: per the traceback, the StaleElementReferenceException is raised
    # by this line. It sits outside the try block, so it is not caught, and
    # the driver.get(datlnk) call below has already left the listing page by
    # the time the next element is read.
    print links.get_attribute('href')
    datlnk=links.get_attribute('href')
    print links.text
    try:
        pghtml=driver.get(datlnk)
        # A new wait object is created on every iteration and is not used in
        # the code shown here.
        wait = WebDriverWait(driver, 10)
    except:
        # Bare except: any failure in the try body is reported only as
        # 'problem', with no detail about what went wrong.
        print('problem')
Traceback (most recent call last):
File "D:/thomas/thomastest.py", line 17, in <module>
print links.get_attribute('href')
File "C:\Python27\lib\site-packages\selenium-2.46.0-py2.7.egg\selenium\webdriver\remote\webelement.py", line 102, in get_attribute
resp = self._execute(Command.GET_ELEMENT_ATTRIBUTE, {'name': name})
File "C:\Python27\lib\site-packages\selenium-2.46.0-py2.7.egg\selenium\webdriver\remote\webelement.py", line 404, in _execute
return self._parent.execute(command, params)
File "C:\Python27\lib\site-packages\selenium-2.46.0-py2.7.egg\selenium\webdriver\remote\webdriver.py", line 195, in execute
self.error_handler.check_response(response)
File "C:\Python27\lib\site-packages\selenium-2.46.0-py2.7.egg\selenium\webdriver\remote\errorhandler.py", line 170, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=61.0.3163.100)
(Driver info: chromedriver=2.27.440174 (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Windows NT 6.1.7600 x86_64)
Process finished with exit code 1
StaleElementReferenceException
表示您尝试使用的 WebElement 已经失效。当 DOM 发生变化(例如跳转到另一个 url)或页面被刷新时,就会出现这种情况。
您可以先把这些 url 保存到另一个列表中,然后对该列表进行迭代:
# Read every href up front, while the listing page is still loaded, so the
# loop below works with plain strings instead of live WebElements.
mylinks = driver.find_elements_by_xpath(
    "//*[starts-with(@id, 'ctl00') and contains(@id, 'ManufacturerName')]"
)
hrefs = [link.get_attribute('href') for link in mylinks]

for href in hrefs:
    print(href)
    datlnk = href
    # ... continue processing datlnk here (e.g. driver.get(datlnk)) ...
附带说明:wait = WebDriverWait(driver, 10) 只需在循环之前定义一次,
而不必在循环的每次迭代中都重新创建。
代码在 try 块之前运行良好,但在访问 url 时抛出异常:它只成功获取了第一个链接,然后就崩溃了。请提供一些见解。我已经把所有链接获取到一个列表中,但是在 for 循环中依次访问这些 url 时程序崩溃了。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
# Script from the question: locate the supplier links on the listing page,
# then visit each link's href in turn.
baseUrl = 'http://www.thomassci.com/browse-by/suppliers'
driver = webdriver.Chrome()
driver.get(baseUrl)
# These WebElements are located on the listing page loaded just above.
mylinks= driver.find_elements_by_xpath("//*[starts-with(@id, 'ctl00') and contains(@id, 'ManufacturerName')]")
for links in mylinks:
    # NOTE: per the traceback, the StaleElementReferenceException is raised
    # by this line. It sits outside the try block, so it is not caught, and
    # the driver.get(datlnk) call below has already left the listing page by
    # the time the next element is read.
    print links.get_attribute('href')
    datlnk=links.get_attribute('href')
    print links.text
    try:
        pghtml=driver.get(datlnk)
        # A new wait object is created on every iteration and is not used in
        # the code shown here.
        wait = WebDriverWait(driver, 10)
    except:
        # Bare except: any failure in the try body is reported only as
        # 'problem', with no detail about what went wrong.
        print('problem')
Traceback (most recent call last):
File "D:/thomas/thomastest.py", line 17, in <module>
print links.get_attribute('href')
File "C:\Python27\lib\site-packages\selenium-2.46.0-py2.7.egg\selenium\webdriver\remote\webelement.py", line 102, in get_attribute
resp = self._execute(Command.GET_ELEMENT_ATTRIBUTE, {'name': name})
File "C:\Python27\lib\site-packages\selenium-2.46.0-py2.7.egg\selenium\webdriver\remote\webelement.py", line 404, in _execute
return self._parent.execute(command, params)
File "C:\Python27\lib\site-packages\selenium-2.46.0-py2.7.egg\selenium\webdriver\remote\webdriver.py", line 195, in execute
self.error_handler.check_response(response)
File "C:\Python27\lib\site-packages\selenium-2.46.0-py2.7.egg\selenium\webdriver\remote\errorhandler.py", line 170, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=61.0.3163.100)
(Driver info: chromedriver=2.27.440174 (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Windows NT 6.1.7600 x86_64)
Process finished with exit code 1
StaleElementReferenceException
表示您尝试使用的 WebElement 已经失效。当 DOM 发生变化(例如跳转到另一个 url)或页面被刷新时,就会出现这种情况。
您可以先把这些 url 保存到另一个列表中,然后对该列表进行迭代:
# Read every href up front, while the listing page is still loaded, so the
# loop below works with plain strings instead of live WebElements.
mylinks = driver.find_elements_by_xpath(
    "//*[starts-with(@id, 'ctl00') and contains(@id, 'ManufacturerName')]"
)
hrefs = [link.get_attribute('href') for link in mylinks]

for href in hrefs:
    print(href)
    datlnk = href
    # ... continue processing datlnk here (e.g. driver.get(datlnk)) ...
附带说明:wait = WebDriverWait(driver, 10) 只需在循环之前定义一次,
而不必在循环的每次迭代中都重新创建。