Python Selenium 中的 IndexError 和 UnboundLocalError
Python IndexError and UnboundLocalError in Selenium
如果这是一个基本问题,我深表歉意,因为我对 python 还是比较陌生。我正在尝试使用 Selenium 开发一个网络抓取脚本,并且已经获得了几乎所有必要的功能(从一个页面导航到另一个页面,在一个页面中定位并打开所有 url)。
但是,由于我要抓取的网站的性质,一些元素在某些页面上被省略,而在其他页面上存在。在元素被省略的情况下,终端会返回一个 IndexError,我目前通过捕获该异常来绕过它。
然而,当我尝试打印抓取的数据时,出现以下错误:
UnboundLocalError: local variable 'manufacturer' referenced before assignment
我知道这可能是因为我对 IndexError 做了异常处理,然后又在随后的 print 语句中引用了未被赋值的变量。
我是否可以通过规避这两个问题来抓取我正在寻找的信息,如果是这样,我该如何将抓取的数据导出到 csv 文件中?
我的函数代码如下:
def scrape():
    """Print six text fields looked up on the current page, then call browser.back().

    This is the question's original attempt, kept as posted: it fails with
    UnboundLocalError whenever one of the lookups finds no element.
    Relies on `browser` and `By` being defined outside this function.
    """
    browser.implicitly_wait(7)
    try:
        # Each lookup indexes [0] into the list of matches; an empty list
        # raises IndexError and aborts the rest of this try body.
        collection = browser.find_elements(By.XPATH,'//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
        description = browser.find_elements(By.XPATH,'//*[(@id = "child-1")]//p')[0].text
        dimension = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p')[0].text
        finish = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p')[0].text
        country = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p')[0].text
        manufacturer = browser.find_elements(By.XPATH,'//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p')[0].text
    except IndexError:
        # Swallowing the error leaves the failed name, and every name after
        # it in the try body, unassigned.
        pass
    # Raises UnboundLocalError if any lookup above failed, because the
    # corresponding local names were never bound.
    print(collection, description, dimension, finish, country, manufacturer)
    browser.back()
非常感谢!
由于您的变量是在 try 块内创建的,如果其中一个查找失败,则该变量及其下方的所有变量都不会被创建;当您之后尝试引用它们时,解释器不知道这些变量应该是什么。如果您执行以下操作,打印语句将起作用:
def scrape():
    """Print six text fields looked up on the current page, then call browser.back().

    Every field is pre-assigned an empty string, so the final print never
    hits an unbound name. All lookups still share one try block, so the
    first missing element leaves it and every later field at "".
    Relies on `browser` and `By` being defined outside this function.
    """
    browser.implicitly_wait(7)
    # Defaults guarantee each name is bound before the print below.
    collection = ""
    description = ""
    dimension = ""
    finish = ""
    country = ""
    manufacturer = ""
    try:
        # An empty match list makes [0] raise IndexError, which skips all
        # remaining assignments in this block.
        collection = browser.find_elements(By.XPATH,'//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
        description = browser.find_elements(By.XPATH,'//*[(@id = "child-1")]//p')[0].text
        dimension = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p')[0].text
        finish = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p')[0].text
        country = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p')[0].text
        manufacturer = browser.find_elements(By.XPATH,'//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p')[0].text
    except IndexError:
        # Missing element: keep the defaults for it and for the fields after it.
        pass
    print(collection, description, dimension, finish, country, manufacturer)
    browser.back()
所以现在你必须处理失败变量下面的变量也没有被赋值的问题;我推荐使用字典:
def scrape():
    """Collect six text fields from the current page into a dict and print it.

    Each field is looked up in its own try block, so a missing element only
    leaves that one key at its default "" and does not affect the others.
    After printing, browser.back() is called.
    Relies on `browser` and `By` being defined outside this function.
    """
    browser.implicitly_wait(7)
    # Defaults: every key exists even when its element is absent on the page.
    page_elements = dict()
    page_elements['collection'] = ""
    page_elements['description'] = ""
    page_elements['dimension'] = ""
    page_elements['finish'] = ""
    page_elements['country'] = ""
    page_elements['manufacturer'] = ""
    # Store each result in the dict (not in a throwaway local), otherwise
    # the printed dict would only ever contain the empty defaults.
    try:
        page_elements['collection'] = browser.find_elements(By.XPATH, '//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
    except IndexError:
        pass  # element absent on this page; keep ""
    try:
        page_elements['description'] = browser.find_elements(By.XPATH, '//*[(@id = "child-1")]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['dimension'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['finish'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['country'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['manufacturer'] = browser.find_elements(By.XPATH, '//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p')[0].text
    except IndexError:
        pass
    print(page_elements)
    browser.back()
同样,为每个语句使用 try 块有点麻烦,所以如果您想将路径保存在不同的字典中,您可以这样做:
def scrape():
    """Collect six text fields from the current page into a dict and print it.

    The XPath for each field lives in `element_paths`; one loop performs
    every lookup, and a missing element leaves that key at its default "".
    After printing, browser.back() is called.
    Relies on `browser` and `By` being defined outside this function.
    """
    # Fixed spelling: the method is implicitly_wait, not implicityl_wait.
    browser.implicitly_wait(7)
    # Defaults: every key exists even when its element is absent on the page.
    page_elements = dict()
    page_elements['collection'] = ""
    page_elements['description'] = ""
    page_elements['dimension'] = ""
    page_elements['finish'] = ""
    page_elements['country'] = ""
    page_elements['manufacturer'] = ""
    # XPath locator for each field, keyed identically to page_elements.
    element_paths = dict()
    element_paths['collection'] = '//*[@id="page-content-wrapper"]/div/ul/li[5]/a'
    element_paths['description'] = '//*[(@id = "child-1")]//p'
    element_paths['dimension'] = '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p'
    element_paths['finish'] = '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p'
    element_paths['country'] = '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p'
    element_paths['manufacturer'] = '//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p'
    for element, path in element_paths.items():
        try:
            # [0] raises IndexError when no element matched this XPath.
            page_elements[element] = browser.find_elements(By.XPATH, path)[0].text
        except IndexError:
            pass  # element absent on this page; keep ""
    print(page_elements)
    browser.back()
如果这是一个基本问题,我深表歉意,因为我对 python 还是比较陌生。我正在尝试使用 Selenium 开发一个网络抓取脚本,并且已经获得了几乎所有必要的功能(从一个页面导航到另一个页面,在一个页面中定位并打开所有 url)。
但是,由于我要抓取的网站的性质,一些元素在某些页面上被省略,而在其他页面上存在。在元素被省略的情况下,终端会返回一个 IndexError,我目前通过捕获该异常来绕过它。
然而,当我尝试打印抓取的数据时,出现以下错误:
UnboundLocalError: local variable 'manufacturer' referenced before assignment
我知道这可能是因为我对 IndexError 做了异常处理,然后又在随后的 print 语句中引用了未被赋值的变量。
我是否可以通过规避这两个问题来抓取我正在寻找的信息,如果是这样,我该如何将抓取的数据导出到 csv 文件中?
我的函数代码如下:
def scrape():
    """Print six text fields looked up on the current page, then call browser.back().

    This is the question's original attempt, kept as posted: it fails with
    UnboundLocalError whenever one of the lookups finds no element.
    Relies on `browser` and `By` being defined outside this function.
    """
    browser.implicitly_wait(7)
    try:
        # Each lookup indexes [0] into the list of matches; an empty list
        # raises IndexError and aborts the rest of this try body.
        collection = browser.find_elements(By.XPATH,'//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
        description = browser.find_elements(By.XPATH,'//*[(@id = "child-1")]//p')[0].text
        dimension = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p')[0].text
        finish = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p')[0].text
        country = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p')[0].text
        manufacturer = browser.find_elements(By.XPATH,'//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p')[0].text
    except IndexError:
        # Swallowing the error leaves the failed name, and every name after
        # it in the try body, unassigned.
        pass
    # Raises UnboundLocalError if any lookup above failed, because the
    # corresponding local names were never bound.
    print(collection, description, dimension, finish, country, manufacturer)
    browser.back()
非常感谢!
由于您的变量是在 try 块内创建的,如果其中一个查找失败,则该变量及其下方的所有变量都不会被创建;当您之后尝试引用它们时,解释器不知道这些变量应该是什么。如果您执行以下操作,打印语句将起作用:
def scrape():
    """Print six text fields looked up on the current page, then call browser.back().

    Every field is pre-assigned an empty string, so the final print never
    hits an unbound name. All lookups still share one try block, so the
    first missing element leaves it and every later field at "".
    Relies on `browser` and `By` being defined outside this function.
    """
    browser.implicitly_wait(7)
    # Defaults guarantee each name is bound before the print below.
    collection = ""
    description = ""
    dimension = ""
    finish = ""
    country = ""
    manufacturer = ""
    try:
        # An empty match list makes [0] raise IndexError, which skips all
        # remaining assignments in this block.
        collection = browser.find_elements(By.XPATH,'//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
        description = browser.find_elements(By.XPATH,'//*[(@id = "child-1")]//p')[0].text
        dimension = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p')[0].text
        finish = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p')[0].text
        country = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p')[0].text
        manufacturer = browser.find_elements(By.XPATH,'//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p')[0].text
    except IndexError:
        # Missing element: keep the defaults for it and for the fields after it.
        pass
    print(collection, description, dimension, finish, country, manufacturer)
    browser.back()
所以现在你必须处理失败变量下面的变量也没有被赋值的问题;我推荐使用字典:
def scrape():
    """Collect six text fields from the current page into a dict and print it.

    Each field is looked up in its own try block, so a missing element only
    leaves that one key at its default "" and does not affect the others.
    After printing, browser.back() is called.
    Relies on `browser` and `By` being defined outside this function.
    """
    browser.implicitly_wait(7)
    # Defaults: every key exists even when its element is absent on the page.
    page_elements = dict()
    page_elements['collection'] = ""
    page_elements['description'] = ""
    page_elements['dimension'] = ""
    page_elements['finish'] = ""
    page_elements['country'] = ""
    page_elements['manufacturer'] = ""
    # Store each result in the dict (not in a throwaway local), otherwise
    # the printed dict would only ever contain the empty defaults.
    try:
        page_elements['collection'] = browser.find_elements(By.XPATH, '//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
    except IndexError:
        pass  # element absent on this page; keep ""
    try:
        page_elements['description'] = browser.find_elements(By.XPATH, '//*[(@id = "child-1")]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['dimension'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['finish'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['country'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['manufacturer'] = browser.find_elements(By.XPATH, '//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p')[0].text
    except IndexError:
        pass
    print(page_elements)
    browser.back()
同样,为每个语句使用 try 块有点麻烦,所以如果您想将路径保存在不同的字典中,您可以这样做:
def scrape():
    """Collect six text fields from the current page into a dict and print it.

    The XPath for each field lives in `element_paths`; one loop performs
    every lookup, and a missing element leaves that key at its default "".
    After printing, browser.back() is called.
    Relies on `browser` and `By` being defined outside this function.
    """
    # Fixed spelling: the method is implicitly_wait, not implicityl_wait.
    browser.implicitly_wait(7)
    # Defaults: every key exists even when its element is absent on the page.
    page_elements = dict()
    page_elements['collection'] = ""
    page_elements['description'] = ""
    page_elements['dimension'] = ""
    page_elements['finish'] = ""
    page_elements['country'] = ""
    page_elements['manufacturer'] = ""
    # XPath locator for each field, keyed identically to page_elements.
    element_paths = dict()
    element_paths['collection'] = '//*[@id="page-content-wrapper"]/div/ul/li[5]/a'
    element_paths['description'] = '//*[(@id = "child-1")]//p'
    element_paths['dimension'] = '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p'
    element_paths['finish'] = '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p'
    element_paths['country'] = '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p'
    element_paths['manufacturer'] = '//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p'
    for element, path in element_paths.items():
        try:
            # [0] raises IndexError when no element matched this XPath.
            page_elements[element] = browser.find_elements(By.XPATH, path)[0].text
        except IndexError:
            pass  # element absent on this page; keep ""
    print(page_elements)
    browser.back()