使用 Selenium 写入文件时出现“列表索引超出范围”(list index out of range)错误
List index out of range when writing to a document with selenium
我正在尝试把 https://www.whatuni.com/university-course-reviews/?pageno=14 上的大学名称、院系名称和评分写入文件。一切都很顺利,直到遇到一条没有院系名称的帖子(post),这时程序报出如下错误:
file.write(user_name[k].text + ";" + uni_names[k].text + ";" + department[k].text + ";" + date_posted[k].text +
IndexError: list index out of range
这是我使用的代码。我认为当院系不存在时,需要以某种方式写入 null 或空格。我尝试过 if not ... else 的写法,但没有起作用。如能提供任何帮助,我将不胜感激。谢谢!
# Walk up to 20 result pages; for each page collect page-wide element lists and
# append one ';'-separated row per review to uni_scraping.csv.
# NOTE(review): indentation was lost when this snippet was pasted, so the exact
# nesting of the statements below cannot be confirmed from the text alone.
for i in range(20):
try:
driver.refresh()
# Every find_elements_* call returns a list for the whole page; the write loop
# further down indexes all of these lists with the same k.
uni_names = driver.find_elements_by_xpath('//div[@class="rlst_wrap"]/h2/a')
# Presumably a review without a department has no matching <h3>/<a>, which makes
# this list shorter than uni_names and would explain the reported IndexError.
department_names = driver.find_elements_by_xpath('//div[@class="rlst_wrap"]/h3/a')
user_name = driver.find_elements_by_xpath('//div[@class="rev_name"]')
date_posted = driver.find_elements_by_xpath('//div[@class="rev_dte"]')
uni_rev = driver.find_elements_by_xpath('(//div[@class="reviw_rating"]/div[@class="rate_new"]/p)')
uni_rating = driver.find_elements_by_xpath('(//div[@class="reviw_rating"]/div[@class="rate_new"]/span[starts-with(@class,"ml5")])')
job_prospects = driver.find_elements_by_xpath('//span[text()="Job Prospects"]/following-sibling::span')
course_and_lecturers = driver.find_elements_by_xpath('//span[text()="Course and Lecturers"]/following-sibling::span')
# The fallback is the 4-character string "None", not a list of elements:
# lecturers[k] would then be a single character, which has no get_attribute().
if not course_and_lecturers:
lecturers= "None"
else:
lecturers = course_and_lecturers
# NOTE(review): in XPath 1.0 the bare operand "Uni Facilities" is a non-empty
# string and converts to true, so this predicate holds for every span; the
# intent was probably text()="Facilities" or text()="Uni Facilities".
uni_facilities = driver.find_elements_by_xpath('//span[text()= "Facilities" or "Uni Facilities"]/following-sibling::span')
# Same string fallback as for lecturers above.
if not uni_facilities:
facilities = "None"
else:
facilities = uni_facilities
student_support = driver.find_elements_by_xpath('//span[text()="Student Support"]/following-sibling::span')
# Same string fallback as for lecturers above.
if not student_support:
support = "None"
else:
support = student_support
# Mode 'a' appends, so rows from every page (and every run) accumulate in the file.
with open('uni_scraping.csv', 'a') as file:
for k in range(len(uni_names)):
# This test is true only when the whole list is empty; it does not detect a
# list that is merely shorter than uni_names.
if not department_names:
department = "None"
else:
department = department_names
# department[k] raises IndexError once k reaches len(department); the other
# lists are indexed the same way and rely on having one entry per review.
file.write(user_name[k].text + ";" + uni_names[k].text + ";" + department[k].text + ";" + date_posted[k].text +
";" + uni_rating[k].get_attribute("class") + ";" + job_prospects[k].get_attribute("class") +
";" + lecturers[k].get_attribute("class") + ";" + facilities[k].get_attribute("class") +
";" + support[k].get_attribute("class") + ";" + uni_rev[k].text + "\n")
# Presumably the element with class 'mr0' is the "next page" control — TODO confirm.
next_page = driver.find_element_by_class_name('mr0')
next_page.click()
# The with-statement above already closes the file when its block exits.
file.close()
except exceptions.StaleElementReferenceException as e:
# Prints the literal letter 'e', not the caught exception object.
print('e')
pass
driver.close()
您尝试的 if not department_names 思路是对的,但它只在列表为空时才起作用。而在您的情况下,问题在于列表太短。
由于有些大学没有院系,department_names 会比 uni_names 更短。
因此,在循环 for k in range(len(uni_names)): 中,department[k].text 并不总是对应同一索引处那所大学的院系,而且到某个时刻 k 的值会超出院系列表的长度。这就是 department[k] 会引发错误的原因。
我不确定解决这个问题最高效的方法是什么,但我认为您可以先获取包含每所大学完整信息的更大元素(例如整个 rlst_wrap),然后在该元素内部查找这所大学的各项详细信息(例如使用正则表达式)。这样就能知道哪一条记录没有院系,从而避免这个问题。
谢谢 Vimizen 的回答。我按照你的建议做了,问题解决了。我写的代码大致如下。
# Scrape one review-listing page: for every review row collect the university
# name, the department (when present) and the reviewer name, then print the
# collected records as a list of dicts.
driver = webdriver.Chrome()
posts = []
try:
    driver.get("https://www.whatuni.com/university-course-reviews/?pageno=14")
    driver.refresh()
    # Look fields up inside each review row instead of page-wide, so a row
    # without a department cannot shift the fields of the rows that follow it.
    for post_element in driver.find_elements_by_xpath('//div[@class="rlst_row"]'):
        uni_name = post_element.find_element_by_tag_name('h2')
        try:
            department = post_element.find_element_by_tag_name('h3').text
        except NoSuchElementException:
            # This row has no <h3>; record the placeholder for a missing department.
            department = "aaaaaaaa"
        user_name = post_element.find_element_by_class_name('rev_name')
        posts.append({
            "uni_name": uni_name.text,
            "department": department,
            "user_name": user_name.text,
        })
    print(posts)
finally:
    # Release the browser window even when one of the lookups above raises.
    driver.close()
最佳
我正在尝试把 https://www.whatuni.com/university-course-reviews/?pageno=14 上的大学名称、院系名称和评分写入文件。一切都很顺利,直到遇到一条没有院系名称的帖子(post),这时程序报出如下错误:
file.write(user_name[k].text + ";" + uni_names[k].text + ";" + department[k].text + ";" + date_posted[k].text +
IndexError: list index out of range
这是我使用的代码。我认为当院系不存在时,需要以某种方式写入 null 或空格。我尝试过 if not ... else 的写法,但没有起作用。如能提供任何帮助,我将不胜感激。谢谢!
# Walk up to 20 result pages; for each page collect page-wide element lists and
# append one ';'-separated row per review to uni_scraping.csv.
# NOTE(review): indentation was lost when this snippet was pasted, so the exact
# nesting of the statements below cannot be confirmed from the text alone.
for i in range(20):
try:
driver.refresh()
# Every find_elements_* call returns a list for the whole page; the write loop
# further down indexes all of these lists with the same k.
uni_names = driver.find_elements_by_xpath('//div[@class="rlst_wrap"]/h2/a')
# Presumably a review without a department has no matching <h3>/<a>, which makes
# this list shorter than uni_names and would explain the reported IndexError.
department_names = driver.find_elements_by_xpath('//div[@class="rlst_wrap"]/h3/a')
user_name = driver.find_elements_by_xpath('//div[@class="rev_name"]')
date_posted = driver.find_elements_by_xpath('//div[@class="rev_dte"]')
uni_rev = driver.find_elements_by_xpath('(//div[@class="reviw_rating"]/div[@class="rate_new"]/p)')
uni_rating = driver.find_elements_by_xpath('(//div[@class="reviw_rating"]/div[@class="rate_new"]/span[starts-with(@class,"ml5")])')
job_prospects = driver.find_elements_by_xpath('//span[text()="Job Prospects"]/following-sibling::span')
course_and_lecturers = driver.find_elements_by_xpath('//span[text()="Course and Lecturers"]/following-sibling::span')
# The fallback is the 4-character string "None", not a list of elements:
# lecturers[k] would then be a single character, which has no get_attribute().
if not course_and_lecturers:
lecturers= "None"
else:
lecturers = course_and_lecturers
# NOTE(review): in XPath 1.0 the bare operand "Uni Facilities" is a non-empty
# string and converts to true, so this predicate holds for every span; the
# intent was probably text()="Facilities" or text()="Uni Facilities".
uni_facilities = driver.find_elements_by_xpath('//span[text()= "Facilities" or "Uni Facilities"]/following-sibling::span')
# Same string fallback as for lecturers above.
if not uni_facilities:
facilities = "None"
else:
facilities = uni_facilities
student_support = driver.find_elements_by_xpath('//span[text()="Student Support"]/following-sibling::span')
# Same string fallback as for lecturers above.
if not student_support:
support = "None"
else:
support = student_support
# Mode 'a' appends, so rows from every page (and every run) accumulate in the file.
with open('uni_scraping.csv', 'a') as file:
for k in range(len(uni_names)):
# This test is true only when the whole list is empty; it does not detect a
# list that is merely shorter than uni_names.
if not department_names:
department = "None"
else:
department = department_names
# department[k] raises IndexError once k reaches len(department); the other
# lists are indexed the same way and rely on having one entry per review.
file.write(user_name[k].text + ";" + uni_names[k].text + ";" + department[k].text + ";" + date_posted[k].text +
";" + uni_rating[k].get_attribute("class") + ";" + job_prospects[k].get_attribute("class") +
";" + lecturers[k].get_attribute("class") + ";" + facilities[k].get_attribute("class") +
";" + support[k].get_attribute("class") + ";" + uni_rev[k].text + "\n")
# Presumably the element with class 'mr0' is the "next page" control — TODO confirm.
next_page = driver.find_element_by_class_name('mr0')
next_page.click()
# The with-statement above already closes the file when its block exits.
file.close()
except exceptions.StaleElementReferenceException as e:
# Prints the literal letter 'e', not the caught exception object.
print('e')
pass
driver.close()
您尝试的 if not department_names 思路是对的,但它只在列表为空时才起作用。而在您的情况下,问题在于列表太短。
由于有些大学没有院系,department_names 会比 uni_names 更短。
因此,在循环 for k in range(len(uni_names)): 中,department[k].text 并不总是对应同一索引处那所大学的院系,而且到某个时刻 k 的值会超出院系列表的长度。这就是 department[k] 会引发错误的原因。
我不确定解决这个问题最高效的方法是什么,但我认为您可以先获取包含每所大学完整信息的更大元素(例如整个 rlst_wrap),然后在该元素内部查找这所大学的各项详细信息(例如使用正则表达式)。这样就能知道哪一条记录没有院系,从而避免这个问题。
谢谢 Vimizen 的回答。我按照你的建议做了,问题解决了。我写的代码大致如下。
# Scrape one review-listing page: for every review row collect the university
# name, the department (when present) and the reviewer name, then print the
# collected records as a list of dicts.
driver = webdriver.Chrome()
posts = []
try:
    driver.get("https://www.whatuni.com/university-course-reviews/?pageno=14")
    driver.refresh()
    # Look fields up inside each review row instead of page-wide, so a row
    # without a department cannot shift the fields of the rows that follow it.
    for post_element in driver.find_elements_by_xpath('//div[@class="rlst_row"]'):
        uni_name = post_element.find_element_by_tag_name('h2')
        try:
            department = post_element.find_element_by_tag_name('h3').text
        except NoSuchElementException:
            # This row has no <h3>; record the placeholder for a missing department.
            department = "aaaaaaaa"
        user_name = post_element.find_element_by_class_name('rev_name')
        posts.append({
            "uni_name": uni_name.text,
            "department": department,
            "user_name": user_name.text,
        })
    print(posts)
finally:
    # Release the browser window even when one of the lookups above raises.
    driver.close()
最佳