使用 Selenium Python 获取动态 Table 数据
Getting Dynamic Table Data With Selenium Python
所以我正在尝试使用 selenium 从动态 table 解析此数据,它不断从第 1 页获取旧数据,我正在尝试收集第 2 页的数据,我尝试搜索对于其他答案,但没有找到任何答案,有人说我需要添加一个等待期,我做了,但是那没有用。
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Firefox()
browser.get('https://www.nyse.com/listings_directory/stock')
symbol_list=[]
table_data=browser.find_elements_by_xpath("//td");
def append_to_list(data):
for element in data:
symbol_list.append(element.text)
append_to_list(table_data)
pages=browser.find_elements_by_xpath('//a[@href="#"]')
for page in pages:
if(page.get_attribute("rel")== "next"):
if(page.text=="NEXT ›"):
page.click()
browser.implicitly_wait(100)
for elem in browser.find_elements_by_xpath("//td"): //still fetchs the data from page 1
print(elem.text)
#print(symbol_list)
我修改了你的脚本如下。
您应该在 for 循环中检索元素,否则会导致过时的元素引用异常。
并使用 WebDriverWait 在查找元素之前等待元素可见。
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from time import sleep
browser = webdriver.Chrome()
browser.get('https://www.nyse.com/listings_directory/stock')
symbol_list = []
while True:
try:
table_data = WebDriverWait(browser, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//table//td")))
for i in range(1, len(table_data)+1):
td_text = browser.find_element_by_xpath("(//table//td)["+str(i)+"]").text
print(td_text)
symbol_list.append(td_text)
next_page = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//a[@href="#" and contains(text(),"Next")]')))
next_clickable = next_page.find_element_by_xpath("..").get_attribute("class") # li
if next_clickable == 'disabled':
break
print("Go to next page ...")
next_page.click()
sleep(3)
except Exception as e:
print(e)
break
所以我正在尝试使用 selenium 从动态 table 解析此数据,它不断从第 1 页获取旧数据,我正在尝试收集第 2 页的数据,我尝试搜索对于其他答案,但没有找到任何答案,有人说我需要添加一个等待期,我做了,但是那没有用。
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Firefox()
browser.get('https://www.nyse.com/listings_directory/stock')
symbol_list=[]
table_data=browser.find_elements_by_xpath("//td");
def append_to_list(data):
for element in data:
symbol_list.append(element.text)
append_to_list(table_data)
pages=browser.find_elements_by_xpath('//a[@href="#"]')
for page in pages:
if(page.get_attribute("rel")== "next"):
if(page.text=="NEXT ›"):
page.click()
browser.implicitly_wait(100)
for elem in browser.find_elements_by_xpath("//td"): //still fetchs the data from page 1
print(elem.text)
#print(symbol_list)
我修改了你的脚本如下。
您应该在 for 循环中检索元素,否则会导致过时的元素引用异常。
并使用 WebDriverWait 在查找元素之前等待元素可见。
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from time import sleep
browser = webdriver.Chrome()
browser.get('https://www.nyse.com/listings_directory/stock')
symbol_list = []
while True:
try:
table_data = WebDriverWait(browser, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//table//td")))
for i in range(1, len(table_data)+1):
td_text = browser.find_element_by_xpath("(//table//td)["+str(i)+"]").text
print(td_text)
symbol_list.append(td_text)
next_page = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//a[@href="#" and contains(text(),"Next")]')))
next_clickable = next_page.find_element_by_xpath("..").get_attribute("class") # li
if next_clickable == 'disabled':
break
print("Go to next page ...")
next_page.click()
sleep(3)
except Exception as e:
print(e)
break