在 python 中使用 selenium 抓取动态 Table
Dynamic Table Scraping with selenium in python
我正在尝试访问此站点的数据:http://surge.srcc.lsu.edu/s1.html。
到目前为止,我的代码在两个下拉菜单中循环,但是 table 是动态命名的,我无法从中获取数据。我试图通过 "output_data_table" 上方的 class 访问数据,但遇到了问题。
# importing libraries
from selenium import webdriver
import time
from selenium.webdriver.support.ui import Select
import lxml.html
# Seconds to wait after picking a year so the page's JavaScript can build the
# result table before the HTML is read; reading it immediately returns the
# source without the generated table.
RENDER_WAIT_SECONDS = 10

# Number of <td> text values a row must have for cells_to_record to index it.
MIN_CELLS_PER_ROW = 9

# Browser session shared by the helpers below; created in main().
driver = None


def frame_switch(css_selector):
    """Switch the shared driver into the frame matched by *css_selector*."""
    # Imported locally so the script's import header does not have to change.
    from selenium.webdriver.common.by import By

    driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, css_selector))


def cells_to_record(cells):
    """Map one table row's cell texts to the labelled record that is printed.

    Args:
        cells: Sequence of the row's ``<td>`` text values.

    Returns:
        A dict keyed '0th'..'7th', or ``None`` when the row has fewer than
        ``MIN_CELLS_PER_ROW`` values (for example a row without ``<td>``
        cells), so such rows are skipped instead of raising ``IndexError``.
    """
    if len(cells) < MIN_CELLS_PER_ROW:
        return None
    # NOTE(review): index 4 is skipped, exactly as in the original mapping --
    # confirm that column is meant to be left out.
    return {
        '0th': cells[0],
        '1st': cells[1],
        '2nd': cells[2],
        '3rd': cells[3],
        '4th': cells[5],
        '5th': cells[6],
        '6th': cells[7],
        '7th': cells[8],
    }


def main():
    """Walk every storm/year option pair and print each result-table row."""
    global driver
    from selenium.webdriver.common.by import By

    driver = webdriver.Firefox()
    try:
        driver.get("http://surge.srcc.lsu.edu/s1.html")
        frame_switch("iframe")

        name_select = Select(
            driver.find_element(By.XPATH, '//select[@id="storm_name"]'))
        # Option 0 is skipped in both loops, as in the original
        # (presumably a placeholder entry -- TODO confirm).
        for i in range(1, len(name_select.options)):
            print("starting loop on option storm " + name_select.options[i].text)
            name_select.select_by_index(i)
            time.sleep(3)

            # Looked up again for every storm, after the pause above.
            year_select = Select(
                driver.find_element(By.XPATH, '//select[@id="year"]'))
            for j in range(1, len(year_select.options)):
                print("starting loop on option year " + year_select.options[j].text)
                year_select.select_by_index(j)

                # Give the page time to generate the table before parsing.
                time.sleep(RENDER_WAIT_SECONDS)
                root = lxml.html.fromstring(driver.page_source)

                # NOTE(review): the table is reported to be dynamically named,
                # so this fixed id may need to be replaced.
                for row in root.xpath('.//table[@id="output_data_table"]//tr'):
                    record = cells_to_record(row.xpath('.//td/text()'))
                    if record is not None:
                        print(record)
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()


if __name__ == "__main__":
    main()
看来您必须先等待一段时间,然后才能调用 "root = lxml.html.fromstring(driver.page_source)"。
如果不等待,您拿到的 html 源代码里还没有由 javascript 生成的 table。请在这条语句之前加上 "time.sleep(10)"。
这样似乎就能取到 table 了。下面我用 BeautifulSoup 写了一个简单的例子。
from selenium import webdriver
import time, re
from selenium.webdriver.support.ui import Select
import lxml.html
from bs4 import BeautifulSoup
# Seconds to wait after picking a year so the page's JavaScript can build the
# result table before the HTML is read.
RENDER_WAIT_SECONDS = 10

# Position, among the <tbody> elements that carry a class attribute, of the
# one that is printed (the original script always took the second match).
DATA_TBODY_INDEX = 1

# Browser session shared by the helpers below; created in main().
driver = None


def frame_switch(css_selector):
    """Switch the shared driver into the frame matched by *css_selector*."""
    # Imported locally so the script's import header does not have to change.
    from selenium.webdriver.common.by import By

    driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, css_selector))


def select_data_body(bodies, index=DATA_TBODY_INDEX):
    """Return ``bodies[index]``, or ``None`` when there are too few matches.

    Args:
        bodies: Sequence of matched ``<tbody>`` elements.
        index: Zero-based position of the wanted element.

    Returns:
        The element at *index*, or ``None`` if *index* is out of range, so
        a page that has not produced the table yet is skipped rather than
        raising ``IndexError``.
    """
    if 0 <= index < len(bodies):
        return bodies[index]
    return None


def main():
    """Walk every storm/year option pair and print the selected table body."""
    global driver
    from selenium.webdriver.common.by import By

    # Matches any class value, i.e. every <tbody> that has a class attribute.
    any_class = re.compile(".*")

    driver = webdriver.Firefox()
    try:
        driver.get("http://surge.srcc.lsu.edu/s1.html")
        frame_switch("iframe")

        name_select = Select(
            driver.find_element(By.XPATH, '//select[@id="storm_name"]'))
        # Option 0 is skipped in both loops, as in the original
        # (presumably a placeholder entry -- TODO confirm).
        for i in range(1, len(name_select.options)):
            print("starting loop on option storm " + name_select.options[i].text)
            name_select.select_by_index(i)
            time.sleep(3)

            # Looked up again for every storm, after the pause above.
            year_select = Select(
                driver.find_element(By.XPATH, '//select[@id="year"]'))
            for j in range(1, len(year_select.options)):
                print("starting loop on option year " + year_select.options[j].text)
                year_select.select_by_index(j)

                # Give the page time to generate the table before parsing.
                time.sleep(RENDER_WAIT_SECONDS)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Search once and reuse the result for both printouts.
                bodies = soup.find_all("tbody", {"class": any_class})
                table_body = select_data_body(bodies)
                if table_body is None:
                    continue

                # print() calls (not Python 2 print statements) so the
                # script also runs on Python 3.
                print(table_body.prettify())
                # Print the text of each table row.
                for table_row in table_body.find_all("tr"):
                    print(table_row.getText())
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()


if __name__ == "__main__":
    main()
我正在尝试访问此站点的数据:http://surge.srcc.lsu.edu/s1.html。 到目前为止,我的代码在两个下拉菜单中循环,但是 table 是动态命名的,我无法从中获取数据。我试图通过 "output_data_table" 上方的 class 访问数据,但遇到了问题。
# importing libraries
from selenium import webdriver
import time
from selenium.webdriver.support.ui import Select
import lxml.html
# Seconds to wait after picking a year so the page's JavaScript can build the
# result table before the HTML is read; reading it immediately returns the
# source without the generated table.
RENDER_WAIT_SECONDS = 10

# Number of <td> text values a row must have for cells_to_record to index it.
MIN_CELLS_PER_ROW = 9

# Browser session shared by the helpers below; created in main().
driver = None


def frame_switch(css_selector):
    """Switch the shared driver into the frame matched by *css_selector*."""
    # Imported locally so the script's import header does not have to change.
    from selenium.webdriver.common.by import By

    driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, css_selector))


def cells_to_record(cells):
    """Map one table row's cell texts to the labelled record that is printed.

    Args:
        cells: Sequence of the row's ``<td>`` text values.

    Returns:
        A dict keyed '0th'..'7th', or ``None`` when the row has fewer than
        ``MIN_CELLS_PER_ROW`` values (for example a row without ``<td>``
        cells), so such rows are skipped instead of raising ``IndexError``.
    """
    if len(cells) < MIN_CELLS_PER_ROW:
        return None
    # NOTE(review): index 4 is skipped, exactly as in the original mapping --
    # confirm that column is meant to be left out.
    return {
        '0th': cells[0],
        '1st': cells[1],
        '2nd': cells[2],
        '3rd': cells[3],
        '4th': cells[5],
        '5th': cells[6],
        '6th': cells[7],
        '7th': cells[8],
    }


def main():
    """Walk every storm/year option pair and print each result-table row."""
    global driver
    from selenium.webdriver.common.by import By

    driver = webdriver.Firefox()
    try:
        driver.get("http://surge.srcc.lsu.edu/s1.html")
        frame_switch("iframe")

        name_select = Select(
            driver.find_element(By.XPATH, '//select[@id="storm_name"]'))
        # Option 0 is skipped in both loops, as in the original
        # (presumably a placeholder entry -- TODO confirm).
        for i in range(1, len(name_select.options)):
            print("starting loop on option storm " + name_select.options[i].text)
            name_select.select_by_index(i)
            time.sleep(3)

            # Looked up again for every storm, after the pause above.
            year_select = Select(
                driver.find_element(By.XPATH, '//select[@id="year"]'))
            for j in range(1, len(year_select.options)):
                print("starting loop on option year " + year_select.options[j].text)
                year_select.select_by_index(j)

                # Give the page time to generate the table before parsing.
                time.sleep(RENDER_WAIT_SECONDS)
                root = lxml.html.fromstring(driver.page_source)

                # NOTE(review): the table is reported to be dynamically named,
                # so this fixed id may need to be replaced.
                for row in root.xpath('.//table[@id="output_data_table"]//tr'):
                    record = cells_to_record(row.xpath('.//td/text()'))
                    if record is not None:
                        print(record)
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()


if __name__ == "__main__":
    main()
看来您必须先等待一段时间,然后才能调用 "root = lxml.html.fromstring(driver.page_source)"。
如果不等待,您拿到的 html 源代码里还没有由 javascript 生成的 table。请在这条语句之前加上 "time.sleep(10)"。
这样似乎就能取到 table 了。下面我用 BeautifulSoup 写了一个简单的例子。
from selenium import webdriver
import time, re
from selenium.webdriver.support.ui import Select
import lxml.html
from bs4 import BeautifulSoup
# Seconds to wait after picking a year so the page's JavaScript can build the
# result table before the HTML is read.
RENDER_WAIT_SECONDS = 10

# Position, among the <tbody> elements that carry a class attribute, of the
# one that is printed (the original script always took the second match).
DATA_TBODY_INDEX = 1

# Browser session shared by the helpers below; created in main().
driver = None


def frame_switch(css_selector):
    """Switch the shared driver into the frame matched by *css_selector*."""
    # Imported locally so the script's import header does not have to change.
    from selenium.webdriver.common.by import By

    driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, css_selector))


def select_data_body(bodies, index=DATA_TBODY_INDEX):
    """Return ``bodies[index]``, or ``None`` when there are too few matches.

    Args:
        bodies: Sequence of matched ``<tbody>`` elements.
        index: Zero-based position of the wanted element.

    Returns:
        The element at *index*, or ``None`` if *index* is out of range, so
        a page that has not produced the table yet is skipped rather than
        raising ``IndexError``.
    """
    if 0 <= index < len(bodies):
        return bodies[index]
    return None


def main():
    """Walk every storm/year option pair and print the selected table body."""
    global driver
    from selenium.webdriver.common.by import By

    # Matches any class value, i.e. every <tbody> that has a class attribute.
    any_class = re.compile(".*")

    driver = webdriver.Firefox()
    try:
        driver.get("http://surge.srcc.lsu.edu/s1.html")
        frame_switch("iframe")

        name_select = Select(
            driver.find_element(By.XPATH, '//select[@id="storm_name"]'))
        # Option 0 is skipped in both loops, as in the original
        # (presumably a placeholder entry -- TODO confirm).
        for i in range(1, len(name_select.options)):
            print("starting loop on option storm " + name_select.options[i].text)
            name_select.select_by_index(i)
            time.sleep(3)

            # Looked up again for every storm, after the pause above.
            year_select = Select(
                driver.find_element(By.XPATH, '//select[@id="year"]'))
            for j in range(1, len(year_select.options)):
                print("starting loop on option year " + year_select.options[j].text)
                year_select.select_by_index(j)

                # Give the page time to generate the table before parsing.
                time.sleep(RENDER_WAIT_SECONDS)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Search once and reuse the result for both printouts.
                bodies = soup.find_all("tbody", {"class": any_class})
                table_body = select_data_body(bodies)
                if table_body is None:
                    continue

                # print() calls (not Python 2 print statements) so the
                # script also runs on Python 3.
                print(table_body.prettify())
                # Print the text of each table row.
                for table_row in table_body.find_all("tr"):
                    print(table_row.getText())
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()


if __name__ == "__main__":
    main()