Python Selenium web scraping: mistake in a ```while``` loop. How to make the code wait for the page to load and try again?
I am trying to scrape a table from a JavaScript website using Selenium in Python. The process is to submit a form through the Selenium driver and then retrieve the loaded page. Because the website is slow, the table loaded for a previous selection sometimes remains on the page, and the code scrapes the wrong data instead of waiting for the new table to load. I want to make sure the table I scrape corresponds to the selection I made in the dropdown lists. Since the page can take anywhere from 5 seconds to a few minutes to load, specifying a fixed wait time may not work. So I placed a second ```while``` loop inside the first ```while``` loop to verify that the name in the dropdown and the resulting table match before scraping the data. But it does not seem to work, and the data is scraped without the name being verified first. I give an example below. Please tell me how to make it right.
```python
import io
import time

import pandas as pd
from bs4 import BeautifulSoup
from retry import retry
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from webdriver_manager.firefox import GeckoDriverManager

driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

# Web page url
driver.get("http://mnregaweb4.nic.in/netnrega/dynamic_work_details.aspx?page=S&lflag=eng&state_name=KERALA&state_code=16&fin_year=2020-2021&source=national&Digest=s5wXOIOkT98cNVkcwF6NQA")

x = 0
while True:
    try:
        driver.implicitly_wait(5)
        # Find the District dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613")
        time.sleep(6)
        # Find the Block dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001")
        time.sleep(4)
        # Find the GP dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001001")
        time.sleep(4)
        search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
        search_button.click()
        time.sleep(8)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        tables = soup.find_all('table')
        dfs = pd.read_html(str(tables))
        # print(dfs[4])
        df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0, 1])
        c = df1.iat[1, 3]
        print(c)
        c == "Alayamon"
        while True:
            try:
                df1.to_csv("break.csv", index=False)
                break
            except:
                if x:
                    print("waiting...")
                    x = False
        driver.close()
        break
    except:
        if x:
            print("Error")
            x = False
```
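For clarity, the behaviour I want is roughly the sketch below (a hypothetical helper, reusing the imports from the code above): keep polling the page until the scraped table actually matches the GP name selected in the dropdown, and only write the CSV after that check passes.

```python
# Rough sketch of the intended behaviour (hypothetical helper; reuses the
# pd, io, time and BeautifulSoup imports from the code above).
def scrape_when_ready(driver, expected_name, poll_seconds=5):
    while True:
        try:
            soup = BeautifulSoup(driver.page_source, 'lxml')
            dfs = pd.read_html(str(soup.find_all('table')))
            df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)),
                              skiprows=1, header=[0, 1])
            if df1.iat[1, 3] == expected_name:  # table matches the selection
                return df1
        except (IndexError, ValueError):
            pass  # the result table has not rendered yet
        print("waiting...")
        time.sleep(poll_seconds)

# df1 = scrape_when_ready(driver, "Alayamon")
# df1.to_csv("break.csv", index=False)
```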
Hi, hope this helps. Insert this before your web page loads:
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get("http://mnregaweb4.nic.in/netnrega/dynamic_work_details.aspx?page=S&lflag=eng&state_name=KERALA&state_code=16&fin_year=2020-2021&source=national&Digest=s5wXOIOkT98cNVkcwF6NQA")
timeout = 3
try:
    # Wait until an element with id 'main' is present in the DOM
    element_present = EC.presence_of_element_located((By.ID, 'main'))
    WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
    print("Timed out waiting for page to load")
finally:
    print("Page loaded")
```
Documentation for these waits can be found in the Selenium docs.
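Applied to the question's specific case, the same explicit-wait idea can also replace the fixed ```time.sleep()``` calls: wait on the content itself rather than on an element id. This is only a sketch, assuming the selected GP name ("Alayamon") appears in the page source only once the correct table has rendered; the long timeout allows for pages that take minutes to load.

```python
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

# After clicking the search button, wait up to 5 minutes for the table that
# belongs to the current selection; the predicate re-checks the live page
# source every 2 seconds instead of relying on a fixed sleep.
try:
    WebDriverWait(driver, 300, poll_frequency=2).until(
        lambda d: "Alayamon" in d.page_source
    )
except TimeoutException:
    print("Timed out waiting for the selected table to load")
```

If the stale table could also contain that name, a stricter predicate would be needed, for example one that re-scrapes the cell checked in the question's code.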