Python Selenium web scraping: how to keep retrying when the network is down or the scraped website is slow?
I am trying to scrape a table from the web using Selenium in Python, but the website is slow and has frequent network problems, so I want the code to keep retrying even when the page takes a long time to load. I have 941 entries to scrape in total. I tried a module I found online called retry, but it does not seem to work; a code sample is given below. Is there any other way to make the code keep retrying until the website loads?
```
import pandas as pd
import io
import time
from bs4 import BeautifulSoup  # used below to parse driver.page_source
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from webdriver_manager.firefox import GeckoDriverManager
from retry import retry

driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

# Web page url
driver.get("http://mnregaweb4.nic.in/netnrega/dynamic_work_details.aspx?page=S&lflag=eng&state_name=KERALA&state_code=16&fin_year=2020-2021&source=national&Digest=s5wXOIOkT98cNVkcwF6NQA")

@retry()
def make_trouble():
    '''Retry until succeed'''
    driver.implicitly_wait(5)
    # Find the District dropdown
    x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
    drop = Select(x)
    # Select by value
    drop.select_by_value("1613")

@retry()
def make_trouble():
    '''Retry until succeed'''
    time.sleep(6)
    # Find the Block dropdown
    x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
    drop = Select(x)
    # Select by value
    drop.select_by_value("1613001")

@retry()
def make_trouble():
    '''Retry until succeed'''
    time.sleep(4)
    # Find the GP dropdown
    x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
    drop = Select(x)
    # Select by value
    drop.select_by_value("1613001001")

@retry()
def make_trouble():
    '''Retry until succeed'''
    time.sleep(4)
    search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
    search_button.click()
    time.sleep(8)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    tables = soup.find_all('table')
    dfs = pd.read_html(str(tables))
    print(dfs[4])
    df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
    df1.to_csv("test with pandas V3.csv", index=False)
    driver.close()
```
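As written, all four decorated functions share the name make_trouble (each definition overwrites the previous one) and none of them is ever called, so the retry decorator never actually runs. For reference, a minimal sketch of the decorator wired up for a single step; the tries/delay values are arbitrary assumptions, and it reuses the driver and element ID from above:

```
from retry import retry
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import Select

@retry(WebDriverException, tries=20, delay=5)
def select_district():
    '''Re-run every 5 s, up to 20 times, until it stops raising.'''
    # assumes the driver object created above
    drop = Select(driver.find_element_by_id('ContentPlaceHolder1_ddl_dist'))
    drop.select_by_value("1613")

select_district()  # the decorator only fires when the function is called
```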
You can keep retrying by wrapping the whole sequence in a while True / try / except loop:

```
while True:
    try:
        driver.implicitly_wait(5)
        # Find the District dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613")
        time.sleep(6)
        # Find the Block dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001")
        time.sleep(4)
        # Find the GP dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001001")
        time.sleep(4)
        search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
        search_button.click()
        time.sleep(8)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        tables = soup.find_all('table')
        dfs = pd.read_html(str(tables))
        print(dfs[4])
        df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
        df1.to_csv("test with pandas V3.csv", index=False)
        driver.close()
        break  # stop retrying once the scrape has succeeded
    except Exception:  # Exception (not a bare except) keeps Ctrl-C usable
        print("Error")
```
This is not my code; it was modified by Sangun Devkota at the request of the moderator ABC.
The following version prints an error only once every 5 loops:
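As a side note, the fixed time.sleep calls above can be replaced with Selenium's explicit waits, which poll until the element is actually present instead of sleeping for a fixed time. A minimal sketch, assuming the same driver and element IDs and an arbitrary 60-second timeout:

```
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Poll for up to 60 s, then raise TimeoutException (assumes the driver above)
wait = WebDriverWait(driver, 60)
dist = wait.until(EC.presence_of_element_located((By.ID, 'ContentPlaceHolder1_ddl_dist')))
Select(dist).select_by_value("1613")
```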
```
err_count = 0  # renamed from x so the element lookups below cannot clobber the counter
while True:
    try:
        driver.implicitly_wait(5)
        # Find the District dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613")
        time.sleep(6)
        # Find the Block dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001")
        time.sleep(4)
        # Find the GP dropdown
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001001")
        time.sleep(4)
        search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
        search_button.click()
        time.sleep(8)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        tables = soup.find_all('table')
        dfs = pd.read_html(str(tables))
        print(dfs[4])
        df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
        df1.to_csv("test with pandas V3.csv", index=False)
        driver.close()
        break  # stop retrying once the scrape has succeeded
    except Exception:
        if err_count % 5 == 0:
            print("Error")
        err_count += 1
```
If you want it to print only once, you can change it to this:

```
first_error = True  # a flag instead of a counter, for the same clobbering reason
... other code ...
    except Exception:
        if first_error:
            print('Error')
            first_error = False
```
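Putting these pieces together, here is a sketch of the whole retry loop that also reports the actual exception; scrape_once is a hypothetical stand-in for the district/block/GP selection and table download above:

```
import time

def scrape_once():
    '''Hypothetical placeholder for the selection and scraping steps above.'''
    ...

first_error = True
while True:
    try:
        scrape_once()
        break  # leave the loop as soon as one full scrape succeeds
    except Exception as e:  # bind the exception so the message says what failed
        if first_error:
            print(f"Scrape failed, will keep retrying: {e!r}")
            first_error = False
        time.sleep(5)  # brief pause between attempts so failures don't spin
```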