如何使selenium从网页中提取数据更加健壮和高效?
How to make the data extraction from webpage with selenium more robust and efficient?
我想从yahoo finance 网页中提取所有期权链数据,为了简单起见,我们以看跌期权链数据为例。
首先,加载程序中使用的所有包:
import time
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
将某公司的看跌期权链数据写入目录的函数:
def write_option_chain(code):
    """Scrape every put-option chain for ticker *code* from Yahoo Finance
    and write the combined table to /tmp/<code>.csv.

    Fixes vs. the original: explicit waits replace the long fixed sleeps,
    the Selenium 4 ``find_element(By.XPATH, ...)`` API replaces the removed
    ``find_element(s)_by_xpath`` helpers, ``pd.concat`` replaces
    ``DataFrame.append`` (removed in pandas 2.0), and the browser is
    always released via try/finally even when a wait times out.
    """
    browser = webdriver.Chrome()
    try:
        browser.maximize_window()
        url = "https://finance.yahoo.com/quote/{}/options?p={}".format(code, code)
        browser.get(url)
        # Wait until the expiry-date <select> is populated instead of sleeping.
        WebDriverWait(browser, 30).until(
            EC.visibility_of_element_located((By.XPATH, ".//select/option")))
        date_elems = browser.find_elements(By.XPATH, ".//select/option")
        time_span = len(date_elems)
        print('{} option chains exists in {}'.format(time_span, code))
        frames = []
        # XPath positions are 1-based; the original iterated 1..time_span-1,
        # which skips the last expiry — preserved here. TODO confirm intent.
        for item in range(1, time_span):
            element_date = browser.find_element(
                By.XPATH, './/select/option[{}]'.format(item))
            print("parsing {}'s put option chain on {} now".format(
                code, element_date.text))
            element_date.click()
            # Wait for the puts table cells of the newly selected expiry.
            WebDriverWait(browser, 30).until(
                EC.visibility_of_all_elements_located(
                    (By.XPATH, ".//table[@class='puts W(100%) Pos(r) list-options']//td")))
            put_table = browser.find_element(
                By.XPATH, ".//table[@class='puts W(100%) Pos(r) list-options']")
            frames.append(pd.read_html(put_table.get_attribute('outerHTML'))[0])
        df_all = pd.concat(frames) if frames else pd.DataFrame()
    finally:
        # quit() ends the session and closes all windows; a separate
        # close() call beforehand is redundant.
        browser.quit()
    df_all.to_csv('/tmp/{}.csv'.format(code))
    print('{} option chain written into csv file'.format(code))
要使用列表测试 write_option_chain
:
# Drive the scraper over a list of tickers; report (not swallow) failures.
nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
for item in nas_list:
    try:
        write_option_chain(code=item)
    except Exception as exc:
        # Catch Exception, not a bare except: a bare except also traps
        # KeyboardInterrupt/SystemExit, and hiding the cause made the
        # original failures ("check what happens to ...") undiagnosable.
        print("check what happens to {}: {}".format(item, exc))
        continue
    time.sleep(5)
输出信息显示:
#i omitted many lines for simplicity
18 option chains exists in aapl
parsing aapl's put option chain on August 27, 2021 now
check what happens to aapl
check what happens to adbe
12 option chains exists in adi
parsing adi's put option chain on December 17, 2021 now
adi otpion chain written into csv file
11 option chains exists in adp
parsing adp's put option chain on August 27, 2021 now
adp otpion chain written into csv file
check what happens to adsk
我们根据以上信息做一个总结:
1. 只有 adp 和 adi 将期权链数据写入了所需目录。
2. 只获取了 aapl 和 adp 的期权链数据的一部分。
3. 无法打开 adsk 的期权网页。
4. 执行需要将近 20 分钟。
如何使selenium从网页中提取数据更加健壮和高效?
我不确定我可以使用 requests
和 BeautifulSoup
之后你说清楚了
How to make the data extraction from webpage with Selenium
more robust and efficient?
但这里的 requests
和 BeautifulSoup
代码非常适合我。
import requests # pip install requests
from bs4 import BeautifulSoup # pip install beautifulsoup4
import pandas as pd
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0"}

def scrape(c):
    """Download every put-option chain for ticker *c* from Yahoo Finance
    and write the combined table to /tmp/<c>.csv (no index column)."""
    page = requests.get(f"https://finance.yahoo.com/quote/{c}/options?p={c}", headers=headers)
    soup = BeautifulSoup(page.content, "lxml")
    # Expiry timestamps live in the <option value="..."> attributes of the
    # date <select>.
    timestamps = [opt["value"] for opt in soup.find("select").find_all("option")]
    frames = []
    for t in timestamps:
        page2 = requests.get(
            f"https://finance.yahoo.com/quote/{c}/options?date={t}&p={c}",
            headers=headers)
        soup2 = BeautifulSoup(page2.content, "lxml")
        table = soup2.find("table", class_="puts W(100%) Pos(r) list-options")
        try:
            frames.append(pd.read_html(str(table))[0])
        except ValueError:
            # pd.read_html raises ValueError when no table was found.
            pass
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df = pd.concat(frames) if frames else pd.DataFrame()
    # Fixed: the original wrote to /temp/, which typically does not exist;
    # every other variant on this page writes to /tmp/.
    df.to_csv(f"/tmp/{c}.csv", index=False)

nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
for nas in nas_list:
    scrape(nas)
BeautifulSoup
将比 Selenium
快得多,没有 JavaScript 支持网站。所以,我在这里使用 BeautifulSoup
。是的,您也可以通过使用 browser.page_source
将 Selenium
与 BeautifulSoup
一起使用,但在这里我认为不需要使用 Selenium
.
访问此处了解更多详细信息Selenium versus BeautifulSoup for web scraping
如果可以使用 selenium
以外的其他东西,那么最好的吞吐量可以通过使用 asyncio
和 PyPi
存储库中的 aiohttp
包来实现,因为需要发出的并发 URL 获取请求的数量(因此是比多线程更好的选择)。为了获得更高的性能(此处未完成),代码可以分为获取 URLs(纯 I/O)和数据帧处理(CPU-密集型)并使用多处理池后者。
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import pandas as pd
import time
async def process_code(session, code):
    """Fetch all expiry dates for *code*, scrape each put table
    concurrently, and write the combined frame to /tmp/<code>.csv."""
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?p={code}') as resp:
        if resp.status != 200:
            raise Exception('status returned =', resp.status)
        code_page = await resp.text()
    soup = BeautifulSoup(code_page, 'lxml')
    # Expiry timestamps come from the <option value="..."> attributes.
    dates = [elem['value'] for elem in soup.find('select').find_all('option')]
    # Fan out one request per expiry date concurrently.
    df_tables = await asyncio.gather(*(process_date(session, code, date) for date in dates))
    frames = [df for df in df_tables if df is not None]
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df_all = pd.concat(frames) if frames else pd.DataFrame()
    df_all.to_csv('/tmp/{}.csv'.format(code))

async def process_date(session, code, date):
    """Return the put-option table for one expiry, or None if absent."""
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}') as resp:
        if resp.status != 200:
            raise Exception('status returned =', resp.status)
        code_page = await resp.text()
    soup = BeautifulSoup(code_page, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        # pd.read_html raises ValueError when no table was found.
        return None

async def main():
    nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
    t = time.time()
    # Connection: keep-alive required to prevent ClientPayloadError on some websites:
    async with aiohttp.ClientSession(headers={'Connection': 'keep-alive', 'user-agent': 'my-application'}) as session:
        await asyncio.gather(*(process_code(session, code) for code in nas_list))
    print('Elapsed time:', time.time() - t)

# Test if we are running under iPython or Jupyter Notebook: an event loop
# is already running there, so schedule a task; otherwise asyncio.run()
# (preferred over the deprecated get_event_loop().run_until_complete).
try:
    __IPYTHON__
except NameError:
    asyncio.run(main())
else:
    asyncio.get_running_loop().create_task(main())
这里是多线程版本
from multiprocessing.pool import ThreadPool
from functools import partial
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def process_code(session, pool, code):
    """Scrape every put-option chain for *code* using the shared thread
    pool and write the combined frame to /tmp/<code>.csv."""
    code_page = session.get(f'https://finance.yahoo.com/quote/{code}/options?p={code}')
    soup = BeautifulSoup(code_page.content, 'lxml')
    # Expiry timestamps come from the <option value="..."> attributes.
    dates = [elem['value'] for elem in soup.find('select').find_all('option')]
    frames = [df for df in pool.imap(partial(process_date, session, code), dates)
              if df is not None]
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df_all = pd.concat(frames) if frames else pd.DataFrame()
    df_all.to_csv('/tmp/{}.csv'.format(code))

def process_date(session, code, date):
    """Return the put-option table for one expiry, or None if absent."""
    code_page = session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}')
    soup = BeautifulSoup(code_page.content, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        # pd.read_html raises ValueError when no table was found.
        return None

t = time.time()
nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
with requests.Session() as session:
    session.headers = {'User-Agent': 'my-application'}
    # One pool serves both the per-ticker fan-out and the per-expiry
    # fan-out; the context manager ensures it is closed afterwards
    # (the original leaked the pool's worker threads).
    with ThreadPool(100) as pool:
        pool.map(partial(process_code, session, pool), nas_list)
print('Elapsed time:', time.time() - t)
在这个用例中,使用 selenium 是可以的。您只需要一些优化,以下是我发现的一些示例:
- 使用
headless
模式:由于浏览器需要加载页面上的元素,selenium 测试可能需要一段时间才能完成。无头测试摆脱了这种加载时间,使您可以显着缩短测试时间。在我们的无头测试中,我们发现测试执行时间减少了 30% (source)。
- 避免使用多个
time.sleep()
和 WebDriverWait().until()
(尤其是在 for
循环内),而是使用简单的 .implicitly_wait()
。
代码示例:
def write_option_chain(code):
    """Headless-Chrome variant: scrape all put-option chains for *code*
    and save them to /tmp/<code>.csv, relying on one implicit wait
    instead of per-step sleeps."""
    # Options was never imported in the original snippet — NameError.
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--start-maximized")
    browser = webdriver.Chrome(options=chrome_options)
    try:
        # One implicit wait covers every find_element* call below.
        browser.implicitly_wait(10)
        url = "https://finance.yahoo.com/quote/{}/options?p={}".format(code, code)
        browser.get(url)
        # Selenium 4 removed find_elements_by_xpath; use the By API.
        date_elems = browser.find_elements(By.XPATH, ".//select/option")
        time_span = len(date_elems)
        print('{} option chains exists in {}'.format(time_span, code))
        frames = []
        # 1-based XPath positions; the original skipped the last expiry
        # (range stops at time_span-1) — preserved. TODO confirm intent.
        for item in range(1, time_span):
            element_date = browser.find_element(
                By.XPATH, './/select/option[{}]'.format(item))
            print("parsing {}'s put option chain on {} now".format(
                code, element_date.text))
            element_date.click()
            put_table = browser.find_element(
                By.XPATH, ".//table[@class='puts W(100%) Pos(r) list-options']")
            frames.append(pd.read_html(put_table.get_attribute('outerHTML'))[0])
        # pd.concat replaces DataFrame.append (removed in pandas 2.0).
        df_all = pd.concat(frames) if frames else pd.DataFrame()
    finally:
        # quit() ends the session and closes all windows; always runs.
        browser.quit()
    df_all.to_csv('/tmp/{}.csv'.format(code))
    print('{} option chain written into csv file'.format(code))
然后:
>>nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
>>for item in nas_list:
....write_option_chain(code=item) #this saves your df at /tmp/{code}.csv'
通过这些简单的优化,完成提取所有内容的代码大约需要 180 秒。
我想从yahoo finance 网页中提取所有期权链数据,为了简单起见,我们以看跌期权链数据为例。 首先,加载程序中使用的所有包:
import time
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
将某公司的看跌期权链数据写入目录的函数:
def write_option_chain(code):
    """Scrape every put-option chain for ticker *code* from Yahoo Finance
    and write the combined table to /tmp/<code>.csv.

    Fixes vs. the original: explicit waits replace the long fixed sleeps,
    the Selenium 4 ``find_element(By.XPATH, ...)`` API replaces the removed
    ``find_element(s)_by_xpath`` helpers, ``pd.concat`` replaces
    ``DataFrame.append`` (removed in pandas 2.0), and the browser is
    always released via try/finally even when a wait times out.
    """
    browser = webdriver.Chrome()
    try:
        browser.maximize_window()
        url = "https://finance.yahoo.com/quote/{}/options?p={}".format(code, code)
        browser.get(url)
        # Wait until the expiry-date <select> is populated instead of sleeping.
        WebDriverWait(browser, 30).until(
            EC.visibility_of_element_located((By.XPATH, ".//select/option")))
        date_elems = browser.find_elements(By.XPATH, ".//select/option")
        time_span = len(date_elems)
        print('{} option chains exists in {}'.format(time_span, code))
        frames = []
        # XPath positions are 1-based; the original iterated 1..time_span-1,
        # which skips the last expiry — preserved here. TODO confirm intent.
        for item in range(1, time_span):
            element_date = browser.find_element(
                By.XPATH, './/select/option[{}]'.format(item))
            print("parsing {}'s put option chain on {} now".format(
                code, element_date.text))
            element_date.click()
            # Wait for the puts table cells of the newly selected expiry.
            WebDriverWait(browser, 30).until(
                EC.visibility_of_all_elements_located(
                    (By.XPATH, ".//table[@class='puts W(100%) Pos(r) list-options']//td")))
            put_table = browser.find_element(
                By.XPATH, ".//table[@class='puts W(100%) Pos(r) list-options']")
            frames.append(pd.read_html(put_table.get_attribute('outerHTML'))[0])
        df_all = pd.concat(frames) if frames else pd.DataFrame()
    finally:
        # quit() ends the session and closes all windows; a separate
        # close() call beforehand is redundant.
        browser.quit()
    df_all.to_csv('/tmp/{}.csv'.format(code))
    print('{} option chain written into csv file'.format(code))
要使用列表测试 write_option_chain
:
# Drive the scraper over a list of tickers; report (not swallow) failures.
nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
for item in nas_list:
    try:
        write_option_chain(code=item)
    except Exception as exc:
        # Catch Exception, not a bare except: a bare except also traps
        # KeyboardInterrupt/SystemExit, and hiding the cause made the
        # original failures ("check what happens to ...") undiagnosable.
        print("check what happens to {}: {}".format(item, exc))
        continue
    time.sleep(5)
输出信息显示:
#i omitted many lines for simplicity
18 option chains exists in aapl
parsing aapl's put option chain on August 27, 2021 now
check what happens to aapl
check what happens to adbe
12 option chains exists in adi
parsing adi's put option chain on December 17, 2021 now
adi otpion chain written into csv file
11 option chains exists in adp
parsing adp's put option chain on August 27, 2021 now
adp otpion chain written into csv file
check what happens to adsk
我们根据以上信息做一个总结:
1. 只有 adp 和 adi 将期权链数据写入了所需目录。
2. 只获取了 aapl 和 adp 的期权链数据的一部分。
3. 无法打开 adsk 的期权网页。
4. 执行需要将近 20 分钟。
如何使selenium从网页中提取数据更加健壮和高效?
我不确定我可以使用 requests
和 BeautifulSoup
之后你说清楚了
How to make the data extraction from webpage with
Selenium
more robust and efficient?
但这里的 requests
和 BeautifulSoup
代码非常适合我。
import requests # pip install requests
from bs4 import BeautifulSoup # pip install beautifulsoup4
import pandas as pd
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0"}

def scrape(c):
    """Download every put-option chain for ticker *c* from Yahoo Finance
    and write the combined table to /tmp/<c>.csv (no index column)."""
    page = requests.get(f"https://finance.yahoo.com/quote/{c}/options?p={c}", headers=headers)
    soup = BeautifulSoup(page.content, "lxml")
    # Expiry timestamps live in the <option value="..."> attributes of the
    # date <select>.
    timestamps = [opt["value"] for opt in soup.find("select").find_all("option")]
    frames = []
    for t in timestamps:
        page2 = requests.get(
            f"https://finance.yahoo.com/quote/{c}/options?date={t}&p={c}",
            headers=headers)
        soup2 = BeautifulSoup(page2.content, "lxml")
        table = soup2.find("table", class_="puts W(100%) Pos(r) list-options")
        try:
            frames.append(pd.read_html(str(table))[0])
        except ValueError:
            # pd.read_html raises ValueError when no table was found.
            pass
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df = pd.concat(frames) if frames else pd.DataFrame()
    # Fixed: the original wrote to /temp/, which typically does not exist;
    # every other variant on this page writes to /tmp/.
    df.to_csv(f"/tmp/{c}.csv", index=False)

nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
for nas in nas_list:
    scrape(nas)
BeautifulSoup
将比 Selenium
快得多,没有 JavaScript 支持网站。所以,我在这里使用 BeautifulSoup
。是的,您也可以通过使用 browser.page_source
将 Selenium
与 BeautifulSoup
一起使用,但在这里我认为不需要使用 Selenium
.
访问此处了解更多详细信息Selenium versus BeautifulSoup for web scraping
如果可以使用 selenium
以外的其他东西,那么最好的吞吐量可以通过使用 asyncio
和 PyPi
存储库中的 aiohttp
包来实现,因为需要发出的并发 URL 获取请求的数量(因此是比多线程更好的选择)。为了获得更高的性能(此处未完成),代码可以分为获取 URLs(纯 I/O)和数据帧处理(CPU-密集型)并使用多处理池后者。
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import pandas as pd
import time
async def process_code(session, code):
    """Fetch all expiry dates for *code*, scrape each put table
    concurrently, and write the combined frame to /tmp/<code>.csv."""
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?p={code}') as resp:
        if resp.status != 200:
            raise Exception('status returned =', resp.status)
        code_page = await resp.text()
    soup = BeautifulSoup(code_page, 'lxml')
    # Expiry timestamps come from the <option value="..."> attributes.
    dates = [elem['value'] for elem in soup.find('select').find_all('option')]
    # Fan out one request per expiry date concurrently.
    df_tables = await asyncio.gather(*(process_date(session, code, date) for date in dates))
    frames = [df for df in df_tables if df is not None]
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df_all = pd.concat(frames) if frames else pd.DataFrame()
    df_all.to_csv('/tmp/{}.csv'.format(code))

async def process_date(session, code, date):
    """Return the put-option table for one expiry, or None if absent."""
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}') as resp:
        if resp.status != 200:
            raise Exception('status returned =', resp.status)
        code_page = await resp.text()
    soup = BeautifulSoup(code_page, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        # pd.read_html raises ValueError when no table was found.
        return None

async def main():
    nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
    t = time.time()
    # Connection: keep-alive required to prevent ClientPayloadError on some websites:
    async with aiohttp.ClientSession(headers={'Connection': 'keep-alive', 'user-agent': 'my-application'}) as session:
        await asyncio.gather(*(process_code(session, code) for code in nas_list))
    print('Elapsed time:', time.time() - t)

# Test if we are running under iPython or Jupyter Notebook: an event loop
# is already running there, so schedule a task; otherwise asyncio.run()
# (preferred over the deprecated get_event_loop().run_until_complete).
try:
    __IPYTHON__
except NameError:
    asyncio.run(main())
else:
    asyncio.get_running_loop().create_task(main())
这里是多线程版本
from multiprocessing.pool import ThreadPool
from functools import partial
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def process_code(session, pool, code):
    """Scrape every put-option chain for *code* using the shared thread
    pool and write the combined frame to /tmp/<code>.csv."""
    code_page = session.get(f'https://finance.yahoo.com/quote/{code}/options?p={code}')
    soup = BeautifulSoup(code_page.content, 'lxml')
    # Expiry timestamps come from the <option value="..."> attributes.
    dates = [elem['value'] for elem in soup.find('select').find_all('option')]
    frames = [df for df in pool.imap(partial(process_date, session, code), dates)
              if df is not None]
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df_all = pd.concat(frames) if frames else pd.DataFrame()
    df_all.to_csv('/tmp/{}.csv'.format(code))

def process_date(session, code, date):
    """Return the put-option table for one expiry, or None if absent."""
    code_page = session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}')
    soup = BeautifulSoup(code_page.content, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        # pd.read_html raises ValueError when no table was found.
        return None

t = time.time()
nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
with requests.Session() as session:
    session.headers = {'User-Agent': 'my-application'}
    # One pool serves both the per-ticker fan-out and the per-expiry
    # fan-out; the context manager ensures it is closed afterwards
    # (the original leaked the pool's worker threads).
    with ThreadPool(100) as pool:
        pool.map(partial(process_code, session, pool), nas_list)
print('Elapsed time:', time.time() - t)
在这个用例中,使用 selenium 是可以的。您只需要一些优化,以下是我发现的一些示例:
- 使用
headless
模式:由于浏览器需要加载页面上的元素,selenium 测试可能需要一段时间才能完成。无头测试摆脱了这种加载时间,使您可以显着缩短测试时间。在我们的无头测试中,我们发现测试执行时间减少了 30% (source)。 - 避免使用多个
time.sleep()
和WebDriverWait().until()
(尤其是在for
循环内),而是使用简单的.implicitly_wait()
。
代码示例:
def write_option_chain(code):
    """Headless-Chrome variant: scrape all put-option chains for *code*
    and save them to /tmp/<code>.csv, relying on one implicit wait
    instead of per-step sleeps."""
    # Options was never imported in the original snippet — NameError.
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--start-maximized")
    browser = webdriver.Chrome(options=chrome_options)
    try:
        # One implicit wait covers every find_element* call below.
        browser.implicitly_wait(10)
        url = "https://finance.yahoo.com/quote/{}/options?p={}".format(code, code)
        browser.get(url)
        # Selenium 4 removed find_elements_by_xpath; use the By API.
        date_elems = browser.find_elements(By.XPATH, ".//select/option")
        time_span = len(date_elems)
        print('{} option chains exists in {}'.format(time_span, code))
        frames = []
        # 1-based XPath positions; the original skipped the last expiry
        # (range stops at time_span-1) — preserved. TODO confirm intent.
        for item in range(1, time_span):
            element_date = browser.find_element(
                By.XPATH, './/select/option[{}]'.format(item))
            print("parsing {}'s put option chain on {} now".format(
                code, element_date.text))
            element_date.click()
            put_table = browser.find_element(
                By.XPATH, ".//table[@class='puts W(100%) Pos(r) list-options']")
            frames.append(pd.read_html(put_table.get_attribute('outerHTML'))[0])
        # pd.concat replaces DataFrame.append (removed in pandas 2.0).
        df_all = pd.concat(frames) if frames else pd.DataFrame()
    finally:
        # quit() ends the session and closes all windows; always runs.
        browser.quit()
    df_all.to_csv('/tmp/{}.csv'.format(code))
    print('{} option chain written into csv file'.format(code))
然后:
>>nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
>>for item in nas_list:
....write_option_chain(code=item) #this saves your df at /tmp/{code}.csv'
通过这些简单的优化,完成提取所有内容的代码大约需要 180 秒。