How to make the data extraction from a webpage with Selenium more robust and efficient?

I want to extract all the option chain data from Yahoo Finance webpages; for simplicity, let's take the put option chain data as an example. First, load all the packages used in the program:

import time 
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

A function that writes a company's put option chain data to a directory:

def write_option_chain(code):
    browser = webdriver.Chrome()
    browser.maximize_window()
    url = "https://finance.yahoo.com/quote/{}/options?p={}".format(code,code)
    browser.get(url)
    WebDriverWait(browser,10).until(EC.visibility_of_element_located((By.XPATH, ".//select/option")))
    time.sleep(25)
    date_elem = browser.find_elements_by_xpath(".//select/option")
    time_span = len(date_elem)
    print('{} option chains exist in {}'.format(time_span, code))
    df_all = pd.DataFrame()
    for item in range(1,time_span):
        element_date = browser.find_element_by_xpath('.//select/option[{}]'.format(item))
        print("parsing {}'s  put option chain on {} now".format(code,element_date.text))
        element_date.click()
        WebDriverWait(browser,10).until(EC.visibility_of_all_elements_located((By.XPATH, ".//table[@class='puts W(100%) Pos(r) list-options']//td")))
        time.sleep(11)
        put_table = browser.find_element_by_xpath((".//table[@class='puts W(100%) Pos(r) list-options']"))
        put_table_string = put_table.get_attribute('outerHTML')
        df_put = pd.read_html(put_table_string)[0]
        df_all = df_all.append(df_put)
    browser.close()
    browser.quit()
    df_all.to_csv('/tmp/{}.csv'.format(code))
    print('{} option chain written into csv file'.format(code))

To test write_option_chain with a list of tickers:

nas_list = ['aapl','adbe','adi','adp','adsk']
for item in nas_list:
    try:
        write_option_chain(code=item)
    except:
        print("check what happens to {} ".format(item))
        continue
    time.sleep(5)

The output shows:

# many lines omitted for simplicity
18 option chains exist in aapl
parsing aapl's  put option chain on August 27, 2021 now
check what happens to aapl 
check what happens to adbe 
12 option chains exist in adi
parsing adi's  put option chain on December 17, 2021 now
adi option chain written into csv file
11 option chains exist in adp
parsing adp's  put option chain on August 27, 2021 now
adp option chain written into csv file
check what happens to adsk 

To summarize from the above output:

1. Only adi and adp wrote their option chain data to the desired directory.
2. We only got part of the option chain data for aapl and adp.
3. The options webpage for adsk could not be opened.
4. It takes nearly 20 minutes to execute.

How can I make the data extraction from the webpage with Selenium more robust and efficient?

I wasn't sure whether I could use requests and BeautifulSoup, since you stated it clearly in the title:

How to make the data extraction from webpage with Selenium more robust and efficient?

but the requests and BeautifulSoup code here works perfectly for me.

import requests # pip install requests
from bs4 import BeautifulSoup # pip install beautifulsoup4
import pandas as pd

headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0"}

def scrape(c):
    page=requests.get(f"https://finance.yahoo.com/quote/{c}/options?p={c}",headers=headers)
    soup=BeautifulSoup(page.content,"lxml")

    timestamp=list(map(lambda x: x["value"],soup.find("select").find_all("option")))
    # Extracting timestamp from <select>'s <option>

    df=pd.DataFrame()

    for t in timestamp: # Looping through the list of timestamp
        page2=requests.get(f"https://finance.yahoo.com/quote/{c}/options?date={t}&p={c}",headers=headers)
        soup2=BeautifulSoup(page2.content,"lxml")

        table=soup2.find("table",class_="puts W(100%) Pos(r) list-options")
        try:
            tabledf=pd.read_html(str(table))[0]
            df=df.append(tabledf)
        except ValueError:
            pass

    df.to_csv(f"/temp/{c}.csv",index=False)

nas_list = ['aapl','adbe','adi','adp','adsk']
for nas in nas_list:
    scrape(nas)

BeautifulSoup will be much faster than Selenium for websites that don't require JavaScript support, so I used BeautifulSoup here. Yes, you can also use Selenium together with BeautifulSoup by passing browser.page_source to BeautifulSoup, but I don't think Selenium is needed here.

Visit Selenium versus BeautifulSoup for web scraping for more details.
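
For completeness, here is a minimal sketch of that Selenium-plus-BeautifulSoup combination via browser.page_source (illustrative only; it assumes the same Yahoo Finance page structure as above):

from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Chrome()
browser.get("https://finance.yahoo.com/quote/aapl/options?p=aapl")
# Hand the JavaScript-rendered HTML over to BeautifulSoup for parsing
soup = BeautifulSoup(browser.page_source, "lxml")
dates = [opt["value"] for opt in soup.find("select").find_all("option")]
browser.quit()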

If something other than Selenium can be used, the best throughput would be achieved with asyncio and the aiohttp package from the PyPI repository, given the number of concurrent URL fetch requests that need to be made (this makes it a better choice than multithreading). For even greater performance (not done here), the code could be split into URL fetching (pure I/O) and DataFrame processing (CPU-intensive), with a multiprocessing pool used for the latter.

import asyncio
import aiohttp
from bs4 import BeautifulSoup
import pandas as pd
import time

async def process_code(session, code):
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?p={code}') as resp:
        status = resp.status
        if status != 200:
            raise Exception('status returned =', status)
        code_page = await resp.text()
    soup = BeautifulSoup(code_page, 'lxml')
    dates = [elem['value'] for elem in soup.find('select').find_all('option')]
    df_all = pd.DataFrame()
    df_tables = await asyncio.gather(*(process_date(session, code, date) for date in dates))
    for df_table in df_tables:
        if df_table is not None:
            df_all = df_all.append(df_table)
    df_all.to_csv('/tmp/{}.csv'.format(code))

async def process_date(session, code, date):
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}') as resp:
        status = resp.status
        if status != 200:
            raise Exception('status returned =', status)
        code_page = await resp.text()
    soup = BeautifulSoup(code_page, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        return None

async def main():
    nas_list = ['aapl','adbe','adi','adp','adsk']
    # Connection: keep-alive required to prevent ClientPayloadError on some websites:
    t = time.time()
    async with aiohttp.ClientSession(headers = {'Connection': 'keep-alive', 'user-agent': 'my-application'}) as session:
        await asyncio.gather(*(process_code(session, code) for code in nas_list))
    print('Elapsed time:', time.time() - t)

# Test if we are running under iPython or Jupyter Notebook:
try:
    __IPYTHON__
except NameError:
    asyncio.get_event_loop().run_until_complete(main())
else:
    asyncio.get_running_loop().create_task(main())
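
To illustrate the fetch/parse split suggested above (not done in the code here), a rough sketch might offload the CPU-bound pd.read_html call to a process pool via run_in_executor. The helper names are hypothetical, and in practice you would pass the puts table's HTML rather than the whole page:

import asyncio
from concurrent.futures import ProcessPoolExecutor

import pandas as pd

def parse_table(html):
    # CPU-intensive step: runs inside a worker process
    try:
        return pd.read_html(html)[0]
    except ValueError:
        return None

async def fetch_and_parse(session, pool, url):
    async with session.get(url) as resp:
        html = await resp.text()  # pure I/O: overlaps with other fetches
    loop = asyncio.get_running_loop()
    # Offload the CPU-bound parsing so it doesn't block the event loop
    return await loop.run_in_executor(pool, parse_table, html)

Here pool would be a ProcessPoolExecutor created once (for example in main()) and shared across all calls.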

And here is the multithreading version:

from multiprocessing.pool import ThreadPool
from functools import partial
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def process_code(session, pool, code):
    code_page = session.get(f'https://finance.yahoo.com/quote/{code}/options?p={code}')
    soup = BeautifulSoup(code_page.content, 'lxml')
    dates = [elem['value'] for elem in soup.find('select').find_all('option')]
    df_all = pd.DataFrame()
    for df_table in pool.imap(partial(process_date, session, code), dates):
        if df_table is not None:
            df_all = df_all.append(df_table)
    df_all.to_csv('/tmp/{}.csv'.format(code))

def process_date(session, code, date):
    code_page = session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}')
    soup = BeautifulSoup(code_page.content, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        return None

t = time.time()
nas_list = ['aapl','adbe','adi','adp','adsk']
with requests.Session() as session:
    headers = {'User-Agent': 'my-application'}
    session.headers = headers
    pool = ThreadPool(100)
    pool.map(partial(process_code, session, pool), nas_list)
print('Elapsed time:', time.time() - t)

For this use case, using Selenium is fine. You just need a few optimizations; here are some I found:

  • Use headless mode: since the browser needs to load the elements on the page, Selenium tests can take a while to complete. Headless testing gets rid of this load time, allowing you to cut your testing times significantly. In our headless testing, we found a 30% reduction in test execution time (source).
  • Avoid multiple time.sleep() and WebDriverWait(...).until() calls (especially inside a for loop); use a single .implicitly_wait() instead.

Code example:

from selenium.webdriver.chrome.options import Options  # needed for headless mode

def write_option_chain(code):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--start-maximized")
    browser = webdriver.Chrome(options=chrome_options)
    browser.implicitly_wait(10)
    url = "https://finance.yahoo.com/quote/{}/options?p={}".format(code, code)
    browser.get(url)
    date_elem = browser.find_elements_by_xpath(".//select/option")
    time_span = len(date_elem)
    print('{} option chains exists in {}'.format(time_span, code))
    df_all = pd.DataFrame()
    for item in range(1, time_span):
        element_date = browser.find_element_by_xpath('.//select/option[{}]'.format(item))
        print("parsing {}'s  put option chain on {} now".format(
        code, element_date.text))
        element_date.click()
        put_table = browser.find_element_by_xpath((".//table[@class='puts W(100%) Pos(r) list-options']"))
        put_table_string = put_table.get_attribute('outerHTML')
        df_put = pd.read_html(put_table_string)[0]
        df_all = df_all.append(df_put)
    browser.close()
    browser.quit()
    df_all.to_csv('/tmp/{}.csv'.format(code))
    print('{} option chain written into csv file'.format(code))

Then:

>>> nas_list = ['aapl', 'adbe', 'adi', 'adp', 'adsk']
>>> for item in nas_list:
...     write_option_chain(code=item)  # this saves your df at /tmp/{code}.csv

With these simple optimizations, the code took only about 180 seconds to extract everything.