Python 高效的网页抓取?
Python Efficient Web Scraping?
我是 Python 的新手,我正在尝试为股票应用程序制作网络解析器。我实际上是在使用 urllib 为参数列表中的每只股票打开所需的网页,并读取该页面 html 代码的全部内容。然后我将其切片以找到我正在寻找的报价。我实施的方法有效,但我怀疑这并不是实现此结果的最有效方法。我花了一些时间研究其他可以更快读取文件的潜在方法,但似乎没有一种与网络抓取有关。这是我的代码:
from urllib.request import urlopen


def getQuotes(stocks):
    """Fetch the current price for each ticker symbol in *stocks*.

    Returns a dict mapping ticker symbol -> price (float).
    Raises IndexError if the expected markup is not found on the page.

    NOTE(review): finance.google.com no longer serves this page in the
    original form — the endpoint may need updating; verify before use.
    """
    quoteList = {}
    # Decode the response body once and slice `str` directly, instead of
    # re-encoding every search pattern to bytes on each iteration.
    marker = '<span class="pr">\n<span id='
    for stock in stocks:
        # `with` closes the HTTP response even if parsing fails
        # (the original leaked the connection).
        with urlopen("https://finance.google.com/finance?q={}".format(stock)) as html:
            webpageData = html.read().decode()
        scrape1 = webpageData.split(marker)[1].split('</span>')[0]
        quote = scrape1.split('>')[1]
        quoteList[stock] = float(quote)
    return quoteList


print(getQuotes(['FB', 'GOOG', 'TSLA']))
提前谢谢大家!
I'm essentially using urllib to open the desired webpage for each
stock in the argument list and reading the full contents of the html
code for that page. Then I'm slicing that in order to find the quote
I'm looking for.
这是 Beautiful Soup
和 requests
中的实现:
import requests
from bs4 import BeautifulSoup


def get_quotes(*stocks):
    """Return a {ticker: price} dict for each requested ticker symbol."""
    base = 'https://finance.google.com/finance?q={}'

    def fetch_price(symbol):
        # The quote lives in the text of the <span class="pr"> element.
        page = requests.get(base.format(symbol)).text
        markup = BeautifulSoup(page, 'html.parser')
        text = markup.find('span', attrs={'class' : 'pr'}).get_text()
        return float(text.strip())

    return {symbol: fetch_price(symbol) for symbol in stocks}


print(get_quotes('AAPL', 'GE', 'C'))
{'AAPL': 160.86, 'GE': 23.91, 'C': 68.79}
# 1 loop, best of 3: 1.31 s per loop
如评论中所述,您可能需要查看多线程(multithreading)或 grequests。
使用 grequests
发出异步 HTTP 请求:
def get_quotes(*stocks):
    """Fetch quotes for *stocks* concurrently using grequests.

    Returns {ticker: price}. Tickers whose HTTP request failed outright
    are omitted from the result instead of crashing the whole batch.
    """
    quotelist = {}
    base = 'https://finance.google.com/finance?q={}'
    rs = (grequests.get(u) for u in [base.format(stock) for stock in stocks])
    # grequests.map preserves input order, so zip() pairs correctly.
    rs = grequests.map(rs)
    for r, stock in zip(rs, stocks):
        # grequests.map() yields None for a request that failed entirely;
        # the original code then raised AttributeError on r.text.
        if r is None:
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        quote = soup.find('span', attrs={'class' : 'pr'}).get_text().strip()
        quotelist[stock] = float(quote)
    return quotelist
%%timeit
get_quotes('AAPL', 'BAC', 'MMM', 'ATVI',
'PPG', 'MS', 'GOOGL', 'RRC')
1 loop, best of 3: 2.81 s per loop
更新:这是 Dusty Phillips 的 Python 3 面向对象编程 的修改版本,它使用内置 threading
模块。
from threading import Thread
from bs4 import BeautifulSoup
import numpy as np
import requests
class QuoteGetter(Thread):
    """Worker thread that fetches one ticker's price into ``self.quote``."""

    def __init__(self, ticker):
        super().__init__()
        self.ticker = ticker

    def run(self):
        # Runs when .start() is called; stores the parsed price, or NaN
        # when the expected <span class="pr"> element is absent.
        base = 'https://finance.google.com/finance?q={}'
        response = requests.get(base.format(self.ticker))
        soup = BeautifulSoup(response.text, 'html.parser')
        span = soup.find('span', attrs={'class':'pr'})
        try:
            raw = span.get_text().strip().replace(',', '')
            self.quote = float(raw)
        except AttributeError:
            # span is None when the ticker is unknown or the page changed.
            self.quote = np.nan
def get_quotes(tickers):
    """Fetch all *tickers* concurrently; return a {ticker: price} dict."""
    workers = [QuoteGetter(t) for t in tickers]
    for w in workers:
        w.start()
    # Block until every fetch has finished before collecting results.
    for w in workers:
        w.join()
    return {t: w.quote for t, w in zip(tickers, workers)}
# Sample of S&P 500 ticker symbols used to exercise the threaded fetcher.
tickers = [
'A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACN', 'ADBE', 'ADI',
'ADM', 'ADP', 'ADS', 'ADSK', 'AEE', 'AEP', 'AES', 'AET', 'AFL', 'AGN',
'AIG', 'AIV', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALK', 'ALL', 'ALLE',
]
%time get_quotes(tickers)
# Wall time: 1.53 s
我是 Python 的新手,我正在尝试为股票应用程序制作网络解析器。我实际上是在使用 urllib 为参数列表中的每只股票打开所需的网页,并读取该页面 html 代码的全部内容。然后我将其切片以找到我正在寻找的报价。我实施的方法有效,但我怀疑这并不是实现此结果的最有效方法。我花了一些时间研究其他可以更快读取文件的潜在方法,但似乎没有一种与网络抓取有关。这是我的代码:
from urllib.request import urlopen


def getQuotes(stocks):
    """Fetch the current price for each ticker symbol in *stocks*.

    Returns a dict mapping ticker symbol -> price (float).
    Raises IndexError if the expected markup is not found on the page.

    NOTE(review): finance.google.com no longer serves this page in the
    original form — the endpoint may need updating; verify before use.
    """
    quoteList = {}
    # Decode the response body once and slice `str` directly, instead of
    # re-encoding every search pattern to bytes on each iteration.
    marker = '<span class="pr">\n<span id='
    for stock in stocks:
        # `with` closes the HTTP response even if parsing fails
        # (the original leaked the connection).
        with urlopen("https://finance.google.com/finance?q={}".format(stock)) as html:
            webpageData = html.read().decode()
        scrape1 = webpageData.split(marker)[1].split('</span>')[0]
        quote = scrape1.split('>')[1]
        quoteList[stock] = float(quote)
    return quoteList


print(getQuotes(['FB', 'GOOG', 'TSLA']))
提前谢谢大家!
I'm essentially using urllib to open the desired webpage for each stock in the argument list and reading the full contents of the html code for that page. Then I'm slicing that in order to find the quote I'm looking for.
这是 Beautiful Soup
和 requests
中的实现:
import requests
from bs4 import BeautifulSoup


def get_quotes(*stocks):
    """Return a {ticker: price} dict for each requested ticker symbol."""
    base = 'https://finance.google.com/finance?q={}'

    def fetch_price(symbol):
        # The quote lives in the text of the <span class="pr"> element.
        page = requests.get(base.format(symbol)).text
        markup = BeautifulSoup(page, 'html.parser')
        text = markup.find('span', attrs={'class' : 'pr'}).get_text()
        return float(text.strip())

    return {symbol: fetch_price(symbol) for symbol in stocks}


print(get_quotes('AAPL', 'GE', 'C'))
{'AAPL': 160.86, 'GE': 23.91, 'C': 68.79}
# 1 loop, best of 3: 1.31 s per loop
如评论中所述,您可能需要查看多线程(multithreading)或 grequests。
使用 grequests
发出异步 HTTP 请求:
def get_quotes(*stocks):
    """Fetch quotes for *stocks* concurrently using grequests.

    Returns {ticker: price}. Tickers whose HTTP request failed outright
    are omitted from the result instead of crashing the whole batch.
    """
    quotelist = {}
    base = 'https://finance.google.com/finance?q={}'
    rs = (grequests.get(u) for u in [base.format(stock) for stock in stocks])
    # grequests.map preserves input order, so zip() pairs correctly.
    rs = grequests.map(rs)
    for r, stock in zip(rs, stocks):
        # grequests.map() yields None for a request that failed entirely;
        # the original code then raised AttributeError on r.text.
        if r is None:
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        quote = soup.find('span', attrs={'class' : 'pr'}).get_text().strip()
        quotelist[stock] = float(quote)
    return quotelist
%%timeit
get_quotes('AAPL', 'BAC', 'MMM', 'ATVI',
'PPG', 'MS', 'GOOGL', 'RRC')
1 loop, best of 3: 2.81 s per loop
更新:这是 Dusty Phillips 的 Python 3 面向对象编程 的修改版本,它使用内置 threading
模块。
from threading import Thread
from bs4 import BeautifulSoup
import numpy as np
import requests
class QuoteGetter(Thread):
    """Worker thread that fetches one ticker's price into ``self.quote``."""

    def __init__(self, ticker):
        super().__init__()
        self.ticker = ticker

    def run(self):
        # Runs when .start() is called; stores the parsed price, or NaN
        # when the expected <span class="pr"> element is absent.
        base = 'https://finance.google.com/finance?q={}'
        response = requests.get(base.format(self.ticker))
        soup = BeautifulSoup(response.text, 'html.parser')
        span = soup.find('span', attrs={'class':'pr'})
        try:
            raw = span.get_text().strip().replace(',', '')
            self.quote = float(raw)
        except AttributeError:
            # span is None when the ticker is unknown or the page changed.
            self.quote = np.nan
def get_quotes(tickers):
    """Fetch all *tickers* concurrently; return a {ticker: price} dict."""
    workers = [QuoteGetter(t) for t in tickers]
    for w in workers:
        w.start()
    # Block until every fetch has finished before collecting results.
    for w in workers:
        w.join()
    return {t: w.quote for t, w in zip(tickers, workers)}
# Sample of S&P 500 ticker symbols used to exercise the threaded fetcher.
tickers = [
'A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACN', 'ADBE', 'ADI',
'ADM', 'ADP', 'ADS', 'ADSK', 'AEE', 'AEP', 'AES', 'AET', 'AFL', 'AGN',
'AIG', 'AIV', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALK', 'ALL', 'ALLE',
]
%time get_quotes(tickers)
# Wall time: 1.53 s