如何使用 Beautiful Soup BS4 在 Yahoo Finance 上抓取多个页面
How to scrape multiple pages on Yahoo Finance with Beautiful Soup BS4
我是 Python 的新手,正在尝试使用 BS4 从 Yahoo Finance 获取一些财务数据。
对于单个页面,脚本工作得很好。但是,现在我正在尝试一次抓取多个页面,但是 for url in urls:
循环无法按预期工作。它只会从最后一个 url 中抓取数据。
有人知道如何解决吗?
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import ssl
import json
import ast
import os
from urllib2 import Request, urlopen
import datetime
# For ignoring SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# Input from the user
urls = ['https://finance.yahoo.com/quote/ALV.DE?p=ALV.DE&.tsrc=fin-srch', 'https://finance.yahoo.com/quote/SAP?p=SAP&.tsrc=fin-srch']
# Making the website believe that you are accessing it using a Mozilla browser
for url in urls:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Creating a BeautifulSoup object of the HTML page for easy extraction of data.
soup = BeautifulSoup(webpage, 'html.parser')
html = soup.prettify('utf-8')
company_json = {}
other_details = {}
for h1 in soup.findAll('h1'):
company_json['TICKER'] = h1.text.strip()
for span in soup.findAll('span',attrs={'class': 'Trsdu(0.3s) Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(b)'}):
company_json['PRESENT_VALUE'] = span.text.strip()
for div in soup.findAll('div', attrs={'class': 'D(ib) Va(t)'}):
for span in div.findAll('span', recursive=False):
company_json['PRESENT_GROWTH'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'PREV_CLOSE-value'}):
for span in td.findAll('span', recursive=False):
other_details['PREV_CLOSE'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'OPEN-value'}):
for span in td.findAll('span', recursive=False):
other_details['OPEN'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'BID-value'}):
for span in td.findAll('span', recursive=False):
other_details['BID'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'ASK-value'}):
for span in td.findAll('span', recursive=False):
other_details['ASK'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'DAYS_RANGE-value'}):
for span in td.findAll('span', recursive=False):
other_details['DAYS_RANGE'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'FIFTY_TWO_WK_RANGE-value'}):
for span in td.findAll('span', recursive=False):
other_details['FIFTY_TWO_WK_RANGE'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'TD_VOLUME-value'}):
for span in td.findAll('span', recursive=False):
other_details['TD_VOLUME'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'AVERAGE_VOLUME_3MONTH-value'}):
for span in td.findAll('span', recursive=False):
other_details['AVERAGE_VOLUME_3MONTH'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'MARKET_CAP-value'}):
for span in td.findAll('span', recursive=False):
other_details['MARKET_CAP'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'BETA_3Y-value'}):
for span in td.findAll('span', recursive=False):
other_details['BETA_3Y'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'PE_RATIO-value'}):
for span in td.findAll('span', recursive=False):
other_details['PE_RATIO'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'EPS_RATIO-value'}):
for span in td.findAll('span', recursive=False):
other_details['EPS_RATIO'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'EARNINGS_DATE-value'}):
other_details['EARNINGS_DATE'] = []
for span in td.findAll('span', recursive=False):
other_details['EARNINGS_DATE'].append(span.text.strip())
for td in soup.findAll('td',attrs={'data-test': 'DIVIDEND_AND_YIELD-value'}):
other_details['DIVIDEND_AND_YIELD'] = td.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'EX_DIVIDEND_DATE-value'}):
for span in td.findAll('span', recursive=False):
other_details['EX_DIVIDEND_DATE'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'ONE_YEAR_TARGET_PRICE-value' }):
for span in td.findAll('span', recursive=False):
other_details['ONE_YEAR_TARGET_PRICE'] = span.text.strip()
other_details['DATE'] = str(datetime.datetime.now())
company_json['OTHER_DETAILS'] = other_details
with open('dax30_kpis.json', 'a') as outfile:
json.dump(company_json, outfile, indent=4)
print company_json
print '----------Extraction of data is complete. Check json file.----------'
你的缩进好像有误:
# Making the website believe that you are accessing it using a Mozilla browser
for url in urls:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Creating a BeautifulSoup object of the HTML page for easy extraction of data.
soup = BeautifulSoup(webpage, 'html.parser')
# ... rest of the code
应该是
# Making the website believe that you are accessing it using a Mozilla browser
for url in urls:
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    # Creating a BeautifulSoup object of the HTML page for easy extraction of data.
    soup = BeautifulSoup(webpage, 'html.parser')
    # ... rest of the code
此更改是必要的,因为您在循环中请求了所有 url 并把结果保存在同一个变量中。因此,您的实现最终会覆盖之前抓取到的网页,只处理最后一个 url 返回的结果。
你需要把所有的处理都放到
#... website processing code
with open('dax30_kpis.json', 'a') as outfile:
    json.dump(company_json, outfile, indent=4)
进入 for
循环为:
###code before###
# Making the website believe that you are accessing it using a Mozilla browser
for url in urls:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Creating a BeautifulSoup object of the HTML page for easy extraction of data.
soup = BeautifulSoup(webpage, 'html.parser')
html = soup.prettify('utf-8')
company_json = {}
other_details = {}
for h1 in soup.findAll('h1'):
company_json['TICKER'] = h1.text.strip()
for span in soup.findAll('span',attrs={'class': 'Trsdu(0.3s) Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(b)'}):
company_json['PRESENT_VALUE'] = span.text.strip()
for div in soup.findAll('div', attrs={'class': 'D(ib) Va(t)'}):
for span in div.findAll('span', recursive=False):
company_json['PRESENT_GROWTH'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'PREV_CLOSE-value'}):
for span in td.findAll('span', recursive=False):
other_details['PREV_CLOSE'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'OPEN-value'}):
for span in td.findAll('span', recursive=False):
other_details['OPEN'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'BID-value'}):
for span in td.findAll('span', recursive=False):
other_details['BID'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'ASK-value'}):
for span in td.findAll('span', recursive=False):
other_details['ASK'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'DAYS_RANGE-value'}):
for span in td.findAll('span', recursive=False):
other_details['DAYS_RANGE'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'FIFTY_TWO_WK_RANGE-value'}):
for span in td.findAll('span', recursive=False):
other_details['FIFTY_TWO_WK_RANGE'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'TD_VOLUME-value'}):
for span in td.findAll('span', recursive=False):
other_details['TD_VOLUME'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'AVERAGE_VOLUME_3MONTH-value'}):
for span in td.findAll('span', recursive=False):
other_details['AVERAGE_VOLUME_3MONTH'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'MARKET_CAP-value'}):
for span in td.findAll('span', recursive=False):
other_details['MARKET_CAP'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'BETA_3Y-value'}):
for span in td.findAll('span', recursive=False):
other_details['BETA_3Y'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'PE_RATIO-value'}):
for span in td.findAll('span', recursive=False):
other_details['PE_RATIO'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'EPS_RATIO-value'}):
for span in td.findAll('span', recursive=False):
other_details['EPS_RATIO'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'EARNINGS_DATE-value'}):
other_details['EARNINGS_DATE'] = []
for span in td.findAll('span', recursive=False):
other_details['EARNINGS_DATE'].append(span.text.strip())
for td in soup.findAll('td',attrs={'data-test': 'DIVIDEND_AND_YIELD-value'}):
other_details['DIVIDEND_AND_YIELD'] = td.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'EX_DIVIDEND_DATE-value'}):
for span in td.findAll('span', recursive=False):
other_details['EX_DIVIDEND_DATE'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'ONE_YEAR_TARGET_PRICE-value' }):
for span in td.findAll('span', recursive=False):
other_details['ONE_YEAR_TARGET_PRICE'] = span.text.strip()
other_details['DATE'] = str(datetime.datetime.now())
company_json['OTHER_DETAILS'] = other_details
with open('dax30_kpis.json', 'a') as outfile:
json.dump(company_json, outfile, indent=4)
print company_json
### Code after ###
我是 Python 的新手,正在尝试使用 BS4 从 Yahoo Finance 获取一些财务数据。
对于单个页面,脚本工作得很好。但是,现在我正在尝试一次抓取多个页面,但是 for url in urls:
循环无法按预期工作。它只会从最后一个 url 中抓取数据。
有人知道如何解决吗?
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import ssl
import json
import ast
import os
from urllib2 import Request, urlopen
import datetime
# For ignoring SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# Input from the user
urls = ['https://finance.yahoo.com/quote/ALV.DE?p=ALV.DE&.tsrc=fin-srch', 'https://finance.yahoo.com/quote/SAP?p=SAP&.tsrc=fin-srch']
# Making the website believe that you are accessing it using a Mozilla browser
for url in urls:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Creating a BeautifulSoup object of the HTML page for easy extraction of data.
soup = BeautifulSoup(webpage, 'html.parser')
html = soup.prettify('utf-8')
company_json = {}
other_details = {}
for h1 in soup.findAll('h1'):
company_json['TICKER'] = h1.text.strip()
for span in soup.findAll('span',attrs={'class': 'Trsdu(0.3s) Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(b)'}):
company_json['PRESENT_VALUE'] = span.text.strip()
for div in soup.findAll('div', attrs={'class': 'D(ib) Va(t)'}):
for span in div.findAll('span', recursive=False):
company_json['PRESENT_GROWTH'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'PREV_CLOSE-value'}):
for span in td.findAll('span', recursive=False):
other_details['PREV_CLOSE'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'OPEN-value'}):
for span in td.findAll('span', recursive=False):
other_details['OPEN'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'BID-value'}):
for span in td.findAll('span', recursive=False):
other_details['BID'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'ASK-value'}):
for span in td.findAll('span', recursive=False):
other_details['ASK'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'DAYS_RANGE-value'}):
for span in td.findAll('span', recursive=False):
other_details['DAYS_RANGE'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'FIFTY_TWO_WK_RANGE-value'}):
for span in td.findAll('span', recursive=False):
other_details['FIFTY_TWO_WK_RANGE'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'TD_VOLUME-value'}):
for span in td.findAll('span', recursive=False):
other_details['TD_VOLUME'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'AVERAGE_VOLUME_3MONTH-value'}):
for span in td.findAll('span', recursive=False):
other_details['AVERAGE_VOLUME_3MONTH'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'MARKET_CAP-value'}):
for span in td.findAll('span', recursive=False):
other_details['MARKET_CAP'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'BETA_3Y-value'}):
for span in td.findAll('span', recursive=False):
other_details['BETA_3Y'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'PE_RATIO-value'}):
for span in td.findAll('span', recursive=False):
other_details['PE_RATIO'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'EPS_RATIO-value'}):
for span in td.findAll('span', recursive=False):
other_details['EPS_RATIO'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'EARNINGS_DATE-value'}):
other_details['EARNINGS_DATE'] = []
for span in td.findAll('span', recursive=False):
other_details['EARNINGS_DATE'].append(span.text.strip())
for td in soup.findAll('td',attrs={'data-test': 'DIVIDEND_AND_YIELD-value'}):
other_details['DIVIDEND_AND_YIELD'] = td.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'EX_DIVIDEND_DATE-value'}):
for span in td.findAll('span', recursive=False):
other_details['EX_DIVIDEND_DATE'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'ONE_YEAR_TARGET_PRICE-value' }):
for span in td.findAll('span', recursive=False):
other_details['ONE_YEAR_TARGET_PRICE'] = span.text.strip()
other_details['DATE'] = str(datetime.datetime.now())
company_json['OTHER_DETAILS'] = other_details
with open('dax30_kpis.json', 'a') as outfile:
json.dump(company_json, outfile, indent=4)
print company_json
print '----------Extraction of data is complete. Check json file.----------'
你的缩进好像有误:
# Making the website believe that you are accessing it using a Mozilla browser
for url in urls:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Creating a BeautifulSoup object of the HTML page for easy extraction of data.
soup = BeautifulSoup(webpage, 'html.parser')
# ... rest of the code
应该是
# Making the website believe that you are accessing it using a Mozilla browser
for url in urls:
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    # Creating a BeautifulSoup object of the HTML page for easy extraction of data.
    soup = BeautifulSoup(webpage, 'html.parser')
    # ... rest of the code
此更改是必要的,因为您在循环中请求了所有 url 并把结果保存在同一个变量中。因此,您的实现最终会覆盖之前抓取到的网页,只处理最后一个 url 返回的结果。
你需要把所有的处理都放到
#... website processing code
with open('dax30_kpis.json', 'a') as outfile:
    json.dump(company_json, outfile, indent=4)
进入 for
循环为:
###code before###
# Making the website believe that you are accessing it using a Mozilla browser
for url in urls:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# Creating a BeautifulSoup object of the HTML page for easy extraction of data.
soup = BeautifulSoup(webpage, 'html.parser')
html = soup.prettify('utf-8')
company_json = {}
other_details = {}
for h1 in soup.findAll('h1'):
company_json['TICKER'] = h1.text.strip()
for span in soup.findAll('span',attrs={'class': 'Trsdu(0.3s) Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(b)'}):
company_json['PRESENT_VALUE'] = span.text.strip()
for div in soup.findAll('div', attrs={'class': 'D(ib) Va(t)'}):
for span in div.findAll('span', recursive=False):
company_json['PRESENT_GROWTH'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'PREV_CLOSE-value'}):
for span in td.findAll('span', recursive=False):
other_details['PREV_CLOSE'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'OPEN-value'}):
for span in td.findAll('span', recursive=False):
other_details['OPEN'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'BID-value'}):
for span in td.findAll('span', recursive=False):
other_details['BID'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'ASK-value'}):
for span in td.findAll('span', recursive=False):
other_details['ASK'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'DAYS_RANGE-value'}):
for span in td.findAll('span', recursive=False):
other_details['DAYS_RANGE'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'FIFTY_TWO_WK_RANGE-value'}):
for span in td.findAll('span', recursive=False):
other_details['FIFTY_TWO_WK_RANGE'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'TD_VOLUME-value'}):
for span in td.findAll('span', recursive=False):
other_details['TD_VOLUME'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'AVERAGE_VOLUME_3MONTH-value'}):
for span in td.findAll('span', recursive=False):
other_details['AVERAGE_VOLUME_3MONTH'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'MARKET_CAP-value'}):
for span in td.findAll('span', recursive=False):
other_details['MARKET_CAP'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'BETA_3Y-value'}):
for span in td.findAll('span', recursive=False):
other_details['BETA_3Y'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'PE_RATIO-value'}):
for span in td.findAll('span', recursive=False):
other_details['PE_RATIO'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'EPS_RATIO-value'}):
for span in td.findAll('span', recursive=False):
other_details['EPS_RATIO'] = span.text.strip()
for td in soup.findAll('td', attrs={'data-test': 'EARNINGS_DATE-value'}):
other_details['EARNINGS_DATE'] = []
for span in td.findAll('span', recursive=False):
other_details['EARNINGS_DATE'].append(span.text.strip())
for td in soup.findAll('td',attrs={'data-test': 'DIVIDEND_AND_YIELD-value'}):
other_details['DIVIDEND_AND_YIELD'] = td.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'EX_DIVIDEND_DATE-value'}):
for span in td.findAll('span', recursive=False):
other_details['EX_DIVIDEND_DATE'] = span.text.strip()
for td in soup.findAll('td',attrs={'data-test': 'ONE_YEAR_TARGET_PRICE-value' }):
for span in td.findAll('span', recursive=False):
other_details['ONE_YEAR_TARGET_PRICE'] = span.text.strip()
other_details['DATE'] = str(datetime.datetime.now())
company_json['OTHER_DETAILS'] = other_details
with open('dax30_kpis.json', 'a') as outfile:
json.dump(company_json, outfile, indent=4)
print company_json
### Code after ###