Scraping Yahoo Finance after recent change Oct 2019
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd

symbol = 'AAPL'
url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol
page = requests.get(url)
tree = html.fromstring(page.content)

# Collect the column headers (the statement dates)
tableHeaders = tree.xpath('//*[@class="D(tbhg)"]//span')
Headers = []
for Header in tableHeaders:
    Headers.append(Header.text)
df = pd.DataFrame(columns=Headers, index=[1])

# item1 must be defined before it is interpolated into the XPath
item1 = 'Long Term Debt'
Xpath1 = "//span[contains(.,'" + item1 + "')]/parent::div/parent::div/following-sibling::div"
row1 = []
row1.append(item1)
rowvalues1 = tree.xpath(Xpath1)
for value1 in rowvalues1:
    row1.append(value1.text)

# If a value is None, fetch the next-level node value using /span
Xpath1 = Xpath1 + "/span"
Childvalues1 = tree.xpath(Xpath1)
j = 0
for i in range(len(row1)):
    if row1[i] == None:
        row1[i] = Childvalues1[j].text
        j = j + 1

df.loc[1] = row1
df = df.fillna(0)
df[df == '-'] = '0'
long_term_debt = float(str(df.iloc[0, 4]).replace(',', ''))
I get the following error when I run it for the AAPL symbol:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-63-fe5e79eabd51> in <module>
57 df[df=='-'] ='0'
58
---> 59 long_term_debt=float(str(df.iloc[0,4]).replace(',',''))
60
...
2007 l = len(ax)
2008 if key >= l or key < -l:
-> 2009 raise IndexError("single positional indexer is out-of-bounds")
2010
2011 def _getitem_tuple(self, tup):
IndexError: single positional indexer is out-of-bounds
I am not able to extract the value from the df, because AAPL does not have a 2015 column: Yahoo Finance does not show that year for this company, while it still shows 2015 financials for other companies.
What can I do to get rid of this error for that particular year and ticker? I tried using 'None' in a function for the long-term-debt variable, but it does not work. Any idea how to handle this situation?
The function I tried:
def debt():
    if df.iloc[0,4] == None:
        return 0
    else:
        float(str(df.iloc[0,4]).replace(',',''))
However, it does not work.
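Two separate issues stand out here: the else branch never returns its value, and comparing the cell to None cannot catch an IndexError, which is raised inside .iloc before any value comes back. A minimal sketch that guards on the frame's shape instead (long_term_debt_for is a hypothetical helper name; df is the frame built above):

def long_term_debt_for(df, col=4):
    # The cell at column index 4 only exists if the table has at least 5 columns,
    # which is not the case when Yahoo omits a year (e.g. 2015 for AAPL)
    if df.shape[1] <= col:
        return 0.0
    return float(str(df.iloc[0, col]).replace(',', ''))

long_term_debt = long_term_debt_for(df)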
You have to install the YahooFinancials library into your Python.
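For example (a minimal sketch, assuming the yahoofinancials package installed with pip install yahoofinancials and its get_financial_stmts method; the exact shape of the returned dict may differ between versions):

# Sketch: pull the annual balance sheet through the yahoofinancials package
from yahoofinancials import YahooFinancials

yf = YahooFinancials('AAPL')
balance = yf.get_financial_stmts('annual', 'balance')  # nested dicts keyed by statement date
print(balance)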
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd

symbol = 'ORCL'
url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol
page = requests.get(url)
tree = html.fromstring(page.content)

tableHeaders = tree.xpath('//*[@class="D(tbhg)"]//span')
Headers = []
for Header in tableHeaders:
    Headers.append(Header.text)

# Adding columns
df = pd.DataFrame(columns=Headers, index=[1])

item5 = 'Inventory'
row5 = []
row5.append(item5)
Xpath = "//span[contains(.,'" + item5 + "')]/parent::div/parent::div/following-sibling::div"
rowvalues5 = tree.xpath(Xpath)  # identify all 4 high level nodes
# This stores the high level node values, but stores None if a value is not present
for value5 in rowvalues5:
    row5.append(value5.text)

# If the value is None, fetch the next-level node values using /span
Xpath = Xpath + "/span"
Childvalues = tree.xpath(Xpath)  # fetch low level nodes
j = 0
for i in range(len(row5)):
    if row5[i] == None:
        row5[i] = Childvalues[j].text
        j = j + 1

df.loc[1] = row5
print(df)
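The same pattern can be wrapped in a small helper so that any line item can be pulled without repeating the XPath logic. A sketch under the assumption that the page layout targeted above is unchanged (get_row is a hypothetical name):

def get_row(tree, item):
    # Returns [item, value1, value2, ...] for one balance-sheet line item
    xpath = "//span[contains(.,'" + item + "')]/parent::div/parent::div/following-sibling::div"
    row = [item]
    for node in tree.xpath(xpath):
        row.append(node.text)
    # Fall back to the child <span> text for cells whose div had no direct text
    children = tree.xpath(xpath + "/span")
    j = 0
    for i in range(len(row)):
        if row[i] is None:
            row[i] = children[j].text
            j += 1
    return row

print(get_row(tree, 'Total Assets'))  # 'Total Assets' is just an example item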
Guys, this is related to the latest change on Yahoo Finance.
# Same imports as above; tickers is assumed to be a list of symbols, e.g. tickers = ['AAPL']
import requests
import numpy as np
import pandas as pd
from lxml import html

def get_page(url):
    # Yahoo now pays attention to the request headers, so send browser-like ones
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'max-age=0',
        'Pragma': 'no-cache',
        'Referrer': 'https://google.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
    }
    return requests.get(url, headers=headers)

def parse_rows(table_rows):
    parsed_rows = []
    for table_row in table_rows:
        parsed_row = []
        el = table_row.xpath("./div")
        none_count = 0
        for rs in el:
            try:
                (text,) = rs.xpath('.//span/text()[1]')
                parsed_row.append(text)
            except ValueError:
                parsed_row.append(np.NaN)
                none_count += 1
        if none_count < 4:
            parsed_rows.append(parsed_row)
    return pd.DataFrame(parsed_rows)

def scrape_table(url):
    page = get_page(url)
    tree = html.fromstring(page.content)
    table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
    assert len(table_rows) > 0
    df = parse_rows(table_rows)
    return df

# Balance sheet
bs = scrape_table('https://finance.yahoo.com/quote/' + tickers[0] + '/balance-sheet?p=' + tickers[0])
bs[1] = bs[1].str.replace(',', '')
bs[2] = bs[2].str.replace(',', '')
bs[3] = bs[3].str.replace(',', '')
bs[4] = bs[4].str.replace(',', '')
bs = bs.fillna(0)
bs[bs == '-'] = '0'

# Income statement
ic = scrape_table('https://finance.yahoo.com/quote/' + tickers[0] + '/financials?p=' + tickers[0])
ic[1] = ic[1].str.replace(',', '')
ic[2] = ic[2].str.replace(',', '')
ic[3] = ic[3].str.replace(',', '')
ic[4] = ic[4].str.replace(',', '')
ic[5] = ic[5].str.replace(',', '')
ic = ic.fillna(0)
ic[ic == '-'] = '0'

# Cash flow statement
cf = scrape_table('https://finance.yahoo.com/quote/' + tickers[0] + '/cash-flow?p=' + tickers[0])
cf[1] = cf[1].str.replace(',', '')
cf[2] = cf[2].str.replace(',', '')
cf[3] = cf[3].str.replace(',', '')
cf[4] = cf[4].str.replace(',', '')
cf[5] = cf[5].str.replace(',', '')
cf = cf.fillna(0)
cf[cf == '-'] = '0'
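The repeated column clean-up can also be factored into one helper applied to each frame (a sketch; clean_numeric_columns is a hypothetical name and it assumes the value columns still hold the scraped strings):

def clean_numeric_columns(frame):
    # Strip thousands separators in the value columns, then replace NaN and '-' with zero,
    # mirroring what the per-column lines above do
    for col in frame.columns[1:]:
        frame[col] = frame[col].str.replace(',', '')
    frame = frame.fillna(0)
    frame[frame == '-'] = '0'
    return frame

bs, ic, cf = (clean_numeric_columns(f) for f in (bs, ic, cf))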
If you want to get the 2019 revenue value:
revenue2019=float(ic.iloc[1,1])
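Since ic is purely positional (row 0 holds the dates and column 0 the row labels), a slightly more robust lookup finds the row by its label instead of hard-coding row 1. A sketch, assuming 'Total Revenue' is still the label Yahoo uses:

# Look the row up by its label in column 0 instead of relying on row position
revenue_row = ic[ic[0] == 'Total Revenue']
if not revenue_row.empty:
    revenue2019 = float(revenue_row.iloc[0, 1])
else:
    revenue2019 = 0.0  # label not found; the page layout may have changed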