Scraping Yahoo Finance after the recent change in October 2019

import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd

symbol = 'AAPL'

url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol



page = requests.get(url)

tree = html.fromstring(page.content)

tableHeaders = tree.xpath('//*[@class="D(tbhg)"]//span') 

Headers = []
for Header in tableHeaders:
    Headers.append(Header.text)

df = pd.DataFrame()

df = pd.DataFrame(columns=Headers,index=[1])


Xpath1 = "//span[contains(.,'"+item1+"')]/parent::div/parent::div/following-sibling::div"


item1 = 'Long Term Debt'
row1 = []
row1.append(item1)



rowvalues1 = tree.xpath(Xpath1)  # identify the high-level nodes for this row

# Store the high-level node values; None is stored when a value is not present
for value1 in rowvalues1:
    row1.append(value1.text)

# Where a value is None, fetch the next-level node values via /span
Xpath1 = Xpath1 + "/span"
Childvalues1 = tree.xpath(Xpath1)
j = 0
for i in range(len(row1)):
    if row1[i] is None:
        row1[i] = Childvalues1[j].text
        j = j + 1



df.loc[1] = row1



df=df.fillna(0)
df[df=='-'] ='0'

long_term_debt=float(str(df.iloc[0,4]).replace(',','')) 

I get the following error when I run it with the AAPL symbol:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-63-fe5e79eabd51> in <module>
     57 df[df=='-'] ='0'
     58 
---> 59 long_term_debt=float(str(df.iloc[0,4]).replace(',',''))
     60 
...
   2007         l = len(ax)
   2008         if key >= l or key < -l:
-> 2009             raise IndexError("single positional indexer is out-of-bounds")
   2010 
   2011     def _getitem_tuple(self, tup):

IndexError: single positional indexer is out-of-bounds

I can't extract the value from the df because AAPL doesn't have the year 2015. Yahoo Finance doesn't show that year for this company, but other companies' financials do include 2015.

What can I do to get rid of the error for this particular year and ticker? I tried using 'None' for the long-term-debt variable in a function, but it doesn't work. Any idea how to handle this case?

The function I tried:

def debt():
    if df.iloc[0,4]== None : return 0
    else: float(str(df.iloc[0,4]).replace(',',''))

However, it doesn't work.
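
One way around it (a minimal sketch, assuming the problem is simply that the 2015 column is absent from the scraped table): `df.iloc[0,4]` raises the IndexError before the `== None` comparison ever runs (and the else branch above is also missing a return), so guard the positional access itself, for example by checking the DataFrame's width:

def debt(df, col=4):
    # df.iloc raises IndexError before any '== None' check can run,
    # so test the width (or wrap in try/except) instead.
    if df.shape[1] <= col:
        return 0
    value = df.iloc[0, col]
    if value is None or str(value) == '-':
        return 0
    return float(str(value).replace(',', ''))

long_term_debt = debt(df)  # 0 when the 2015 column is missing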

You have to install the YahooFinancials library into your Python.
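
For reference, a minimal sketch of that route (assuming the yahoofinancials package and its get_financial_stmts method; the scraping code below does not depend on it):

# pip install yahoofinancials
from yahoofinancials import YahooFinancials

yahoo_financials = YahooFinancials('ORCL')
balance_sheet_data = yahoo_financials.get_financial_stmts('annual', 'balance')
print(balance_sheet_data)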

import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd

symbol = 'ORCL'

url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol



page = requests.get(url)

tree = html.fromstring(page.content)

tableHeaders = tree.xpath('//*[@class="D(tbhg)"]//span') 

Headers = []
for Header in tableHeaders:
    Headers.append(Header.text)

df = pd.DataFrame()
#Adding columns
df = pd.DataFrame(columns=Headers,index=[1])
item5 = 'Inventory'
row5 = []
row5.append(item5)

Xpath = "//span[contains(.,'"+item5+"')]/parent::div/parent::div/following-sibling::div"

rowvalues5 = tree.xpath(Xpath) # identify all 4 high level nodes
# This will store the high level node values, but store 'None' if value is not present.
for value5 in rowvalues5:
    row5.append(value5.text)
# If the value is None, fetch the next-level node values using /span
Xpath = Xpath+"/span"
Childvalues = tree.xpath(Xpath) #Fetch low level nodes
j=0
for i in range(len(row5)):
    if(row5[i]==None):
        row5[i] =Childvalues[j].text
        j=j+1


df.loc[1] = row5

print(df)
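
To get a number out of that row, the same cleaning the question applies works here as well (a sketch; the assumption is that column index 1 holds the most recent period):

df = df.fillna(0)
df[df == '-'] = '0'
inventory = float(str(df.iloc[0, 1]).replace(',', ''))  # column 1 assumed to be the latest period
print(inventory)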

Guys, this is with respect to the latest change in Yahoo Finance:

import requests
import numpy as np
import pandas as pd
from lxml import html

def get_page(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'max-age=0',
        'Pragma': 'no-cache',
        'Referrer': 'https://google.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
    }

    return requests.get(url, headers=headers)

def parse_rows(table_rows):
    parsed_rows = []

    for table_row in table_rows:
        parsed_row = []
        el = table_row.xpath("./div")

        none_count = 0

        for rs in el:
            try:
                (text,) = rs.xpath('.//span/text()[1]')
                parsed_row.append(text)
            except ValueError:
                parsed_row.append(np.NaN)
                none_count += 1

        if (none_count < 4):
            parsed_rows.append(parsed_row)

    return pd.DataFrame(parsed_rows)



def scrape_table(url):

    page = get_page(url)
    tree = html.fromstring(page.content)
    table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
    assert len(table_rows) > 0
    df = parse_rows(table_rows)
    return df



tickers = ['AAPL']  # example: the ticker list (AAPL is the symbol from the question)

bs = scrape_table('https://finance.yahoo.com/quote/' + tickers[0] + '/balance-sheet?p=' + tickers[0])
bs[1]=bs[1].str.replace(',', '')
bs[2]=bs[2].str.replace(',', '')
bs[3]=bs[3].str.replace(',', '')
bs[4]=bs[4].str.replace(',', '')
bs=bs.fillna(0)
bs[bs=='-'] ='0'

ic = scrape_table('https://finance.yahoo.com/quote/' + tickers[0] + '/financials?p=' + tickers[0])
ic[1]=ic[1].str.replace(',', '')
ic[2]=ic[2].str.replace(',', '')
ic[3]=ic[3].str.replace(',', '')
ic[4]=ic[4].str.replace(',', '')
ic[5]=ic[5].str.replace(',', '')
ic=ic.fillna(0)
ic[ic=='-'] ='0'

cf = scrape_table('https://finance.yahoo.com/quote/' + tickers[0] + '/cash-flow?p=' + tickers[0])
cf[1]=cf[1].str.replace(',', '')
cf[2]=cf[2].str.replace(',', '')
cf[3]=cf[3].str.replace(',', '')
cf[4]=cf[4].str.replace(',', '')
cf[5]=cf[5].str.replace(',', '')
cf=cf.fillna(0)
cf[cf=='-'] ='0'
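
The repeated replace/fillna lines above can equivalently be folded into a small helper (a sketch; behaviour is the same, and column 0 is assumed to hold the line-item names):

def clean_statement(df):
    # Strip thousands separators from every period column (column 0 holds
    # the line-item names), then zero out missing values and '-' cells.
    for col in df.columns[1:]:
        df[col] = df[col].str.replace(',', '')
    df = df.fillna(0)
    df[df == '-'] = '0'
    return df

cf = clean_statement(scrape_table('https://finance.yahoo.com/quote/' + tickers[0] + '/cash-flow?p=' + tickers[0]))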

If you want to get the 2019 revenue value:

revenue2019=float(ic.iloc[1,1])
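
Positional indexing like iloc[1,1] breaks whenever a row or a year is missing, which is exactly the original problem, so a more robust sketch (assuming column 0 holds line-item labels such as 'Total Revenue') is to look rows up by name:

ic_named = ic.set_index(0)  # column 0 assumed to hold the line-item names
if 'Total Revenue' in ic_named.index:
    revenue2019 = float(ic_named.loc['Total Revenue'].iloc[0])  # first period column
else:
    revenue2019 = 0.0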