soup.find 无法在雅虎财经上找到文字
soup.find unable to find text on yahoo finance
我收到以下错误:
Invalid figure 'Total Cash Flow From Operating Activities' passed.
从这个url:
“这变得有点困难,因为‘净收入’包含在 strong 标签中”
有人可以向我解释为什么此代码无法获取 "Total Cash Flow From Operating Activities"(经营活动的总现金流量),以及我如何确定某个内容是否位于 strong 标签中吗?
代码:
import re, requests
from bs4 import BeautifulSoup
import sys
"""
import os # file system operations
import re # regular expressions
import pandas as pd # pandas... the best time series library out there
import datetime as dt # date and time functions
import io
"""
# search with regular expressions
# "CrumbStore":\{"crumb":"(?<crumb>[^"]+)"\}
def get_crumb():
    """Download the Yahoo Finance AAPL history page and return its crumb token.

    The crumb is read from the ``"CrumbStore":{"crumb":"..."}`` fragment
    embedded in the page HTML. The response's 'B' cookie and the crumb are
    both printed as a debugging aid.

    Returns:
        The crumb string taken from the last page line that contains a
        CrumbStore entry.

    Raises:
        KeyError: if the response carries no cookie named 'B'.
        RuntimeError: if no line of the page contains a CrumbStore entry.
    """
    url = 'https://uk.finance.yahoo.com/quote/AAPL/history'  # page for a ticker symbol, with a download link
    r = requests.get(url)  # download page
    txt = r.text  # extract html
    cookie = r.cookies['B']  # the cookie we're looking for is named 'B'
    print('Cookie: ', cookie)

    # The string we need looks like this: "CrumbStore":{"crumb":"lQHxbbYOBCq"}
    # Raw string literal: "\{" in a normal string is an invalid escape sequence.
    pattern = re.compile(r'.*"CrumbStore":\{"crumb":"(?P<crumb>[^"]+)"\}')

    crumb = None
    for line in txt.splitlines():
        m = pattern.match(line)
        if m is not None:
            crumb = m.group('crumb')

    # Fail with a clear message instead of an UnboundLocalError further down.
    if crumb is None:
        raise RuntimeError('No "CrumbStore" crumb found in the page at ' + url)

    print('Crumb=', crumb)
    return crumb
def periodic_figure_values(soup, yahoo_figure):
    """Return the numeric values found in the table row labelled *yahoo_figure*.

    The label is looked up first as the text of a ``<strong>`` element and,
    failing that, as the text of a ``<td>`` element; *yahoo_figure* is
    compiled as a regular expression for that lookup. If neither is found the
    program exits through ``sys.exit`` with an "Invalid figure" message.

    Every ``<td>`` of the row except the first is then converted to an int:
    commas are dropped, ``(123)`` becomes ``-123``, a lone ``-`` counts as 0,
    and the result is multiplied by 1000 (presumably because the page reports
    thousands -- confirm against the source page).
    """
    figure_re = re.compile(yahoo_figure)

    label = soup.find("strong", text=figure_re)  # figures printed in bold
    if label:
        # The row is two levels above the <strong> element.
        row = label.parent.parent
    else:
        label = soup.find("td", text=figure_re)  # any other available figure
        if not label:
            sys.exit("Invalid figure '" + yahoo_figure + "' passed.")
        row = label.parent

    values = []
    for cell in row.find_all("td")[1:]:  # first <td> holds the figure name
        text = cell.text.strip()
        if text == yahoo_figure:
            # Indented figures repeat their name in a later cell; skip it.
            continue
        number = text.replace(",", "").replace("(", "-").replace(")", "")
        values.append(0 if number == "-" else int(number) * 1000)
    return values
def financials_soup(ticker_symbol, statement, quarterly=False):
    """Download a Yahoo Finance statement page and return it parsed.

    *statement* must be one of the codes "is", "bs" or "cf"; any other value
    ends the program through ``sys.exit`` with an "Invalid financial
    statement code" message. The URL carries the crumb from ``get_crumb()``
    and gets "&annual" appended unless *quarterly* is true.

    Returns a ``BeautifulSoup`` object built with the "html.parser" parser.
    """
    if statement not in ("is", "bs", "cf"):
        return sys.exit("Invalid financial statement code '" + statement + "' passed.")

    crumb = get_crumb()
    url = "https://finance.yahoo.com/q/" + statement + "?s=" + ticker_symbol + "&crumb=" + crumb
    period_suffix = "" if quarterly else "&annual"
    page = requests.get(url + period_suffix)
    return BeautifulSoup(page.text, "html.parser")
# Example run: fetch the "cf" statement page for ticker "AAPL" and print the
# values found for the "Total Cash Flow From Operating Activities" row.
print(periodic_figure_values(financials_soup("AAPL", "cf"), "Total Cash Flow From Operating Activities"))
编辑:
我可以通过将 financials_soup 函数更改为以下内容来获得结果:
def financials_soup(ticker_symbol, statement, quarterly=False):
    """Download a Yahoo Finance quote statement page and return it parsed.

    *statement* must be one of "financials", "balance-sheet" or "cash-flow";
    any other value ends the program through ``sys.exit`` with an "Invalid
    financial statement code" message. The URL carries the crumb from
    ``get_crumb()`` and gets "&annual" appended unless *quarterly* is true.

    Returns a ``BeautifulSoup`` object built with the "html.parser" parser.
    """
    if statement not in ("financials", "balance-sheet", "cash-flow"):
        return sys.exit("Invalid financial statement code '" + statement + "' passed.")

    crumb = get_crumb()
    url = "https://finance.yahoo.com/quote/" + ticker_symbol + "/" + statement + "?p=" + ticker_symbol + "&crumb=" + crumb
    period_suffix = "" if quarterly else "&annual"
    page = requests.get(url + period_suffix)
    return BeautifulSoup(page.text, "html.parser")
如果您检查由您构建的链接所返回的源代码(第 69 行):
url = "https://finance.yahoo.com/q/" + statement + "?s=" + ticker_symbol + "&crumb=" + crumb
您会看到它没有关于 Total Cash Flow From Operating Activities
的任何有用信息。如果你把它改成
url = "https://finance.yahoo.com/quote/" + ticker_symbol + "/cash-flow?p=" + ticker_symbol
它将返回您正在寻找的信息:
[65824000000, 81266000000, 59713000000]
我收到以下错误:
Invalid figure 'Total Cash Flow From Operating Activities' passed.
从这个url:
“这变得有点困难,因为‘净收入’包含在 strong 标签中”
有人可以向我解释为什么此代码无法获取 "Total Cash Flow From Operating Activities"(经营活动的总现金流量),以及我如何确定某个内容是否位于 strong 标签中吗?
代码:
import re, requests
from bs4 import BeautifulSoup
import sys
"""
import os # file system operations
import re # regular expressions
import pandas as pd # pandas... the best time series library out there
import datetime as dt # date and time functions
import io
"""
# search with regular expressions
# "CrumbStore":\{"crumb":"(?<crumb>[^"]+)"\}
def get_crumb():
    """Download the Yahoo Finance AAPL history page and return its crumb token.

    The crumb is read from the ``"CrumbStore":{"crumb":"..."}`` fragment
    embedded in the page HTML. The response's 'B' cookie and the crumb are
    both printed as a debugging aid.

    Returns:
        The crumb string taken from the last page line that contains a
        CrumbStore entry.

    Raises:
        KeyError: if the response carries no cookie named 'B'.
        RuntimeError: if no line of the page contains a CrumbStore entry.
    """
    url = 'https://uk.finance.yahoo.com/quote/AAPL/history'  # page for a ticker symbol, with a download link
    r = requests.get(url)  # download page
    txt = r.text  # extract html
    cookie = r.cookies['B']  # the cookie we're looking for is named 'B'
    print('Cookie: ', cookie)

    # The string we need looks like this: "CrumbStore":{"crumb":"lQHxbbYOBCq"}
    # Raw string literal: "\{" in a normal string is an invalid escape sequence.
    pattern = re.compile(r'.*"CrumbStore":\{"crumb":"(?P<crumb>[^"]+)"\}')

    crumb = None
    for line in txt.splitlines():
        m = pattern.match(line)
        if m is not None:
            crumb = m.group('crumb')

    # Fail with a clear message instead of an UnboundLocalError further down.
    if crumb is None:
        raise RuntimeError('No "CrumbStore" crumb found in the page at ' + url)

    print('Crumb=', crumb)
    return crumb
def periodic_figure_values(soup, yahoo_figure):
    """Return the numeric values found in the table row labelled *yahoo_figure*.

    The label is looked up first as the text of a ``<strong>`` element and,
    failing that, as the text of a ``<td>`` element; *yahoo_figure* is
    compiled as a regular expression for that lookup. If neither is found the
    program exits through ``sys.exit`` with an "Invalid figure" message.

    Every ``<td>`` of the row except the first is then converted to an int:
    commas are dropped, ``(123)`` becomes ``-123``, a lone ``-`` counts as 0,
    and the result is multiplied by 1000 (presumably because the page reports
    thousands -- confirm against the source page).
    """
    figure_re = re.compile(yahoo_figure)

    label = soup.find("strong", text=figure_re)  # figures printed in bold
    if label:
        # The row is two levels above the <strong> element.
        row = label.parent.parent
    else:
        label = soup.find("td", text=figure_re)  # any other available figure
        if not label:
            sys.exit("Invalid figure '" + yahoo_figure + "' passed.")
        row = label.parent

    values = []
    for cell in row.find_all("td")[1:]:  # first <td> holds the figure name
        text = cell.text.strip()
        if text == yahoo_figure:
            # Indented figures repeat their name in a later cell; skip it.
            continue
        number = text.replace(",", "").replace("(", "-").replace(")", "")
        values.append(0 if number == "-" else int(number) * 1000)
    return values
def financials_soup(ticker_symbol, statement, quarterly=False):
    """Download a Yahoo Finance statement page and return it parsed.

    *statement* must be one of the codes "is", "bs" or "cf"; any other value
    ends the program through ``sys.exit`` with an "Invalid financial
    statement code" message. The URL carries the crumb from ``get_crumb()``
    and gets "&annual" appended unless *quarterly* is true.

    Returns a ``BeautifulSoup`` object built with the "html.parser" parser.
    """
    if statement not in ("is", "bs", "cf"):
        return sys.exit("Invalid financial statement code '" + statement + "' passed.")

    crumb = get_crumb()
    url = "https://finance.yahoo.com/q/" + statement + "?s=" + ticker_symbol + "&crumb=" + crumb
    period_suffix = "" if quarterly else "&annual"
    page = requests.get(url + period_suffix)
    return BeautifulSoup(page.text, "html.parser")
# Example run: fetch the "cf" statement page for ticker "AAPL" and print the
# values found for the "Total Cash Flow From Operating Activities" row.
print(periodic_figure_values(financials_soup("AAPL", "cf"), "Total Cash Flow From Operating Activities"))
编辑:
我可以通过将 financials_soup 函数更改为以下内容来获得结果:
def financials_soup(ticker_symbol, statement, quarterly=False):
    """Download a Yahoo Finance quote statement page and return it parsed.

    *statement* must be one of "financials", "balance-sheet" or "cash-flow";
    any other value ends the program through ``sys.exit`` with an "Invalid
    financial statement code" message. The URL carries the crumb from
    ``get_crumb()`` and gets "&annual" appended unless *quarterly* is true.

    Returns a ``BeautifulSoup`` object built with the "html.parser" parser.
    """
    if statement not in ("financials", "balance-sheet", "cash-flow"):
        return sys.exit("Invalid financial statement code '" + statement + "' passed.")

    crumb = get_crumb()
    url = "https://finance.yahoo.com/quote/" + ticker_symbol + "/" + statement + "?p=" + ticker_symbol + "&crumb=" + crumb
    period_suffix = "" if quarterly else "&annual"
    page = requests.get(url + period_suffix)
    return BeautifulSoup(page.text, "html.parser")
如果您检查由您构建的链接所返回的源代码(第 69 行):
url = "https://finance.yahoo.com/q/" + statement + "?s=" + ticker_symbol + "&crumb=" + crumb
您会看到它没有关于 Total Cash Flow From Operating Activities
的任何有用信息。如果你把它改成
url = "https://finance.yahoo.com/quote/" + ticker_symbol + "/cash-flow?p=" + ticker_symbol
它将返回您正在寻找的信息:
[65824000000, 81266000000, 59713000000]