在 Python 中抓取具有可点击内容的网站
Scraping a website with clickable content in Python
我想抓取以下网站的内容:
http://financials.morningstar.com/ratios/r.html?t=AMD
在 Key Ratios 下,我想单击 "Growth" 按钮,然后在 Python 中抓取数据。
我该怎么做?
可以用 requests + BeautifulSoup 来解决。页面会向 http://financials.morningstar.com/financials/getKeyStatPart.html 端点发送一个异步 GET 请求,您需要模拟这个请求。Growth 表格位于 id="tab-growth" 的 div 内:
from bs4 import BeautifulSoup
import requests
# Scrape the "Growth" table of Morningstar's Key Ratios page for AMD.
#
# `url` is the page a user opens in the browser; `keystat_url` is the endpoint
# this script queries directly to obtain the Key Ratios markup.
url = 'http://financials.morningstar.com/ratios/r.html?t=AMD'
keystat_url = 'http://financials.morningstar.com/financials/getKeyStatPart.html'

with requests.Session() as session:
    # update() adds the User-Agent on top of the session's default headers
    # instead of replacing the whole header mapping with a plain dict.
    session.headers.update({
        'User-Agent': (
            'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) '
            'AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        ),
    })

    # Visit the target url first, within the same session.
    # NOTE(review): presumably this lets the session pick up cookies the
    # endpoint expects -- confirm whether the request is actually required.
    session.get(url, timeout=30)

    params = {
        'callback': '',
        't': 'XNAS:AMD',
        'region': 'usa',
        'culture': 'en-US',
        'cur': '',
        'order': 'asc',
        # NOTE(review): looks like a millisecond-timestamp cache-buster
        # captured from a browser request -- confirm the value is arbitrary.
        '_': '1426047023943',
    }
    response = session.get(keystat_url, params=params, timeout=30)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()

    # The response is JSON; the HTML fragment is stored under 'componentData'.
    # Naming the parser keeps the result independent of which optional
    # parsers happen to be installed and avoids bs4's parser warning.
    soup = BeautifulSoup(response.json()['componentData'], 'html.parser')

    # Print the text of every row of the table inside div#tab-growth.
    for row in soup.select('div#tab-growth table tr'):
        print(row.text)
我想抓取以下网站的内容:
http://financials.morningstar.com/ratios/r.html?t=AMD
在 Key Ratios 下,我想单击 "Growth" 按钮,然后在 Python 中抓取数据。我该怎么做?
可以用 requests + BeautifulSoup 来解决。页面会向 http://financials.morningstar.com/financials/getKeyStatPart.html 端点发送一个异步 GET 请求,您需要模拟这个请求。Growth 表格位于 id="tab-growth" 的 div 内:
from bs4 import BeautifulSoup
import requests
# Scrape the "Growth" table of Morningstar's Key Ratios page for AMD.
#
# `url` is the page a user opens in the browser; `keystat_url` is the endpoint
# this script queries directly to obtain the Key Ratios markup.
url = 'http://financials.morningstar.com/ratios/r.html?t=AMD'
keystat_url = 'http://financials.morningstar.com/financials/getKeyStatPart.html'

with requests.Session() as session:
    # update() adds the User-Agent on top of the session's default headers
    # instead of replacing the whole header mapping with a plain dict.
    session.headers.update({
        'User-Agent': (
            'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) '
            'AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        ),
    })

    # Visit the target url first, within the same session.
    # NOTE(review): presumably this lets the session pick up cookies the
    # endpoint expects -- confirm whether the request is actually required.
    session.get(url, timeout=30)

    params = {
        'callback': '',
        't': 'XNAS:AMD',
        'region': 'usa',
        'culture': 'en-US',
        'cur': '',
        'order': 'asc',
        # NOTE(review): looks like a millisecond-timestamp cache-buster
        # captured from a browser request -- confirm the value is arbitrary.
        '_': '1426047023943',
    }
    response = session.get(keystat_url, params=params, timeout=30)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()

    # The response is JSON; the HTML fragment is stored under 'componentData'.
    # Naming the parser keeps the result independent of which optional
    # parsers happen to be installed and avoids bs4's parser warning.
    soup = BeautifulSoup(response.json()['componentData'], 'html.parser')

    # Print the text of every row of the table inside div#tab-growth.
    for row in soup.select('div#tab-growth table tr'):
        print(row.text)