requests 和 BeautifulSoup <tables>
requests and BeautifulSoup <tables>
我正试图从网页中提取一个数据表(table)。怎样才能让这段代码只提取 table 中的某一个值?
I'm trying to pull just the value 0.83. How could I do that?
import requests
from bs4 import BeautifulSoup
# URL of the Receita Federal "taxa de juros Selic" page that is scraped below.
url = 'https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/pagamentos-e-parcelamentos/taxa-de-juros-selic#Taxa_de_Juros_Selic'
# Send a browser-like User-Agent header with the request.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"}
page = requests.get(url, headers=headers)
#print(page.content)
#span class = DFlfde SwHCTb
soup = BeautifulSoup(page.content, "html.parser")
# First <table class="listing"> in the document (not used further below).
valor_taxa = soup.find_all("table", class_="listing")[0]
# Every <tr class="odd"> in the whole document (not used further below).
valor_tr = soup.find_all("tr", class_="odd")
# NOTE: select() takes a CSS selector string. The printed result contains
# every <td> on the page, including cells without align="CENTER", so the
# class_ keyword is not narrowing the match here.
valor_especifico = soup.select('td', class_={'align': 'CENTER'})
print(valor_especifico)
输出为:
C:\Users\Francisco\PycharmProjects\INSS\Scripts\python.exe C:/Users/Francisco/PycharmProjects/INSS/web.py
[<td>
<ul>
<li></li>
</ul>
</td>, <td> <strong><a class="anchor-link" href="#Taxa_de_Juros_Selic" target="_self" title="">Taxa de Juros Selic</a></strong></td>, <td>
<ul>
<li></li>
</ul>
</td>, <td><strong><a class="anchor-link" href="#Selicacumulada" target="_self" title=""> </a><a class="anchor-link" href="#Selicmensalmente" target="_self" title="">Taxa de Juros Selic Acumulada Mensalmente</a></strong></td>, <td>
<ul>
<li></li>
</ul>
</td>, <td> <a class="anchor-link" href="#Taxa" target="_self" title=""><strong>Taxa de Juros Selic Incidente sobre as Quotas do Imposto de Renda Pessoa Físic</strong>a</a></td>, <td align="LEFT"><b>Mês/Ano</b></td>, <td align="CENTER"><b>2013</b></td>, <td align="CENTER"><b>2014</b></td>, <td align="CENTER"><b>2015</b></td>, <td align="CENTER"><b>2016</b></td>, <td align="CENTER"><b>2017</b></td>, <td align="CENTER"><b>2018</b></td>, <td align="CENTER"><b>2019</b></td>, <td align="CENTER"><b>2020</b></td>, <td align="CENTER"><b>2021</b></td>, <td align="CENTER"><b>2022</b></td>, <td align="LEFT"><b>Janeiro</b></td>, <td align="CENTER">0,60%</td>, <td align="CENTER">0,85%</td>, <td align="CENTER">0,94%</td>, <td align="CENTER">1,06%</td>, <td align="CENTER">1,09%</td>, <td align="CENTER">0,58%</td>, <td align="CENTER">0,54%</td>, <td align="CENTER">0,38%</td>, <td align="CENTER">0,15%</td>, <td align="CENTER">0,73%</td>, <td align="LEFT"><b>Fevereiro</b></td>, <td align="CENTER">0,49%</td>, <td align="CENTER">0,79%</td>, <td align="CENTER">0,82%</td>, <td align="CENTER">1,00%</td>, <td align="CENTER">0,87%</td>, <td align="CENTER">0,47%</td>, <td align="CENTER">0,49%</td>, <td align="CENTER">0,29%</td>, <td align="CENTER">0,13%</td>, <td align="CENTER">0,76%</td>, <td align="LEFT"><b>Março</b></td>, <td align="CENTER">0,55%</td>, <td align="CENTER">0,77%</td>, <td align="CENTER">1,04%</td>, <td align="CENTER">1,16%</td>, <td align="CENTER">1,05%</td>, <td align="CENTER">0,53%</td>, <td align="CENTER">0,47%</td>, <td align="CENTER">0,34%</td>, <td align="CENTER">0,20%</td>, <td align="CENTER">0,93%</td>, <td align="LEFT"><b>Abril</b></td>, <td align="CENTER">0,61%</td>, <td align="CENTER">0,82%</td>, <td align="CENTER">0,95%</td>, <td align="CENTER">1,06%</td>, <td align="CENTER">0,79%</td>, <td align="CENTER">0,52%</td>, <td align="CENTER">0,52%</td>, <td align="CENTER">0,28%</td>, <td align="CENTER">0,21%</td>, <td align="CENTER">0,83%</td>, <td 
align="LEFT"><b>Maio</b></td>, <td align="CENTER">0,60%</td>, <td align="CENTER">0,87%</td>, <td align="CENTER">0,99%</td>, <td align="CENTER">1,11%</td>, <td align="CENTER">0,93%</td>, <td align="CENTER">0,52%</td>, <td align="CENTER">0,54%</td>, <td align="CENTER">0,24%</td>, <td align="CENTER">0,27%</td>, <td align="CENTER"></td>, <td align="LEFT"><b>Junho</b></td>, <td align="CENTER">0,61%</td>, <td align="CENTER">0,82%</td>, <td align="CENTER">1,07%</td>, <td align="CENTER">1,16%</td>, <td align="CENTER">0,81%</td>, <td align="CENTER">0,52%</td>, <td align="CENTER">0,47%</td>, <td align="CENTER">0,21%</td>, <td align="CENTER">0,31%</td>, <td align="CENTER"></td>, <td align="LEFT"><b>Julho</b></td>, <td align="CENTER">0,72%</td>, <td align="CENTER">0,95%</td>, <td align="CENTER">1,18%</td>, <td align="CENTER">1,11%</td>, <td align="CENTER">0,80%</td>, <td align="CENTER">0,54%</td>
Process finished with exit code 0
试试 selenium 包:
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# URL of the Receita Federal "taxa de juros Selic" page.
url = 'https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/pagamentos-e-parcelamentos/taxa-de-juros-selic#Taxa_de_Juros_Selic'
# NOTE(review): newer Selenium 4 releases expect a Service object instead of a
# driver path as the first argument -- confirm against the installed version.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(url)
    # XPath of the wanted cell: the last cell after the first "Abril" label.
    # Assumes the month label and its values are sibling <td> cells of the
    # same row, as the pasted output suggests -- adjust to the real page
    # structure if that does not hold.
    elementsxpath = "(//td[b[normalize-space()='Abril']])[1]/following-sibling::td[last()]"
    print(driver.find_element(By.XPATH, elementsxpath).text)
finally:
    # Always close the browser, even if the element lookup fails.
    driver.quit()
有更简洁的方法可以做到这一点,但分解成单独的步骤可能会更清楚。
在 URL 上执行 GET 并检查 HTTP 状态。
根据响应文本构建 'soup'。
迭代每个 table、tr 和 td,最终打印各个最内层 td 的文本。
import requests
from bs4 import BeautifulSoup as BS
# Fetch the page and raise immediately on an HTTP error status.
(r := requests.get('https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/pagamentos-e-parcelamentos/taxa-de-juros-selic#Taxa_de_Juros_Selic')).raise_for_status()
# Build the soup from the response text using the lxml parser.
soup = BS(r.text, 'lxml')
# Walk table -> row -> cell and print the text of every matching cell.
for table in soup.find_all('table', {'class': 'listing'}):
    for tr in table.find_all('tr', {'class': 'odd'}):
        for td in tr.find_all('td', {'align': 'CENTER'}):
            print(td.text)
我正试图从网页中提取一个数据表(table)。怎样才能让这段代码只提取 table 中的某一个值?
I'm trying to pull just the value 0.83. How could I do that?
import requests
from bs4 import BeautifulSoup
# URL of the Receita Federal "taxa de juros Selic" page that is scraped below.
url = 'https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/pagamentos-e-parcelamentos/taxa-de-juros-selic#Taxa_de_Juros_Selic'
# Send a browser-like User-Agent header with the request.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"}
page = requests.get(url, headers=headers)
#print(page.content)
#span class = DFlfde SwHCTb
soup = BeautifulSoup(page.content, "html.parser")
# First <table class="listing"> in the document (not used further below).
valor_taxa = soup.find_all("table", class_="listing")[0]
# Every <tr class="odd"> in the whole document (not used further below).
valor_tr = soup.find_all("tr", class_="odd")
# NOTE: select() takes a CSS selector string. The printed result contains
# every <td> on the page, including cells without align="CENTER", so the
# class_ keyword is not narrowing the match here.
valor_especifico = soup.select('td', class_={'align': 'CENTER'})
print(valor_especifico)
输出为:
C:\Users\Francisco\PycharmProjects\INSS\Scripts\python.exe C:/Users/Francisco/PycharmProjects/INSS/web.py
[<td>
<ul>
<li></li>
</ul>
</td>, <td> <strong><a class="anchor-link" href="#Taxa_de_Juros_Selic" target="_self" title="">Taxa de Juros Selic</a></strong></td>, <td>
<ul>
<li></li>
</ul>
</td>, <td><strong><a class="anchor-link" href="#Selicacumulada" target="_self" title=""> </a><a class="anchor-link" href="#Selicmensalmente" target="_self" title="">Taxa de Juros Selic Acumulada Mensalmente</a></strong></td>, <td>
<ul>
<li></li>
</ul>
</td>, <td> <a class="anchor-link" href="#Taxa" target="_self" title=""><strong>Taxa de Juros Selic Incidente sobre as Quotas do Imposto de Renda Pessoa Físic</strong>a</a></td>, <td align="LEFT"><b>Mês/Ano</b></td>, <td align="CENTER"><b>2013</b></td>, <td align="CENTER"><b>2014</b></td>, <td align="CENTER"><b>2015</b></td>, <td align="CENTER"><b>2016</b></td>, <td align="CENTER"><b>2017</b></td>, <td align="CENTER"><b>2018</b></td>, <td align="CENTER"><b>2019</b></td>, <td align="CENTER"><b>2020</b></td>, <td align="CENTER"><b>2021</b></td>, <td align="CENTER"><b>2022</b></td>, <td align="LEFT"><b>Janeiro</b></td>, <td align="CENTER">0,60%</td>, <td align="CENTER">0,85%</td>, <td align="CENTER">0,94%</td>, <td align="CENTER">1,06%</td>, <td align="CENTER">1,09%</td>, <td align="CENTER">0,58%</td>, <td align="CENTER">0,54%</td>, <td align="CENTER">0,38%</td>, <td align="CENTER">0,15%</td>, <td align="CENTER">0,73%</td>, <td align="LEFT"><b>Fevereiro</b></td>, <td align="CENTER">0,49%</td>, <td align="CENTER">0,79%</td>, <td align="CENTER">0,82%</td>, <td align="CENTER">1,00%</td>, <td align="CENTER">0,87%</td>, <td align="CENTER">0,47%</td>, <td align="CENTER">0,49%</td>, <td align="CENTER">0,29%</td>, <td align="CENTER">0,13%</td>, <td align="CENTER">0,76%</td>, <td align="LEFT"><b>Março</b></td>, <td align="CENTER">0,55%</td>, <td align="CENTER">0,77%</td>, <td align="CENTER">1,04%</td>, <td align="CENTER">1,16%</td>, <td align="CENTER">1,05%</td>, <td align="CENTER">0,53%</td>, <td align="CENTER">0,47%</td>, <td align="CENTER">0,34%</td>, <td align="CENTER">0,20%</td>, <td align="CENTER">0,93%</td>, <td align="LEFT"><b>Abril</b></td>, <td align="CENTER">0,61%</td>, <td align="CENTER">0,82%</td>, <td align="CENTER">0,95%</td>, <td align="CENTER">1,06%</td>, <td align="CENTER">0,79%</td>, <td align="CENTER">0,52%</td>, <td align="CENTER">0,52%</td>, <td align="CENTER">0,28%</td>, <td align="CENTER">0,21%</td>, <td align="CENTER">0,83%</td>, <td 
align="LEFT"><b>Maio</b></td>, <td align="CENTER">0,60%</td>, <td align="CENTER">0,87%</td>, <td align="CENTER">0,99%</td>, <td align="CENTER">1,11%</td>, <td align="CENTER">0,93%</td>, <td align="CENTER">0,52%</td>, <td align="CENTER">0,54%</td>, <td align="CENTER">0,24%</td>, <td align="CENTER">0,27%</td>, <td align="CENTER"></td>, <td align="LEFT"><b>Junho</b></td>, <td align="CENTER">0,61%</td>, <td align="CENTER">0,82%</td>, <td align="CENTER">1,07%</td>, <td align="CENTER">1,16%</td>, <td align="CENTER">0,81%</td>, <td align="CENTER">0,52%</td>, <td align="CENTER">0,47%</td>, <td align="CENTER">0,21%</td>, <td align="CENTER">0,31%</td>, <td align="CENTER"></td>, <td align="LEFT"><b>Julho</b></td>, <td align="CENTER">0,72%</td>, <td align="CENTER">0,95%</td>, <td align="CENTER">1,18%</td>, <td align="CENTER">1,11%</td>, <td align="CENTER">0,80%</td>, <td align="CENTER">0,54%</td>
Process finished with exit code 0
试试 selenium 包:
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# URL of the Receita Federal "taxa de juros Selic" page.
url = 'https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/pagamentos-e-parcelamentos/taxa-de-juros-selic#Taxa_de_Juros_Selic'
# NOTE(review): newer Selenium 4 releases expect a Service object instead of a
# driver path as the first argument -- confirm against the installed version.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(url)
    # XPath of the wanted cell: the last cell after the first "Abril" label.
    # Assumes the month label and its values are sibling <td> cells of the
    # same row, as the pasted output suggests -- adjust to the real page
    # structure if that does not hold.
    elementsxpath = "(//td[b[normalize-space()='Abril']])[1]/following-sibling::td[last()]"
    print(driver.find_element(By.XPATH, elementsxpath).text)
finally:
    # Always close the browser, even if the element lookup fails.
    driver.quit()
有更简洁的方法可以做到这一点,但分解成单独的步骤可能会更清楚。
在 URL 上执行 GET 并检查 HTTP 状态。
根据响应文本构建 'soup'。
迭代每个 table、tr 和 td,最终打印各个最内层 td 的文本。
import requests
from bs4 import BeautifulSoup as BS
# Fetch the page and raise immediately on an HTTP error status.
(r := requests.get('https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/pagamentos-e-parcelamentos/taxa-de-juros-selic#Taxa_de_Juros_Selic')).raise_for_status()
# Build the soup from the response text using the lxml parser.
soup = BS(r.text, 'lxml')
# Walk table -> row -> cell and print the text of every matching cell.
for table in soup.find_all('table', {'class': 'listing'}):
    for tr in table.find_all('tr', {'class': 'odd'}):
        for td in tr.find_all('td', {'align': 'CENTER'}):
            print(td.text)