使用 Beautiful Soup 从网站提取碳抵消项目,但一无所获
Extracting Carbon offset projects from website using beautiful soup and getting nothing
我正在尝试从此网站提取数据('https://alliedoffsets.com/#/profile/2')。网站上有很多这样的项目,我想获取估计平均批发价格和估计年减排量的值。当我用 Beautiful Soup 解析并打印结果时,找不到这些标签,得到的是空值。我知道这可能是个很基础的问题,但我卡住了。数据可能是网站用 JavaScript 填充的,但我想不出解决办法。
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Profile page whose statistics we are trying to read.
PROFILE_URL = 'https://alliedoffsets.com/#/profile/1'

# Download the markup exactly as the server returns it.
response = requests.get(PROFILE_URL)
page_bytes = response.content

# Parse the downloaded markup and look for the <thead class="sr-only"> element.
soup = BeautifulSoup(page_bytes, 'html.parser')
header = soup.find("thead", class_="sr-only")

# find() returns None when no such element exists in the parsed markup.
print(header)
该网页由 JavaScript 渲染,因此无法使用 BeautifulSoup 直接提取 HTML 元素。可以使用 Selenium 获取渲染后的 HTML,然后通过 ID、class、XPath 等查找元素。
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import re
url = 'https://alliedoffsets.com/#/profile/1'
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)
try:
    # web driver goes to page
    driver.get(url)
    # use WebDriverWait to block (up to 10 s) until the price panel is
    # present in the DOM
    # find Estimated Average Wholesale Price
    elt = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'direct-price-panel'))
    )
    # Extract just the price from the text. The "$" has to be escaped: an
    # unescaped "$" is the end-of-string anchor, so the pattern could never
    # match. The replacement must be the captured group, not an empty string.
    print(re.sub(r'.*(\$\S+).*', r'\1', elt.text))
    # find Estimated Annual Emission Reduction: select the element that has
    # a <strong> child containing the label, then keep the text after ":"
    elt = driver.find_element(By.XPATH, "//*[strong[contains(., 'Estimated Annual Emission Reduction')]]")
    print(elt.text.split(":")[1])
finally:
    # Always end the browser session, even if the wait above times out.
    driver.quit()
输出:
$5.06
11603 tCO2
您看到的数据是通过 JavaScript 从外部 URL 加载的。要使用 requests/json 模块加载数据,您可以使用此示例:
import json
import requests
url = "https://carbon-registry.herokuapp.com/1.0/provider/1"

# Names sent in the "embedded" query parameter; each one is mapped to 1.
embedded_names = (
    "provider_capital_types",
    "provider_capital_types.capital_type",
    "provider_countries",
    "provider_countries.country",
    "contacts",
    "contacts.office",
    "provider_currencies",
    "provider_currencies.currency",
    "provider_languages",
    "provider_languages.language",
    "offices",
    "offices.country",
    "provider_sectors",
    "provider_sectors.sector",
    "provider_social_medias",
    "provider_social_medias.social_media",
    "provider_provider_types",
    "provider_provider_types.provider_type",
    "provider_stats",
    "provider_stats.stat",
    "provider_descriptions",
    "provider_descriptions.description",
    "relationships",
    "relationships.description",
    "provider_statuses",
    "provider_statuses.status",
)
params = {
    # Compact separators produce the JSON text with no spaces, i.e.
    # {"provider_capital_types":1,...,"provider_statuses.status":1}
    "embedded": json.dumps(dict.fromkeys(embedded_names, 1), separators=(",", ":"))
}
# NOTE(review): hard-coded bearer token - presumably the one the site's own
# front end sends; it may expire or be rotated. Confirm before relying on it.
headers = {"Authorization": "Bearer 8hCH4MuPCa5t6ra8wtAz8xOQfJdjLvDVZk07ib60TZ"}

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() reports HTTP errors directly instead of letting them
# surface later as a KeyError on the missing "provider_stats" key.
response = requests.get(url, headers=headers, params=params, timeout=30)
response.raise_for_status()
data = response.json()
# uncomment to print all data:
# print(json.dumps(data, indent=4))

# Index the statistics by their display name for direct lookup.
stats = {s["stat"]["name"]: s for s in data["provider_stats"]}
print(f"{stats['Estimated Direct Price']['value']=}")
print(f"{stats['Estimated Annual Emission Reduction']['value']=}")
打印:
stats['Estimated Direct Price']['value']=5.0630778182036105
stats['Estimated Annual Emission Reduction']['value']=11603
该网站是动态的,因此可以参照下面的示例,结合使用 selenium 和 bs4 来获取正确的数据。
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.service import Service

url = 'https://alliedoffsets.com/#/profile/1'
# Selenium 4 takes the chromedriver path through a Service object; passing the
# path as the first positional argument is no longer accepted.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(url)
    driver.maximize_window()
    # Fixed pause so the page's JavaScript can fill in the data before the
    # HTML snapshot is taken.
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source,'lxml')
finally:
    # quit() ends the whole browser session (close() only closes the window),
    # and the finally clause guarantees it runs even if loading fails.
    driver.quit()
# The value is the second child node of each panel (index 1 of .contents);
# strip whitespace and drop the unit suffix.
Price = soup.select_one('p#direct-price-panel').contents[1].strip().replace('/tCO2e','')
Reduction= soup.select('.panel')[-1].contents[1].strip().replace('tCO2','')
print('Estimated Average Wholesale Price: '+ str(Price))
print('Estimated Annual Emission Reduction: ' + str(Reduction))
输出:
Estimated Average Wholesale Price: $5.06
Estimated Annual Emission Reduction: 11603
我正在尝试从此网站提取数据('https://alliedoffsets.com/#/profile/2')。网站上有很多这样的项目,我想获取估计平均批发价格和估计年减排量的值。当我用 Beautiful Soup 解析并打印结果时,找不到这些标签,得到的是空值。我知道这可能是个很基础的问题,但我卡住了。数据可能是网站用 JavaScript 填充的,但我想不出解决办法。
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Profile page whose statistics we are trying to read.
PROFILE_URL = 'https://alliedoffsets.com/#/profile/1'

# Download the markup exactly as the server returns it.
response = requests.get(PROFILE_URL)
page_bytes = response.content

# Parse the downloaded markup and look for the <thead class="sr-only"> element.
soup = BeautifulSoup(page_bytes, 'html.parser')
header = soup.find("thead", class_="sr-only")

# find() returns None when no such element exists in the parsed markup.
print(header)
该网页由 JavaScript 渲染,因此无法使用 BeautifulSoup 直接提取 HTML 元素。可以使用 Selenium 获取渲染后的 HTML,然后通过 ID、class、XPath 等查找元素。
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import re
url = 'https://alliedoffsets.com/#/profile/1'
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)
try:
    # web driver goes to page
    driver.get(url)
    # use WebDriverWait to block (up to 10 s) until the price panel is
    # present in the DOM
    # find Estimated Average Wholesale Price
    elt = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'direct-price-panel'))
    )
    # Extract just the price from the text. The "$" has to be escaped: an
    # unescaped "$" is the end-of-string anchor, so the pattern could never
    # match. The replacement must be the captured group, not an empty string.
    print(re.sub(r'.*(\$\S+).*', r'\1', elt.text))
    # find Estimated Annual Emission Reduction: select the element that has
    # a <strong> child containing the label, then keep the text after ":"
    elt = driver.find_element(By.XPATH, "//*[strong[contains(., 'Estimated Annual Emission Reduction')]]")
    print(elt.text.split(":")[1])
finally:
    # Always end the browser session, even if the wait above times out.
    driver.quit()
输出:
$5.06
11603 tCO2
您看到的数据是通过 JavaScript 从外部 URL 加载的。要使用 requests/json 模块加载数据,您可以使用此示例:
import json
import requests
url = "https://carbon-registry.herokuapp.com/1.0/provider/1"

# Names sent in the "embedded" query parameter; each one is mapped to 1.
embedded_names = (
    "provider_capital_types",
    "provider_capital_types.capital_type",
    "provider_countries",
    "provider_countries.country",
    "contacts",
    "contacts.office",
    "provider_currencies",
    "provider_currencies.currency",
    "provider_languages",
    "provider_languages.language",
    "offices",
    "offices.country",
    "provider_sectors",
    "provider_sectors.sector",
    "provider_social_medias",
    "provider_social_medias.social_media",
    "provider_provider_types",
    "provider_provider_types.provider_type",
    "provider_stats",
    "provider_stats.stat",
    "provider_descriptions",
    "provider_descriptions.description",
    "relationships",
    "relationships.description",
    "provider_statuses",
    "provider_statuses.status",
)
params = {
    # Compact separators produce the JSON text with no spaces, i.e.
    # {"provider_capital_types":1,...,"provider_statuses.status":1}
    "embedded": json.dumps(dict.fromkeys(embedded_names, 1), separators=(",", ":"))
}
# NOTE(review): hard-coded bearer token - presumably the one the site's own
# front end sends; it may expire or be rotated. Confirm before relying on it.
headers = {"Authorization": "Bearer 8hCH4MuPCa5t6ra8wtAz8xOQfJdjLvDVZk07ib60TZ"}

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() reports HTTP errors directly instead of letting them
# surface later as a KeyError on the missing "provider_stats" key.
response = requests.get(url, headers=headers, params=params, timeout=30)
response.raise_for_status()
data = response.json()
# uncomment to print all data:
# print(json.dumps(data, indent=4))

# Index the statistics by their display name for direct lookup.
stats = {s["stat"]["name"]: s for s in data["provider_stats"]}
print(f"{stats['Estimated Direct Price']['value']=}")
print(f"{stats['Estimated Annual Emission Reduction']['value']=}")
打印:
stats['Estimated Direct Price']['value']=5.0630778182036105
stats['Estimated Annual Emission Reduction']['value']=11603
该网站是动态的,因此可以参照下面的示例,结合使用 selenium 和 bs4 来获取正确的数据。
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.service import Service

url = 'https://alliedoffsets.com/#/profile/1'
# Selenium 4 takes the chromedriver path through a Service object; passing the
# path as the first positional argument is no longer accepted.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(url)
    driver.maximize_window()
    # Fixed pause so the page's JavaScript can fill in the data before the
    # HTML snapshot is taken.
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source,'lxml')
finally:
    # quit() ends the whole browser session (close() only closes the window),
    # and the finally clause guarantees it runs even if loading fails.
    driver.quit()
# The value is the second child node of each panel (index 1 of .contents);
# strip whitespace and drop the unit suffix.
Price = soup.select_one('p#direct-price-panel').contents[1].strip().replace('/tCO2e','')
Reduction= soup.select('.panel')[-1].contents[1].strip().replace('tCO2','')
print('Estimated Average Wholesale Price: '+ str(Price))
print('Estimated Annual Emission Reduction: ' + str(Reduction))
输出:
Estimated Average Wholesale Price: $5.06
Estimated Annual Emission Reduction: 11603