使用 Python 和 Pandas 难以提取 HTML table
Difficulty extracting HTML table with Python and Pandas
我正在尝试从以下网站的 HTML table 中提取数据:https://fuelkaki.sg/home
我的 Python 代码如下所示。Pandas 无法检测到该表格。我怀疑是因为 Beautiful Soup 无法正确捕获页面上的 HTML 代码。
import sys
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Download the page with a browser-like User-Agent header. On any failure,
# print the exception type and the line number where it was raised.
try:
    url = 'https://fuelkaki.sg/home'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'}
    page = requests.get(url, headers=headers)
except Exception as e:
    error_type, error_obj, error_info = sys.exc_info()
    print('ERROR FOR LINK:', url)
    print(error_type, 'Line:', error_info.tb_lineno)
# NOTE(review): the original paste lost its indentation; this pause is assumed
# to sit after the try/except rather than inside the handler -- confirm.
time.sleep(2)

# Parse the raw response body and ask pandas to extract every <table> from it.
soup = BeautifulSoup(page.text, 'html.parser')
df = pd.read_html(page.text)
df
我也尝试过使用 Selenium(见下面的代码),但仍然无法捕获 HTML table 信息。
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd

url = 'https://fuelkaki.sg/home'

# Configure a headless Chrome session.
options = Options()
# Raw string: the backslashes in a Windows path are not escape sequences, and
# a plain literal here triggers invalid-escape warnings on newer Pythons.
options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # chrome binary location specified here
options.add_argument('--headless')
options.add_argument('--disable-gpu')

# Load the page, wait a fixed 3 seconds, then capture the browser's page source.
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(3)
page = driver.page_source
driver.quit()

# Parse the captured HTML and ask pandas to extract every <table> from it.
soup = BeautifulSoup(page, 'html.parser')
df = pd.read_html(page)
df
如有任何建议,我将不胜感激。
使用:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import numpy as np  # required by np.array below; missing from the original snippet
import pandas as pd

url = 'https://fuelkaki.sg/home'

# Start Chrome through Selenium, load the page, wait a fixed 3 seconds, then
# capture the browser's page source.
options = Options()
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(3)
page = driver.page_source
driver.quit()

# Locate the first <table class="table"> in the captured HTML.
soup = BeautifulSoup(page, 'html.parser')
table = soup.find("table", {"class": "table"})

# Collect the text of every <td>, dropping U+202C characters, and lay the flat
# list out as rows of five columns.
# NOTE(review): reshape(-1, 5) assumes the cell count is a multiple of 5 -- confirm
# against the live table.
cells = [x.text.replace('\u202c', '') for x in table.find_all('td')]
pd.DataFrame(np.array(cells).reshape(-1, 5))
输出:
请注意,使用网站数据可能是不道德的。
我正在尝试从以下网站的 HTML table 中提取数据:https://fuelkaki.sg/home
我的 Python 代码如下所示。Pandas 无法检测到该表格。我怀疑是因为 Beautiful Soup 无法正确捕获页面上的 HTML 代码。
import sys
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Download the page with a browser-like User-Agent header. On any failure,
# print the exception type and the line number where it was raised.
try:
    url = 'https://fuelkaki.sg/home'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'}
    page = requests.get(url, headers=headers)
except Exception as e:
    error_type, error_obj, error_info = sys.exc_info()
    print('ERROR FOR LINK:', url)
    print(error_type, 'Line:', error_info.tb_lineno)
# NOTE(review): the original paste lost its indentation; this pause is assumed
# to sit after the try/except rather than inside the handler -- confirm.
time.sleep(2)

# Parse the raw response body and ask pandas to extract every <table> from it.
soup = BeautifulSoup(page.text, 'html.parser')
df = pd.read_html(page.text)
df
我也尝试过使用 Selenium(见下面的代码),但仍然无法捕获 HTML table 信息。
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd

url = 'https://fuelkaki.sg/home'

# Configure a headless Chrome session.
options = Options()
# Raw string: the backslashes in a Windows path are not escape sequences, and
# a plain literal here triggers invalid-escape warnings on newer Pythons.
options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # chrome binary location specified here
options.add_argument('--headless')
options.add_argument('--disable-gpu')

# Load the page, wait a fixed 3 seconds, then capture the browser's page source.
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(3)
page = driver.page_source
driver.quit()

# Parse the captured HTML and ask pandas to extract every <table> from it.
soup = BeautifulSoup(page, 'html.parser')
df = pd.read_html(page)
df
如有任何建议,我将不胜感激。
使用:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import numpy as np  # required by np.array below; missing from the original snippet
import pandas as pd

url = 'https://fuelkaki.sg/home'

# Start Chrome through Selenium, load the page, wait a fixed 3 seconds, then
# capture the browser's page source.
options = Options()
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(3)
page = driver.page_source
driver.quit()

# Locate the first <table class="table"> in the captured HTML.
soup = BeautifulSoup(page, 'html.parser')
table = soup.find("table", {"class": "table"})

# Collect the text of every <td>, dropping U+202C characters, and lay the flat
# list out as rows of five columns.
# NOTE(review): reshape(-1, 5) assumes the cell count is a multiple of 5 -- confirm
# against the live table.
cells = [x.text.replace('\u202c', '') for x in table.find_all('td')]
pd.DataFrame(np.array(cells).reshape(-1, 5))
输出: