使用 Selenium 从网站抓取表格时返回空 DataFrame
Scraping Table from Website with Selenium Returning Empty DataFrame
我刚开始学习网络抓取,尝试从 https://www.ishares.com/us/products/268752/ishares-global-reit-etf 页面的 'Holdings' 表格中提取数据。
首先,我使用了 pandas,但它返回的 DataFrame 是空的。后来我发现这个表格是动态加载的,需要用到 Selenium。但改用 Selenium 之后,返回的仍然是空的 DataFrame。有人可以帮我吗?非常感谢。
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
# Asker's original attempt: open the product page in headless Chrome, grab the
# page HTML right away, and let pandas parse every <table> it finds.
# Instantiate options
options = webdriver.ChromeOptions()
# NOTE(review): recent Selenium 4 releases reportedly drop the `headless`
# setter in favour of options.add_argument("--headless=new") -- confirm
# against the installed Selenium version.
options.headless = True
# Instantiate a webdriver
site = 'https://www.ishares.com/us/products/268752/ishares-global-reit-etf'
# NOTE(review): passing the driver path positionally looks like Selenium 3
# style; Selenium 4 presumably expects a Service object -- confirm.
wd = webdriver.Chrome('chromedriver',options=options)
wd.get(site)
# Load the HTML page
# The HTML is read immediately after get(), with no explicit wait. The question
# states the Holdings table is dynamic, so it may not be populated yet here.
html = wd.page_source
# Extract data with pandas
df = pd.read_html(html)
# `df` is indexed like a list of tables; index 6 is presumably the asker's
# guess at the Holdings table position -- it depends on the page layout.
table = df[6]
要从 iShares Global REIT ETF 网页的 Holdings 表格中提取数据,需要使用 WebDriverWait 等待 visibility_of_element_located() 条件成立,然后借助 Pandas 的 DataFrame 读取表格。可以使用以下定位策略:
代码块:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# Scrape the "Holdings" table from the iShares Global REIT ETF page.
#
# The table is rendered by JavaScript, so the script waits explicitly for each
# element instead of reading page_source straight after get().
# The driver is created here (the snippet previously used `wd` without ever
# defining it) and is always shut down, even if one of the waits times out.
from io import StringIO

options = Options()
options.add_argument("--headless=new")
wd = webdriver.Chrome(service=Service(), options=options)
try:
    wd.get("https://www.ishares.com/us/products/268752/ishares-global-reit-etf")
    wait = WebDriverWait(wd, 20)

    # Dismiss the cookie-consent banner first; it can cover the page content.
    wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button#onetrust-accept-btn-handler")
        )
    ).click()

    # Scroll the "Holdings" heading into view before looking for the table.
    holdings_heading = wait.until(
        EC.visibility_of_element_located(
            (By.XPATH, "//div[@data-componentname]/h2[normalize-space()='Holdings']")
        )
    )
    wd.execute_script("arguments[0].scrollIntoView();", holdings_heading)

    # Take only the table's own HTML so pandas parses exactly one table.
    holdings_table = wait.until(
        EC.visibility_of_element_located(
            (By.XPATH, "//table[@aria-describedby='allHoldingsTable_info']")
        )
    )
    data = holdings_table.get_attribute("outerHTML")
finally:
    # Release the browser process whether or not the scrape succeeded.
    wd.quit()

# Wrap the markup in StringIO: newer pandas deprecates passing literal HTML
# strings to read_html. Pass flavor='html5lib' to use that parser instead of
# the default one.
df = pd.read_html(StringIO(data))
print(df)
控制台输出:
[ Ticker Name Sector Asset Class ... CUSIP ISIN SEDOL Accrual Date
0 PLD PROLOGIS REIT INC Real Estate Equity ... 74340W103 US74340W1036 B44WZD7 -
1 EQIX EQUINIX REIT INC Real Estate Equity ... 29444U700 US29444U7000 BVLZX12 -
2 PSA PUBLIC STORAGE REIT Real Estate Equity ... 74460D109 US74460D1090 2852533 -
3 SPG SIMON PROPERTY GROUP REIT INC Real Estate Equity ... 828806109 US8288061091 2812452 -
4 DLR DIGITAL REALTY TRUST REIT INC Real Estate Equity ... 253868103 US2538681030 B03GQS4 -
5 O REALTY INCOME REIT CORP Real Estate Equity ... 756109104 US7561091049 2724193 -
6 WELL WELLTOWER INC Real Estate Equity ... 95040Q104 US95040Q1040 BYVYHH4 -
7 AVB AVALONBAY COMMUNITIES REIT INC Real Estate Equity ... 053484101 US0534841012 2131179 -
8 ARE ALEXANDRIA REAL ESTATE EQUITIES RE Real Estate Equity ... 015271109 US0152711091 2009210 -
9 EQR EQUITY RESIDENTIAL REIT Real Estate Equity ... 29476L107 US29476L1070 2319157 -
[10 rows x 12 columns]]
我刚开始学习网络抓取,尝试从 https://www.ishares.com/us/products/268752/ishares-global-reit-etf 页面的 'Holdings' 表格中提取数据。
首先,我使用了 pandas,但它返回的 DataFrame 是空的。后来我发现这个表格是动态加载的,需要用到 Selenium。但改用 Selenium 之后,返回的仍然是空的 DataFrame。有人可以帮我吗?非常感谢。
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
# Asker's original attempt: open the product page in headless Chrome, grab the
# page HTML right away, and let pandas parse every <table> it finds.
# Instantiate options
options = webdriver.ChromeOptions()
# NOTE(review): recent Selenium 4 releases reportedly drop the `headless`
# setter in favour of options.add_argument("--headless=new") -- confirm
# against the installed Selenium version.
options.headless = True
# Instantiate a webdriver
site = 'https://www.ishares.com/us/products/268752/ishares-global-reit-etf'
# NOTE(review): passing the driver path positionally looks like Selenium 3
# style; Selenium 4 presumably expects a Service object -- confirm.
wd = webdriver.Chrome('chromedriver',options=options)
wd.get(site)
# Load the HTML page
# The HTML is read immediately after get(), with no explicit wait. The question
# states the Holdings table is dynamic, so it may not be populated yet here.
html = wd.page_source
# Extract data with pandas
df = pd.read_html(html)
# `df` is indexed like a list of tables; index 6 is presumably the asker's
# guess at the Holdings table position -- it depends on the page layout.
table = df[6]
要从 iShares Global REIT ETF 网页的 Holdings 表格中提取数据,需要使用 WebDriverWait 等待 visibility_of_element_located() 条件成立,然后借助 Pandas 的 DataFrame 读取表格。可以使用以下定位策略:
代码块:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# Scrape the "Holdings" table from the iShares Global REIT ETF page.
#
# The table is rendered by JavaScript, so the script waits explicitly for each
# element instead of reading page_source straight after get().
# The driver is created here (the snippet previously used `wd` without ever
# defining it) and is always shut down, even if one of the waits times out.
from io import StringIO

options = Options()
options.add_argument("--headless=new")
wd = webdriver.Chrome(service=Service(), options=options)
try:
    wd.get("https://www.ishares.com/us/products/268752/ishares-global-reit-etf")
    wait = WebDriverWait(wd, 20)

    # Dismiss the cookie-consent banner first; it can cover the page content.
    wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button#onetrust-accept-btn-handler")
        )
    ).click()

    # Scroll the "Holdings" heading into view before looking for the table.
    holdings_heading = wait.until(
        EC.visibility_of_element_located(
            (By.XPATH, "//div[@data-componentname]/h2[normalize-space()='Holdings']")
        )
    )
    wd.execute_script("arguments[0].scrollIntoView();", holdings_heading)

    # Take only the table's own HTML so pandas parses exactly one table.
    holdings_table = wait.until(
        EC.visibility_of_element_located(
            (By.XPATH, "//table[@aria-describedby='allHoldingsTable_info']")
        )
    )
    data = holdings_table.get_attribute("outerHTML")
finally:
    # Release the browser process whether or not the scrape succeeded.
    wd.quit()

# Wrap the markup in StringIO: newer pandas deprecates passing literal HTML
# strings to read_html. Pass flavor='html5lib' to use that parser instead of
# the default one.
df = pd.read_html(StringIO(data))
print(df)
控制台输出:
[ Ticker Name Sector Asset Class ... CUSIP ISIN SEDOL Accrual Date
0 PLD PROLOGIS REIT INC Real Estate Equity ... 74340W103 US74340W1036 B44WZD7 -
1 EQIX EQUINIX REIT INC Real Estate Equity ... 29444U700 US29444U7000 BVLZX12 -
2 PSA PUBLIC STORAGE REIT Real Estate Equity ... 74460D109 US74460D1090 2852533 -
3 SPG SIMON PROPERTY GROUP REIT INC Real Estate Equity ... 828806109 US8288061091 2812452 -
4 DLR DIGITAL REALTY TRUST REIT INC Real Estate Equity ... 253868103 US2538681030 B03GQS4 -
5 O REALTY INCOME REIT CORP Real Estate Equity ... 756109104 US7561091049 2724193 -
6 WELL WELLTOWER INC Real Estate Equity ... 95040Q104 US95040Q1040 BYVYHH4 -
7 AVB AVALONBAY COMMUNITIES REIT INC Real Estate Equity ... 053484101 US0534841012 2131179 -
8 ARE ALEXANDRIA REAL ESTATE EQUITIES RE Real Estate Equity ... 015271109 US0152711091 2009210 -
9 EQR EQUITY RESIDENTIAL REIT Real Estate Equity ... 29476L107 US29476L1070 2319157 -
[10 rows x 12 columns]]