Python、Selenium - 解析:无法获取动态填充的内容
Python, Selenium - parse. Can't get info from dynamic filling
我想了解如何从动态生成的字段中获取信息。
当我在简单的网站上尝试时,一切正常。后来我决定尝试更难的网站,结果就卡住了。我花了大约两周时间,把在网上找到的各种解决方案逐一试过,又逐一排除。
现在我甚至不确定能否用这种方式获取网站上显示的信息。当然,很可能是我哪里做错了,但我已经想不出新的办法了,所以决定在这里提问。也许有人了解这方面,可以给些提示。如果可以,请给我一些示例。
我用来学习的网站-kbp.aero/en
我想获取的信息(到达时间表)- .tbody .tr .td
例如我试过:
1.
# Fetch the airport page with plain HTTP and print the first cell of each
# timetable row found in the static HTML.
URL = 'https://kbp.aero/en/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}

time.sleep(1)
response = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')

# requests only receives the HTML the server sends; content inserted later by
# JavaScript is not part of it, so find() can return None here. Guard against
# that instead of crashing with AttributeError on the next call.
container = soup.find('div', class_='table_wrp out yesterday')
if container is None:
    print('Timetable container not found in the static HTML')
    items = []
else:
    items = container.find_all('tr', class_='tr')

comps = []
for item in items:
    cell = item.find('td', class_='td')
    # Skip rows that have no matching cell rather than failing on None.
    if cell is not None:
        comps.append({'title': cell.get_text(strip=True)})

for comp in comps:
    print(comp['title'])

# Alternative that was also tried: collect the full text of every row.
# for item in items:
#     comps.append({
#         'text': item.get_text(strip=True)
#     })
#
# for comp in comps:
#     print(comp['text'])
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def main():
    """Open the airport page in Chrome and print the text of each table cell.

    Waits up to 10 seconds for an element with class ``tbody`` to be present,
    then prints the text of every descendant element with class ``td``.
    The browser is closed afterwards, including when the wait times out.
    """
    driver = webdriver.Chrome()
    try:
        driver.get("https://kbp.aero/en/")
        wait = WebDriverWait(driver, 10)
        # text_to_be_present_in_element() yields a bool, not a WebElement, so
        # its result cannot be searched with find_elements();
        # presence_of_element_located() returns the located element itself.
        # NOTE(review): By.CLASS_NAME matches the class attribute, not the tag
        # name - confirm the page really marks these nodes with class "tbody"
        # and class "td".
        element = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "tbody"))
        )
        tds = element.find_elements(By.CLASS_NAME, "td")
        for td in tds:
            print(td.text)
    finally:
        # Always release the browser process, however the scrape ended.
        driver.quit()
感谢任何建议。
这将获取整个 table 数据:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
# Load the page in Chrome and print the text of the whole timetable table.
# Placeholder value: presumably meant to be replaced with the location of the
# chromedriver executable.
PATH = r"chromedriverexe path"
# NOTE(review): newer Selenium 4 releases expect the driver path to be passed
# via a Service object rather than positionally - confirm the installed version.
driver = webdriver.Chrome(PATH)
try:
    driver.get("https://kbp.aero/en/")
    driver.maximize_window()
    # Fixed pause before reading; presumably gives the page's scripts time to
    # fill in the table.
    sleep(3)
    print(driver.find_element(By.CSS_SELECTOR, "div.table_wrp.out.today > table").text)
finally:
    # Close the browser even if the table could not be located.
    driver.quit()
输出:
Рейс Час Призначення Перевізник Термінал Гейт Статус
TK 1256 15:05 Istanbul Turkish Airlines D D5 Boarding Completed
PS 9556 15:05 Istanbul Ukraine International Airlines D D5 Boarding Completed
7W 163 15:10 Lviv Wind Rose D D19 Boarding
FR 3167 15:10 Warsaw Ryanair D D9 Boarding
PS 9013 15:15 Ivano-Frankivsk Ukraine International Airlines D D18 Boarding
7W 113 15:15 Ivano-Frankivsk Wind Rose D D18 Boarding
我想了解如何从动态生成的字段中获取信息。 当我在简单的网站上尝试时,一切正常。后来我决定尝试更难的网站,结果就卡住了。我花了大约两周时间,把在网上找到的各种解决方案逐一试过,又逐一排除。 现在我甚至不确定能否用这种方式获取网站上显示的信息。当然,很可能是我哪里做错了,但我已经想不出新的办法了,所以决定在这里提问。也许有人了解这方面,可以给些提示。如果可以,请给我一些示例。
我用来学习的网站-kbp.aero/en
我想获取的信息(到达时间表)- .tbody .tr .td
例如我试过:
1.
# Fetch the airport page with plain HTTP and print the first cell of each
# timetable row found in the static HTML.
URL = 'https://kbp.aero/en/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}

time.sleep(1)
response = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')

# requests only receives the HTML the server sends; content inserted later by
# JavaScript is not part of it, so find() can return None here. Guard against
# that instead of crashing with AttributeError on the next call.
container = soup.find('div', class_='table_wrp out yesterday')
if container is None:
    print('Timetable container not found in the static HTML')
    items = []
else:
    items = container.find_all('tr', class_='tr')

comps = []
for item in items:
    cell = item.find('td', class_='td')
    # Skip rows that have no matching cell rather than failing on None.
    if cell is not None:
        comps.append({'title': cell.get_text(strip=True)})

for comp in comps:
    print(comp['title'])

# Alternative that was also tried: collect the full text of every row.
# for item in items:
#     comps.append({
#         'text': item.get_text(strip=True)
#     })
#
# for comp in comps:
#     print(comp['text'])
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def main():
    """Open the airport page in Chrome and print the text of each table cell.

    Waits up to 10 seconds for an element with class ``tbody`` to be present,
    then prints the text of every descendant element with class ``td``.
    The browser is closed afterwards, including when the wait times out.
    """
    driver = webdriver.Chrome()
    try:
        driver.get("https://kbp.aero/en/")
        wait = WebDriverWait(driver, 10)
        # text_to_be_present_in_element() yields a bool, not a WebElement, so
        # its result cannot be searched with find_elements();
        # presence_of_element_located() returns the located element itself.
        # NOTE(review): By.CLASS_NAME matches the class attribute, not the tag
        # name - confirm the page really marks these nodes with class "tbody"
        # and class "td".
        element = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "tbody"))
        )
        tds = element.find_elements(By.CLASS_NAME, "td")
        for td in tds:
            print(td.text)
    finally:
        # Always release the browser process, however the scrape ended.
        driver.quit()
感谢任何建议。
这将获取整个 table 数据:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
# Load the page in Chrome and print the text of the whole timetable table.
# Placeholder value: presumably meant to be replaced with the location of the
# chromedriver executable.
PATH = r"chromedriverexe path"
# NOTE(review): newer Selenium 4 releases expect the driver path to be passed
# via a Service object rather than positionally - confirm the installed version.
driver = webdriver.Chrome(PATH)
try:
    driver.get("https://kbp.aero/en/")
    driver.maximize_window()
    # Fixed pause before reading; presumably gives the page's scripts time to
    # fill in the table.
    sleep(3)
    print(driver.find_element(By.CSS_SELECTOR, "div.table_wrp.out.today > table").text)
finally:
    # Close the browser even if the table could not be located.
    driver.quit()
输出:
Рейс Час Призначення Перевізник Термінал Гейт Статус
TK 1256 15:05 Istanbul Turkish Airlines D D5 Boarding Completed
PS 9556 15:05 Istanbul Ukraine International Airlines D D5 Boarding Completed
7W 163 15:10 Lviv Wind Rose D D19 Boarding
FR 3167 15:10 Warsaw Ryanair D D9 Boarding
PS 9013 15:15 Ivano-Frankivsk Ukraine International Airlines D D18 Boarding
7W 113 15:15 Ivano-Frankivsk Wind Rose D D18 Boarding