How to get all the hyperlinks from the child elements of a specific div container spanning multiple pages (pagination) using Selenium Python
I am trying to scrape the links inside the href attributes of the child elements under the parent id='search-properties' on this site. I first tried locating the element with find_elements_by_id and then the links with find_elements_by_css_selector, but I kept running into AttributeError: 'list' object has no attribute 'find_elements_by_css_selectors'. Along the way I also tried find_elements_by_tag_name as well as find_elements_by_xpath, but instead of grabbing the links they grabbed the details inside the links, which are of no use to me. After a lot of looking around, I finally found this code:
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import csv

PATH = "C:/ProgramData/Anaconda3/scripts/chromedriver.exe"  # always keep chromedriver.exe inside scripts to save hours of debugging
driver = webdriver.Chrome(PATH)  # pretty important part
driver.get("https://www.gharbazar.com/property/search/?_q=&_vt=1&_r=0&_pt=residential&_si=0&_srt=latest")
driver.implicitly_wait(10)

house = driver.find_elements_by_tag_name("a")
# traverse the list and use get_attribute() to read each href
for lnk in house:
    print(lnk.get_attribute('href'))
The problem with this code is that it scrapes every link on the page, which means it also picks up links that are absolutely unnecessary, like the javascript:void ones in this picture don't need javascript void.
Finally, for pagination I tried to follow this answer but ended up with an infinite loop, so I had to remove the pagination code. In short, I am trying to get the links under id = 'search-properties' across multiple pages.
Try this.
links = driver.find_elements_by_xpath("//div[@id = 'search-properties']/a")
for ele in links:
    print(ele.get_attribute('href'))
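As a side note, the AttributeError in the question comes from calling find_elements_by_css_selector on the list returned by find_elements_by_id instead of on a single element; scoping the search to one parent element avoids both the error and the javascript:void links elsewhere on the page. A minimal sketch of the same scoped lookup, assuming a newer Selenium 4 release (where the find_element(s)_by_* helpers have been removed) and the same page:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # assumes chromedriver is available via PATH or Selenium Manager
driver.get("https://www.gharbazar.com/property/search/?_q=&_vt=1&_r=0&_pt=residential&_si=0&_srt=latest")
driver.implicitly_wait(10)

# locate the single parent container, then look for anchors that are its direct children,
# matching the //div[@id = 'search-properties']/a XPath used above
container = driver.find_element(By.ID, "search-properties")
for a in container.find_elements(By.XPATH, "./a"):
    print(a.get_attribute("href"))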
I tried this for pagination.
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path="path")
driver.implicitly_wait(10)
driver.get("https://www.gharbazar.com/property/search/?_q=&_vt=1&_r=0&_pt=residential&_si=0&_srt=latest")

page = 2
while True:
    # scroll the '>>' pagination link into view, then nudge back up so it is not hidden behind the header
    nextoption = driver.find_element_by_xpath("//div[@id='pagination-div']//a[contains(text(),'>>')]")
    driver.execute_script("arguments[0].scrollIntoView(true);", nextoption)
    driver.execute_script("window.scrollBy(0,-300)")
    time.sleep(5)
    try:
        # click the next page number; when it no longer exists we have reached the last page
        driver.find_element_by_link_text(str(page)).click()
        page += 1
        time.sleep(3)
    except Exception as e:
        print(e)
        break
driver.quit()
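The fixed time.sleep calls can be replaced with explicit waits so the loop only pauses as long as it has to. A minimal sketch of the same click-next-page step, assuming the driver and pagination markup from the snippet above, using WebDriverWait and expected_conditions:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
page = 2
while True:
    try:
        # wait until the link with the next page number is clickable, then click it
        wait.until(EC.element_to_be_clickable((By.LINK_TEXT, str(page)))).click()
        page += 1
    except Exception as e:
        # no such page number means we are past the last page
        print(e)
        break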
I tried getting the links from each page.
driver.get("https://www.gharbazar.com/property/search/?_q=&_vt=1&_r=0&_pt=residential&_si=0&_srt=latest")
page=2
pagelinks= []
#links of the 1st page
links = driver.find_elements_by_xpath("//div[@id = 'search-properties']/a")
for ele in links:
pagelinks.append(ele.get_attribute('href'))
while True:
nextoption = driver.find_element_by_xpath("//div[@id='pagination-div']//a[contains(text(),'>>')]")
driver.execute_script("arguments[0].scrollIntoView(true);",nextoption)
driver.execute_script("window.scrollBy(0,-300)")
time.sleep(5)
try:
driver.find_element_by_link_text(str(page)).click()
page += 1
links = driver.find_elements_by_xpath("//div[@id = 'search-properties']/a")
for ele in links:
pagelinks.append(ele.get_attribute('href'))
time.sleep(3)
except Exception as e:
print(e)
break
print(len(pagelinks))
for i in range(len(pagelinks)):
print(pagelinks[i])
driver.quit()
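If you also want to persist the collected hrefs instead of just printing them, here is a minimal sketch of deduplicating and saving them with pandas (which the question already imports); pagelinks is the list built in the script above, and property_links.csv is just a placeholder file name:

import pandas as pd

# drop duplicates while preserving order, then write one href per row
unique_links = list(dict.fromkeys(pagelinks))
pd.DataFrame({"href": unique_links}).to_csv("property_links.csv", index=False)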