How to scroll down through the right part of the zillow webpage using Selenium

I am trying to scroll down to the very bottom of the right-hand part of the webpage.

I tried it with the code below, but unfortunately it does not scroll down the right-hand side of the site:

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import os, sys
import xlwings as xw
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent


WAIT = 10  # pause length in seconds between steps (any reasonable value)

if __name__ == '__main__':
  print("Checking chromedriver...")
  os.environ['WDM_LOG_LEVEL'] = '0' 
  ua = UserAgent()
  userAgent = ua.random
  options = Options()
  # options.add_argument('--headless')
  options.add_argument("start-maximized")
  options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 1})    
  options.add_experimental_option("excludeSwitches", ["enable-automation"])
  options.add_experimental_option('excludeSwitches', ['enable-logging'])
  options.add_experimental_option('useAutomationExtension', False)
  options.add_argument('--disable-blink-features=AutomationControlled')
  options.add_argument(f'user-agent={userAgent}') 
  srv = Service(ChromeDriverManager().install())
  driver = webdriver.Chrome(service=srv, options=options)
  waitWebDriver = WebDriverWait(driver, 10)
  
  link = f"https://www.zillow.com/clayton-county-ga/houses/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Clayton%20County%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-84.83476148874033%2C%22east%22%3A-84.0313862445997%2C%22south%22%3A33.22700148452994%2C%22north%22%3A33.70472214817801%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A1622%2C%22regionType%22%3A4%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C%22baths%22%3A%7B%22min%22%3A2%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22tow%22%3A%7B%22value%22%3Afalse%7D%2C%22mf%22%3A%7B%22value%22%3Afalse%7D%2C%22con%22%3A%7B%22value%22%3Afalse%7D%2C%22land%22%3A%7B%22value%22%3Afalse%7D%2C%22apa%22%3A%7B%22value%22%3Afalse%7D%2C%22manu%22%3A%7B%22value%22%3Afalse%7D%2C%22apco%22%3A%7B%22value%22%3Afalse%7D%2C%22sqft%22%3A%7B%22min%22%3A1000%2C%22max%22%3A3000%7D%2C%22lot%22%3A%7B%22max%22%3A43560%7D%2C%22built%22%3A%7B%22min%22%3A1965%7D%2C%22gar%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D" 

  driver.get(link)
  time.sleep(WAIT)

  # Press-and-hold workaround for the "px-captcha" challenge overlay
  element = driver.find_element(By.XPATH, "//div[@id='px-captcha']")
  action = ActionChains(driver)
  action.click_and_hold(element)
  action.perform()
  time.sleep(10)
  action.release(element)
  action.perform()
  time.sleep(WAIT)

  driver.find_element(By.XPATH, "//h1").click()
  time.sleep(WAIT)

  # Scrolls the window/document, which does not move the right-hand results list
  driver.execute_script("window.scrollTo(0, 10000)")
  time.sleep(5000)

How can I scroll all the way down to the bottom of the right-hand side of the page?

It is the right-hand list part of the site that I want to scroll all the way down.
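For what it's worth, window.scrollTo(0, 10000) only scrolls the document itself, while the listings on the right of Zillow sit in their own scrollable panel. Below is a minimal sketch of scrolling that inner panel directly, reusing the driver from the snippet above; the div#search-page-list-container selector is an assumption and should be checked against the current markup:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# The container id below is assumed from past Zillow markup; verify it in DevTools.
panel = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div#search-page-list-container"))
)
# Scroll the panel element itself (not the window) to its own bottom.
driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", panel)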

To scroll down to the very bottom of the right-hand side of the zillow webpage, you can induce WebDriverWait for the visibility of the pagination element and then scroll it into view, as shown below (a further sketch for the lazily loaded result cards follows after the browser snapshot):

  • Code block:

    driver.get("https://www.zillow.com/clayton-county-ga/houses/3-_beds/2.0-_baths/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Clayton%20County%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-85.19662367135751%2C%22east%22%3A-83.66952406198251%2C%22south%22%3A33.16207210856734%2C%22north%22%3A33.76924644337602%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A1622%2C%22regionType%22%3A4%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C%22baths%22%3A%7B%22min%22%3A2%7D%2C%22sqft%22%3A%7B%22min%22%3A1000%2C%22max%22%3A3000%7D%2C%22built%22%3A%7B%22min%22%3A1965%7D%2C%22con%22%3A%7B%22value%22%3Afalse%7D%2C%22apa%22%3A%7B%22value%22%3Afalse%7D%2C%22mf%22%3A%7B%22value%22%3Afalse%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22lot%22%3A%7B%22max%22%3A43560%7D%2C%22land%22%3A%7B%22value%22%3Afalse%7D%2C%22gar%22%3A%7B%22value%22%3Atrue%7D%2C%22tow%22%3A%7B%22value%22%3Afalse%7D%2C%22manu%22%3A%7B%22value%22%3Afalse%7D%2C%22apco%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A9%7D")
    driver.execute_script("return arguments[0].scrollIntoView(true);", WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='search-pagination']"))))
    
  • Browser snapshot: (screenshot not included)
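Note that Zillow typically lazy-loads the result cards as the list scrolls, so jumping straight to the pagination bar can leave earlier cards unrendered. Here is a rough sketch that walks each card into view before bringing the pagination into view, again reusing the driver from above; the article card selector is an assumption about the current markup:

import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# "article" as the card element is an assumption based on earlier Zillow markup.
for card in driver.find_elements(By.CSS_SELECTOR, "article"):
    driver.execute_script("arguments[0].scrollIntoView(true);", card)
    time.sleep(0.3)  # give the lazily loaded card content a moment to render

# Then bring the pagination bar into view, as in the code block above.
pagination = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.XPATH, "//div[@class='search-pagination']"))
)
driver.execute_script("arguments[0].scrollIntoView(true);", pagination)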

Zillow is one of the smarter sites on the internet, and it also exposes an API. If you really want to learn web scraping properly, why hard-code a browser at all, why not pull the data straight from the API? Below is an example of how to scrape the data from the API as a JSON response using only the requests module.

Script:

import json
import requests

headers = {
    "User-Agent": "Mozilla/5.0 ",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}

with requests.Session() as s:
    s.headers.update(headers)
    s.head('https://www.zillow.com/')  # prime the session with Zillow's cookies first

    # Parse the query state once so the page number can be set per iteration
    query_state = json.loads('{"pagination":{"currentPage":2},"usersSearchTerm":"Clayton County, GA","mapBounds":{"west":-84.85604749948251,"east":-84.01010023385751,"south":33.18506146243053,"north":33.746411533949434},"regionSelection":[{"regionId":1622,"regionType":4}],"isMapVisible":false,"filterState":{"beds":{"min":3},"baths":{"min":2},"sqft":{"min":1000,"max":3000},"built":{"min":1965},"isCondo":{"value":false},"isApartment":{"value":false},"isMultiFamily":{"value":false},"isAllHomes":{"value":true},"sortSelection":{"value":"globalrelevanceex"},"lotSize":{"max":43560},"isLotLand":{"value":false},"hasGarage":{"value":true},"isTownhouse":{"value":false},"isManufactured":{"value":false},"isApartmentOrCondo":{"value":false}},"isListVisible":true}')

    for page in range(1, 3):
        query_state["pagination"]["currentPage"] = page  # request this page number
        params = {
            "searchQueryState": json.dumps(query_state),
            "wants": '{"cat1":["listResults"],"cat2":["total"]}'
        }

        r = s.get('https://www.zillow.com/search/GetSearchPageState.htm', params=params).json()
        # print(r)
        for card in r['cat1']['searchResults']['listResults']:
            price = card['price']
            print(price)

Output:

5,000
9,000
9,000
0,000
5,000
0,000
5,000
0,000
0,000
9,900
0,000
5,000
9,000
9,900

...and so on
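If you want more than the price, each entry in listResults carries further fields. The sketch below assumes r is a JSON response fetched exactly as in the script above; the key names other than price (zpid, address, detailUrl, and the cat2 total path) are taken from an inspected response and are assumptions that may change:

# Assumes `r` is the JSON returned by GetSearchPageState.htm as fetched above.
for card in r['cat1']['searchResults']['listResults']:
    # zpid / address / detailUrl are assumed key names; check a live response.
    print(card.get('zpid'), card.get('price'), card.get('address'), card.get('detailUrl'))

# The "cat2" block requested via "wants" should expose the total result count
# (assumed key path); it could be used to work out how many pages to request.
total = r.get('cat2', {}).get('searchList', {}).get('totalResultCount')
print('total results:', total)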