使用 Selenium 抓取下拉列表选择后的网页内容

Selenium web scraping under dropdown list

Selenium 网页抓取:

  1. 下拉列表更改
  2. 尝试抓取更改后的结果
  3. 失败

代码:

'''
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import time

# Launch Chrome from a hard-coded chromedriver path, maximize the window and
# open the locator page.
driver=webdriver.Chrome(executable_path=r'C:\Program Files\Python39\chromedriver.exe')
driver.maximize_window()
driver.get("https://www.gastite.com/locator/?cats=109")

# Select the dropdown options at index 1 and 2 in turn and print the entries
# found after each selection.
for i in range(1,3,1):
    # The <select name="state"> element is looked up again on every pass.
    state=driver.find_element(By.NAME, 'state')
    stateDD=Select(state)
    stateDD.select_by_index(i)
    # NOTE(review): the element is located but the result is discarded --
    # nothing is clicked or submitted on this line.
    driver.find_element(By.XPATH,'//*[@id="content"]/div[3]/form/input[2]')
    # Fixed 2-second pause before reading the results.
    time.sleep(2)
    lists=driver.find_elements_by_css_selector("div.repcontent > a")
    #print(lists)
    # NOTE(review): the loop variable `list` shadows the builtin of the same name.
    for list in lists:
        # NOTE(review): 'namelink company_title' contains a space, i.e. two
        # class names; a class-name lookup presumably expects a single class --
        # confirm against the Selenium locator documentation.
        company=list.find_element_by_class_name('namelink company_title').text
        address=list.find_element_by_class_name('address').text
        address1=list.find_element_by_class_name('address2').text
        # NOTE(review): tel and fax use the identical lookup, so both receive
        # the text of the same <span>.
        tel=list.find_element_by_tag_name('span').text
        fax=list.find_element_by_tag_name('span').text
        # The href attribute is read from the matched <a> element itself.
        web=list.get_attribute('href')
        print(company, address, address1, tel, fax, web)
    
'''

我以在下拉列表中选择一个州(AL)为例,其余的州请自行尝试。

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup

def _text_or_empty(node):
    """Return the text of a BeautifulSoup node, or '' when the node is missing."""
    return node.text if node is not None else ''


# Start Chrome with a driver binary resolved by webdriver_manager.
# Call driver.quit() once the browser is no longer needed.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

url = 'https://www.gastite.com/locator/?cats=109'
driver.maximize_window()
# Actually load the locator page; without this the wait below runs against a
# blank tab and times out.
driver.get(url)

# Wait (up to 20 s) for the state dropdown to become visible, then pick "AL".
state_dropdown = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.XPATH, "//select[@name='state']"))
)
Select(state_dropdown).select_by_value("AL")

# NOTE(review): page_source is read immediately after the selection. If the
# results load asynchronously or require submitting the form, add an explicit
# wait/click here -- confirm against the live page.
soup = BeautifulSoup(driver.page_source, 'lxml')
data = []
for card in soup.select('#resultscontainer > ul > li > div.repcontent'):
    company = _text_or_empty(card.select_one('h3.company_title'))
    print(company)
    address = _text_or_empty(card.select_one('div.address'))
    address2 = _text_or_empty(card.select_one('div.address2'))

    # select() returns a list of tags, so index the list itself instead of
    # reading `.contents` on it (which raises AttributeError).
    # Assumes the first <span> is the phone and the second the fax --
    # TODO confirm against the page markup.
    spans = card.select('span')
    phone = spans[0].text if len(spans) > 0 else ''
    fax = spans[1].text if len(spans) > 1 else ''

    data.append({
        'company': company,
        'address': address,
        'address2': address2,
        'phone': phone,
        'fax': fax,
    })

print(data)