如何在抓取站点时移动到下一个封闭(div)?
How to move to the next enclosing(div) while scraping a site?
抓取结果中的所有数据都来自第一个 table。我无法移动到下一个 div 并获取其中每个 tr 的 td 的数据。
站点:https://asd.com/page/
下面是我写的代码。
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Question code as posted, with its block indentation restored.
# Reported symptom: every record is filled from the first table on the page.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url = "https://asd.com/page/asd"
driver.get(my_url)
boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")
companies = []
# NOTE: created once, outside the loop, so every iteration overwrites and
# re-appends the very same dict object.
company = {}
for box in boxes:
    # NOTE: "text-primary.text-uppercase" names two classes, while
    # By.CLASS_NAME is meant for a single class name.
    header = box.find_element(By.CLASS_NAME, "text-primary.text-uppercase")
    company['name'] = header.text
    # Plain alias of the current box; no new element is located here.
    td = box
    # NOTE: these XPaths start with "//" instead of ".//", so they are
    # absolute rather than relative to `td` -- which matches the reported
    # symptom of always getting the first table's cells.
    company['Type'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[1]/td").text
    company['Capital'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[2]/td").text
    company['Address'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[3]/td").text
    company['Owner'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[4]/td").text
    company['Co-Owner'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[5]/td").text
    company['Duration'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[6]/td").text
    company['Place'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[7]/td").text
    company['Company ID'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[8]/td").text
    companies.append(company)
    print(company)
这里有几个问题:
- 您需要在 driver.get(my_url) 和 boxes = driver.find_elements(By.CLASS_NAME, "col-md-4") 之间添加一些延迟,以便在获取所有元素的列表之前让这些元素加载完成。
- text-primary.text-uppercase 实际上是 2 个 class 名称:text-primary 和 text-uppercase,所以您应该使用 XPATH 或 CSS_SELECTOR(而不是 CLASS_NAME)来定位带有 2 个 class 名称的元素。
- 为了在另一个元素内部定位元素,您应该使用以点(.)开头的 XPATH。
- 像 //div/div/div/table/tbody/tr[1]/td 这样的定位器是绝对的,而它们应该相对于父元素 box 来计算。
- 不需要定义 td 元素,可以在这里直接使用已有的 box 元素。
- 像 //div/div/div/table/tbody/tr[1]/td 这样的定位器可以而且应该改进。
- 您可能需要在遍历各个 box 时先滚动到当前的 box。
- 我认为 company = {} 应该定义在循环内部。
这应该会更好:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager
# Scrape every company card (a "col-md-4" box) on the announcements page into
# `companies`, one dict per card, printing each dict as it is built.

# Dict keys in table-row order: the n-th label is filled from the <td> of the
# n-th <tr> found inside a card.
ROW_LABELS = (
    'Objet',
    'Capital',
    'Siège Social',
    'Gérant',
    'Co-Gérant',
    'Durée',
    'Dépôt',
    'Immatriculation RCCM',
)

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
try:
    my_url = "https://monentreprise.bj/page/annonces"
    driver.get(my_url)
    wait = WebDriverWait(driver, 20)
    actions = ActionChains(driver)

    # The explicit wait only guarantees that the first card is present; the
    # short fixed pause gives the remaining cards time to load before the
    # full list is collected.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "col-md-4")))
    time.sleep(2)
    boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")

    companies = []
    for box in boxes:
        # Move to the card before reading it, then let the page settle.
        actions.move_to_element(box).perform()
        time.sleep(0.3)

        # A fresh dict per card, so entries in `companies` never share state.
        company = {}

        # Every locator starts with "." so it is evaluated relative to `box`
        # rather than against the whole document.
        header = box.find_element(By.XPATH, ".//h5[@class='text-primary text-uppercase']")
        company['name'] = header.text
        for row_number, label in enumerate(ROW_LABELS, start=1):
            cell = box.find_element(By.XPATH, f".//tr[{row_number}]/td")
            company[label] = cell.text

        companies.append(company)
        print(company)
finally:
    # Always close the browser and stop the driver process, even when a
    # lookup above raises.
    driver.quit()
抓取结果中的所有数据都来自第一个 table。我无法移动到下一个 div 并获取其中每个 tr 的 td 的数据。
站点:https://asd.com/page/
下面是我写的代码。
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Question code as posted, with its block indentation restored.
# Reported symptom: every record is filled from the first table on the page.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url = "https://asd.com/page/asd"
driver.get(my_url)
boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")
companies = []
# NOTE: created once, outside the loop, so every iteration overwrites and
# re-appends the very same dict object.
company = {}
for box in boxes:
    # NOTE: "text-primary.text-uppercase" names two classes, while
    # By.CLASS_NAME is meant for a single class name.
    header = box.find_element(By.CLASS_NAME, "text-primary.text-uppercase")
    company['name'] = header.text
    # Plain alias of the current box; no new element is located here.
    td = box
    # NOTE: these XPaths start with "//" instead of ".//", so they are
    # absolute rather than relative to `td` -- which matches the reported
    # symptom of always getting the first table's cells.
    company['Type'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[1]/td").text
    company['Capital'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[2]/td").text
    company['Address'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[3]/td").text
    company['Owner'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[4]/td").text
    company['Co-Owner'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[5]/td").text
    company['Duration'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[6]/td").text
    company['Place'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[7]/td").text
    company['Company ID'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[8]/td").text
    companies.append(company)
    print(company)
这里有几个问题:
- 您需要在 driver.get(my_url) 和 boxes = driver.find_elements(By.CLASS_NAME, "col-md-4") 之间添加一些延迟,以便在获取所有元素的列表之前让这些元素加载完成。
- text-primary.text-uppercase 实际上是 2 个 class 名称:text-primary 和 text-uppercase,所以您应该使用 XPATH 或 CSS_SELECTOR(而不是 CLASS_NAME)来定位带有 2 个 class 名称的元素。
- 为了在另一个元素内部定位元素,您应该使用以点(.)开头的 XPATH。
- 像 //div/div/div/table/tbody/tr[1]/td 这样的定位器是绝对的,而它们应该相对于父元素 box 来计算。
- 不需要定义 td 元素,可以在这里直接使用已有的 box 元素。
- 像 //div/div/div/table/tbody/tr[1]/td 这样的定位器可以而且应该改进。
- 您可能需要在遍历各个 box 时先滚动到当前的 box。
- 我认为 company = {} 应该定义在循环内部。
这应该会更好:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager
# Scrape every company card (a "col-md-4" box) on the announcements page into
# `companies`, one dict per card, printing each dict as it is built.

# Dict keys in table-row order: the n-th label is filled from the <td> of the
# n-th <tr> found inside a card.
ROW_LABELS = (
    'Objet',
    'Capital',
    'Siège Social',
    'Gérant',
    'Co-Gérant',
    'Durée',
    'Dépôt',
    'Immatriculation RCCM',
)

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
try:
    my_url = "https://monentreprise.bj/page/annonces"
    driver.get(my_url)
    wait = WebDriverWait(driver, 20)
    actions = ActionChains(driver)

    # The explicit wait only guarantees that the first card is present; the
    # short fixed pause gives the remaining cards time to load before the
    # full list is collected.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "col-md-4")))
    time.sleep(2)
    boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")

    companies = []
    for box in boxes:
        # Move to the card before reading it, then let the page settle.
        actions.move_to_element(box).perform()
        time.sleep(0.3)

        # A fresh dict per card, so entries in `companies` never share state.
        company = {}

        # Every locator starts with "." so it is evaluated relative to `box`
        # rather than against the whole document.
        header = box.find_element(By.XPATH, ".//h5[@class='text-primary text-uppercase']")
        company['name'] = header.text
        for row_number, label in enumerate(ROW_LABELS, start=1):
            cell = box.find_element(By.XPATH, f".//tr[{row_number}]/td")
            company[label] = cell.text

        companies.append(company)
        print(company)
finally:
    # Always close the browser and stop the driver process, even when a
    # lookup above raises.
    driver.quit()