如何使用 webdriver 将多个页面的数据保存到单个 csv 中
how to save data from multiple pages using webdriver into a single csv
所以我正在尝试使用 selenium (webdriver) 从 googlescholar 保存数据,到目前为止我可以打印我想要的数据,但是当我将它保存到 csv 中时它只保存了第一页
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import statements for explicit wait
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from csv import writer

exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"

# Every entry is the same "next page" button XPath; the list length simply
# controls how many pages get scraped (here: 3).
button_locators = ['//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]']
wait_time = 3

driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
# driver.maximize_window()

for locator in button_locators:
    # Locate the "next page" button first so we fail fast if the page
    # structure changed, then collect the author cards on the current page.
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, locator)))
    address = driver.find_elements_by_class_name("gsc_1usr")
    time.sleep(4)  # give lazily-rendered content time to settle
    # Append one CSV line per author card; newlines inside a card become commas.
    with open('post.csv', 'a') as s:
        for card in address:
            s.write(card.text.replace('\n', ',') + '\n')
    button_link.click()
    time.sleep(4)  # wait for the next page to load before scraping again

driver.quit()  # release the browser session instead of leaking it
你只得到第一页的数据,因为你的程序在点击下一页按钮之后就停止了。你必须把所有这些操作都放进一个 for 循环里。
注意我写的是 range(7),因为我知道有 7 页要打开;实际上我们永远不应该那样硬编码。想象一下如果有数千页。我们应该添加一些逻辑来检查 "next page button" 是否存在,并循环直到它不存在为止。
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"

# Absolute XPath to the "next page" button at the bottom of the author list.
button_locators = "/html/body/div/div[8]/div[2]/div/div[12]/div/button[2]"
wait_time = 3

driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
time.sleep(4)  # let the first page finish rendering

# 7 pages. In reality, we should get this number programmatically.
# Open the output file once and append every page to it, instead of
# reopening it on every iteration.
with open('post.csv', 'a') as s:
    for page in range(7):
        # read data from new page
        address = driver.find_elements_by_class_name("gsc_1usr")
        # write to file: one CSV line per author card
        for card in address:
            s.write(card.text.replace('\n', ',') + '\n')
        # find and click next page button
        button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
        button_link.click()
        time.sleep(4)  # wait for the next page to load
此外,您将来应该考虑把所有这些 time.sleep 都改成 wait.until。因为有时页面加载得更快,程序就能更快地完成工作;或者更糟的是,网络可能出现延迟,固定的等待时间会搞砸您的脚本。
所以我正在尝试使用 selenium (webdriver) 从 googlescholar 保存数据,到目前为止我可以打印我想要的数据,但是当我将它保存到 csv 中时它只保存了第一页
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import statements for explicit wait
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from csv import writer

exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"

# Every entry is the same "next page" button XPath; the list length simply
# controls how many pages get scraped (here: 3).
button_locators = ['//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]']
wait_time = 3

driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
# driver.maximize_window()

for locator in button_locators:
    # Locate the "next page" button first so we fail fast if the page
    # structure changed, then collect the author cards on the current page.
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, locator)))
    address = driver.find_elements_by_class_name("gsc_1usr")
    time.sleep(4)  # give lazily-rendered content time to settle
    # Append one CSV line per author card; newlines inside a card become commas.
    with open('post.csv', 'a') as s:
        for card in address:
            s.write(card.text.replace('\n', ',') + '\n')
    button_link.click()
    time.sleep(4)  # wait for the next page to load before scraping again

driver.quit()  # release the browser session instead of leaking it
你只得到第一页的数据,因为你的程序在点击下一页按钮之后就停止了。你必须把所有这些操作都放进一个 for 循环里。
注意我写的是 range(7),因为我知道有 7 页要打开;实际上我们永远不应该那样硬编码。想象一下如果有数千页。我们应该添加一些逻辑来检查 "next page button" 是否存在,并循环直到它不存在为止。
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"

# Absolute XPath to the "next page" button at the bottom of the author list.
button_locators = "/html/body/div/div[8]/div[2]/div/div[12]/div/button[2]"
wait_time = 3

driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
time.sleep(4)  # let the first page finish rendering

# 7 pages. In reality, we should get this number programmatically.
# Open the output file once and append every page to it, instead of
# reopening it on every iteration.
with open('post.csv', 'a') as s:
    for page in range(7):
        # read data from new page
        address = driver.find_elements_by_class_name("gsc_1usr")
        # write to file: one CSV line per author card
        for card in address:
            s.write(card.text.replace('\n', ',') + '\n')
        # find and click next page button
        button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
        button_link.click()
        time.sleep(4)  # wait for the next page to load
此外,您将来应该考虑把所有这些 time.sleep 都改成 wait.until。因为有时页面加载得更快,程序就能更快地完成工作;或者更糟的是,网络可能出现延迟,固定的等待时间会搞砸您的脚本。