Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

I have a dataframe containing links to the Google reviews of two restaurants. I want to load all the reviews of both restaurants (one after the other) in the browser and then save them into a new dataframe. I wrote a script that reads and loads all the reviews in the browser, as shown below:

from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
import glob  # needed for glob.glob() further down

# dataframe with the two Google review links
link_df = pd.DataFrame({'Link': [
    "https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
    "https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]})

i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
    base_url = i['Link']   #link_df['Link'][i]
    
    driver.get(base_url)
    WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
    print('Restaurant number is ',index)
    
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    
    total_reviews_text =driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int (total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    time.sleep(2)
    total_reviews = len(all_reviews)
    
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        time.sleep(5)
        all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
        print(total_reviews)
        total_reviews +=1
    reviews_info = driver.find_elements_by_xpath("//div[@class='jxjCjc']")
    review_information = pd.DataFrame(columns=["Restaurant title","Restaurant rating","Total reviews","Reviewer Name","Rating", "Review"])
    name= ''
    rating = ''
    text = ''
    
    
    for index,review_info in enumerate(reviews_info):
        name = review_info.find_element_by_xpath("./div/div/a").text
        rating = review_info.find_element_by_xpath(".//div[@class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
        text = review_info.find_element_by_xpath(".//div[@class='Jtu6Td']//span").text
        review_information.loc[len(review_information)] = [title,overall_rating,num_reviews,name,rating,text]
    
    filename = 'Google_reviews' + ' ' +pd.to_datetime("now").strftime("%Y_%m_%d")+'.csv'
    files_present = glob.glob(filename)
    if files_present:
        review_information.to_csv(filename,index=False,mode='a',header=False)
    else:
        review_information.to_csv(filename,index=False)
    
    driver.get('https://www.google.com')
    time.sleep(3)

The problem is that the script throws an error when it reaches the following line:

driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])

It throws the following error:

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=95.0.4638.69)

When I try the same program without storing the Google links in a dataframe (i.e. with no for loop, writing base_url = <the Google review link> directly instead of base_url = i['Link']), it works fine.

I am not sure where I made the mistake. Any suggestions or help to resolve the problem would be greatly appreciated.

EDIT

  1. Keep the creation of the driver outside the for loop.
  2. You cannot load a new URL with GPS data while the first popup is still in front; if you launch it, it stays in the background. The easier way is to load a new URL without GPS data (https://www.google.com) and wait about 3 seconds before the loop continues.
  3. Your counting was wrong: I changed your selectors, fixed the total count, and commented out some lines (see the sketch after this list).
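As a minimal sketch of point 3 (reusing the driver, WebDriverWait, EC, By and num_reviews names from the script above; an illustration, not the complete fix), the count has to come from re-reading the DOM on every pass rather than from incrementing a counter:

total_reviews = 0
while total_reviews < num_reviews:
    # Re-locate the review nodes on each pass so no stale reference is scrolled.
    all_reviews = WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
    total_reviews = len(all_reviews)  # count what is actually in the DOM

The full Firefox script with all three changes: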

from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
#from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time

link_df =  ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
            "https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
           ]
i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path=r"E:\Téléchargement\geckodriver.exe")

# I had to launch the URL once and accept the cookies manually
# (by setting a breakpoint right after), but I think you don't have that step
#driver.get(link_df[0])  

print ("Headless Firefox Initialized")


print(link_df)
for url in link_df:
    base_url = url    # i['Link']  # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()  # 'Avis les plus récents' = 'Newest' (French UI)

    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text

    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0

    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        
        all_reviews = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))

    driver.get('https://www.google.com') # or driver.close() if no bugs
    time.sleep(3)

driver.close()
driver.quit()

The Chrome solution seems to need some corrections:

org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document

Literally, it means the referenced element has gone stale and is no longer attached to the current page. Usually this happens because the page was refreshed or navigated, and the fix is to locate the element again with findElement or findElements.
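To apply that advice in isolation, here is a minimal, hedged sketch that retries on StaleElementReferenceException (the helper name is illustrative; the selector and the By import come from the scripts in this post):

from selenium.common.exceptions import StaleElementReferenceException

def scroll_to_last(driver, css, attempts=3):
    # Illustrative helper: locate the nodes fresh and retry when a
    # reference goes stale between locating and scrolling.
    for _ in range(attempts):
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, css)
            driver.execute_script('arguments[0].scrollIntoView(true);', elements[-1])
            return
        except StaleElementReferenceException:
            continue  # the DOM was refreshed; locate the elements again
    raise StaleElementReferenceException('element kept going stale')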

So Chrome seems to have a problem with the refresh, so I suggest re-reading the list of review nodes before scrolling so that you hold a fresh copy of the DOM items, and I added a 1-second wait at the end of the while loop:

from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
import time

link_df =  [
    "https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
    "https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]

i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'


options = Options()

#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\Téléchargement\geckodriver.exe")

options.binary_location  = binarychrome
driver = webdriver.Chrome(options=options, executable_path=r"E:\Téléchargement\chromedriver.exe")

# same reason as with Firefox: I had to load a URL once
# to accept the cookies manually
#driver.get(link_df[0])   



print(link_df)
for url in link_df:
    base_url = url    # i['Link']  # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()

    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text

    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0

    while total_reviews < num_reviews:
        #reload to avoid the stale-element exception (trapping the scroll with try/except would also work, but is more expensive)
        all_reviews = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))

        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])

        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
        time.sleep(1)

    driver.get('https://www.google.com') # or driver.close() if no bugs
    time.sleep(3)

driver.close()
driver.quit()
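One side note in case this is run against a newer Selenium release: the find_element_by_xpath style of lookup used above was deprecated in Selenium 4 and later removed, so each lookup would need the By-based form, for example:

title = driver.find_element(By.XPATH, "//div[@class='P5Bobd']").text  # Selenium 4 equivalent of find_element_by_xpath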