用 Selenium 迭代打开 URL 的列表

Question

我在使用 selenium 迭代打开包含 URLs 的列表的操作时遇到问题。问题出在我的代码中标记为 #Second Part 的部分。 linklinkfin 目前是长度为 9 的列表，但随着时间的推移收集到更多 URL，此长度可能会发生变化。当代码运行s 时，它似乎一遍又一遍地打开第一个 URL，并且它不会出现运行嵌套 while 循环中的追加操作，因为最后 when我打印 textreal_listing 它是空的。作为代码运行s，我不断看到 https://www.nj.gov/dobi/division_insurance/bfd/enforcement2014.htm opening/refreshing 直到程序结束。在每个 while 循环结束时，应该将 1 添加到 browsercount，然后代码用新的 URL 重复，但这似乎没有发生，有什么想法吗？

我的代码：

#FIRST PART
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
textreal_listing=[]
browser = webdriver.Chrome(r'\homedirpva1a01\USERSNC$3225\chromedriver\chromedriver.exe')
time.sleep(5)
browser.get("link")
time.sleep(5)

linkslist=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/a")
linkslist2=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/font/a")
linklinkfin=linkslist+linkslist2

#SECOND PART
textcount=1
textpage=6
browsercount=2014
for i in linklinkfin:
    browser.get("link.htm".format(browsercount))
    time.sleep(2)
    if "404 Error" in browser.page_source:
        browser.get("link.html".format(browsercount))
        time.sleep(2)
        while len(textreal_listing)<100:
            texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
            textreal_listing.append(texttreesing.text)
            textcount+=1
            if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
                textpage+=3
                textcount=2
                if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
                    break
                browsercount+=1
        else:
            while len(textreal_listing)<100:
                texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
                textreal_listing.append(texttreesing.text)
                textcount+=1
                if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
                    textpage+=3
                    textcount=2
                    if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
                        break
                browsercount+=1

print(textreal_listing)

Answer 1

这有效

#FIRST PART
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
textreal_listing=[]
browser = webdriver.Chrome(r'\homedirpva1a01\USERSNC$3225\chromedriver\chromedriver.exe')
time.sleep(5)
browser.get("https://www.nj.gov/dobi/division_insurance/bfd/enforcement.htm")
time.sleep(5)

linkslist=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/a")
linkslist2=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/font/a")

linktext=[]
for my_href in linkslist:
    linktext.append(my_href.get_attribute("href"))

for my_hrefs in linkslist2:
    linktext.append(my_hrefs.get_attribute("href"))

#SECOND Part
textcount=1
textpage=6
browsercount=2014
for i in linktext:
    browser.get(i)
    time.sleep(5)
    while len(textreal_listing)<100:
        texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
        textreal_listing.append(texttreesing.text)
        textcount+=1
        if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
            textpage+=3
            textcount=2
            if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
                break
    
print(textreal_listing)

用 Selenium 迭代打开 URL 的列表

Iterate Opening a List of URL's with Selenium

python

selenium

for-loop

while-loop