用 Selenium 迭代打开 URL 的列表
Iterate Opening a List of URL's with Selenium
我在使用 selenium 迭代打开包含 URLs 的列表的操作时遇到问题。
问题出在我的代码中标记为 #Second Part
的部分。 linklinkfin
目前是长度为 9 的列表,但随着时间的推移收集到更多 URL,此长度可能会发生变化。当代码 运行s 时,它似乎一遍又一遍地打开第一个 URL,并且它不会出现 运行 嵌套 while 循环中的追加操作,因为最后 when我打印 textreal_listing
它是空的。作为代码 运行s,我不断看到 https://www.nj.gov/dobi/division_insurance/bfd/enforcement2014.htm opening/refreshing 直到程序结束。在每个 while 循环结束时,应该将 1 添加到 browsercount
,然后代码用新的 URL 重复,但这似乎没有发生,有什么想法吗?
我的代码:
#FIRST PART
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
textreal_listing=[]
browser = webdriver.Chrome(r'\homedirpva1a01\USERSNC$3225\chromedriver\chromedriver.exe')
time.sleep(5)
browser.get("link")
time.sleep(5)
linkslist=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/a")
linkslist2=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/font/a")
linklinkfin=linkslist+linkslist2
#SECOND PART
textcount=1
textpage=6
browsercount=2014
for i in linklinkfin:
browser.get("link.htm".format(browsercount))
time.sleep(2)
if "404 Error" in browser.page_source:
browser.get("link.html".format(browsercount))
time.sleep(2)
while len(textreal_listing)<100:
texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
textreal_listing.append(texttreesing.text)
textcount+=1
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
textpage+=3
textcount=2
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
break
browsercount+=1
else:
while len(textreal_listing)<100:
texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
textreal_listing.append(texttreesing.text)
textcount+=1
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
textpage+=3
textcount=2
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
break
browsercount+=1
print(textreal_listing)
这有效
#FIRST PART
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
textreal_listing=[]
browser = webdriver.Chrome(r'\homedirpva1a01\USERSNC$3225\chromedriver\chromedriver.exe')
time.sleep(5)
browser.get("https://www.nj.gov/dobi/division_insurance/bfd/enforcement.htm")
time.sleep(5)
linkslist=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/a")
linkslist2=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/font/a")
linktext=[]
for my_href in linkslist:
linktext.append(my_href.get_attribute("href"))
for my_hrefs in linkslist2:
linktext.append(my_hrefs.get_attribute("href"))
#SECOND Part
textcount=1
textpage=6
browsercount=2014
for i in linktext:
browser.get(i)
time.sleep(5)
while len(textreal_listing)<100:
texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
textreal_listing.append(texttreesing.text)
textcount+=1
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
textpage+=3
textcount=2
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
break
print(textreal_listing)
我在使用 selenium 迭代打开包含 URLs 的列表的操作时遇到问题。
问题出在我的代码中标记为 #Second Part
的部分。 linklinkfin
目前是长度为 9 的列表,但随着时间的推移收集到更多 URL,此长度可能会发生变化。当代码 运行s 时,它似乎一遍又一遍地打开第一个 URL,并且它不会出现 运行 嵌套 while 循环中的追加操作,因为最后 when我打印 textreal_listing
它是空的。作为代码 运行s,我不断看到 https://www.nj.gov/dobi/division_insurance/bfd/enforcement2014.htm opening/refreshing 直到程序结束。在每个 while 循环结束时,应该将 1 添加到 browsercount
,然后代码用新的 URL 重复,但这似乎没有发生,有什么想法吗?
我的代码:
#FIRST PART
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
textreal_listing=[]
browser = webdriver.Chrome(r'\homedirpva1a01\USERSNC$3225\chromedriver\chromedriver.exe')
time.sleep(5)
browser.get("link")
time.sleep(5)
linkslist=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/a")
linkslist2=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/font/a")
linklinkfin=linkslist+linkslist2
#SECOND PART
textcount=1
textpage=6
browsercount=2014
for i in linklinkfin:
browser.get("link.htm".format(browsercount))
time.sleep(2)
if "404 Error" in browser.page_source:
browser.get("link.html".format(browsercount))
time.sleep(2)
while len(textreal_listing)<100:
texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
textreal_listing.append(texttreesing.text)
textcount+=1
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
textpage+=3
textcount=2
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
break
browsercount+=1
else:
while len(textreal_listing)<100:
texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
textreal_listing.append(texttreesing.text)
textcount+=1
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
textpage+=3
textcount=2
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
break
browsercount+=1
print(textreal_listing)
这有效
#FIRST PART
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
textreal_listing=[]
browser = webdriver.Chrome(r'\homedirpva1a01\USERSNC$3225\chromedriver\chromedriver.exe')
time.sleep(5)
browser.get("https://www.nj.gov/dobi/division_insurance/bfd/enforcement.htm")
time.sleep(5)
linkslist=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/a")
linkslist2=browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/font/a")
linktext=[]
for my_href in linkslist:
linktext.append(my_href.get_attribute("href"))
for my_hrefs in linkslist2:
linktext.append(my_hrefs.get_attribute("href"))
#SECOND Part
textcount=1
textpage=6
browsercount=2014
for i in linktext:
browser.get(i)
time.sleep(5)
while len(textreal_listing)<100:
texttreesing=browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount))
textreal_listing.append(texttreesing.text)
textcount+=1
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
textpage+=3
textcount=2
if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage,textcount)))==0:
break
print(textreal_listing)